From dbf9ad1f3ddcca7bbfa396f3fce0b8f34bb423e8 Mon Sep 17 00:00:00 2001 From: xoviat Date: Sun, 5 May 2019 13:09:39 -0500 Subject: [PATCH 001/681] tests: add windows compatibility --- ctest/CMakeLists.txt | 15 ++++++++++++--- test/CMakeLists.txt | 22 +++++++++++++++++++--- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 14c9d1944..022379d83 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -5,9 +5,18 @@ enable_language(Fortran) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") +if(WIN32) +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1 +"$ErrorActionPreference = \"Stop\"\n" +"Get-Content $args[1] | & $args[0]\n" +) +set(test_helper powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1") +else() FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh "$1 < $2\n" ) +set(test_helper sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh") +endif() foreach(float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char_upper) @@ -18,7 +27,7 @@ foreach(float_type ${FLOAT_TYPES}) c_${float_char}blas1.c) target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) add_test(NAME "x${float_char}cblat1" - COMMAND "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat1") + COMMAND $<TARGET_FILE:x${float_char}cblat1>) #level2 add_executable(x${float_char}cblat2 @@ -30,7 +39,7 @@ foreach(float_type ${FLOAT_TYPES}) constant.c) target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) add_test(NAME "x${float_char}cblat2" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat2" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") + COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat2> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") #level3 add_executable(x${float_char}cblat3 @@ -42,6 +51,6 @@ foreach(float_type ${FLOAT_TYPES}) constant.c) target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) add_test(NAME "x${float_char}cblat3" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat3" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") + COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") endforeach() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index adeee3452..25a29030a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -15,6 +15,20 @@ target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}) endforeach() # $1 exec, $2 input, $3 output_result +if(WIN32) +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1 +"Remove-Item -Force $args[2]\n" +"$ErrorActionPreference = \"Stop\"\n" +"Get-Content $args[1] | & $args[0]\n" +"If (Get-Content $args[2] | %{$_ -match \"FATAL\"}) {\n" +"echo Error\n" +"exit 1\n" +"} else {\n" +"exit 0\n" +"}\n" +) +set(helper_prefix powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1") +else() FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "rm -f $3\n" "$1 < $2\n" @@ -26,14 +40,16 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "exit 0\n" "fi\n" ) +set(helper_prefix sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh") +endif() set(float_types s d c z) foreach(float_type ${float_types}) string(TOUPPER ${float_type} float_type_upper) add_test(NAME "${float_type}blas1" - COMMAND "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat1") + COMMAND $<TARGET_FILE:${float_type}blat1>) add_test(NAME "${float_type}blas2" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat2" "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) + COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat2> "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) add_test(NAME "${float_type}blas3" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat3" "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) + COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat3> "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) endforeach()
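The two generated helpers implement one protocol, described by the "# $1 exec, $2 input, $3 output_result" comment retained in the diff: argument 1 is the test executable, argument 2 an input file fed to it on stdin, and, for the BLAT tests, argument 3 the output_result summary that the PowerShell version scans for FATAL before deciding pass or fail. Concretely, on a Unix-like host the single-precision level-2 test resolves to roughly sh <build>/test_helper.sh <build>/sblat2 <src>/test/sblat2.dat SBLAT2.SUMM (paths illustrative), while on Windows, where no sh can be assumed, the PowerShell helper performs the same steps via Get-Content piping.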
"${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) + COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) add_test(NAME "${float_type}blas3" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat3" "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) + COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) endforeach() From 5163a85d4054ee377a20301831bff949505624bb Mon Sep 17 00:00:00 2001 From: xoviat Date: Sun, 5 May 2019 13:09:48 -0500 Subject: [PATCH 002/681] add gitignore directory --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e9d08ca7e..79e116271 100644 --- a/.gitignore +++ b/.gitignore @@ -88,3 +88,4 @@ build.* benchmark/*.goto benchmark/smallscaling +.vscode \ No newline at end of file From 6cfd6195c5aae0813d4335863d55a0ecf7a5d3a8 Mon Sep 17 00:00:00 2001 From: xoviat Date: Sun, 5 May 2019 13:10:36 -0500 Subject: [PATCH 003/681] param: define constant as blaslong to prevent overflow --- param.h | 96 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/param.h b/param.h index 4dcd96a75..71d423831 100644 --- a/param.h +++ b/param.h @@ -72,6 +72,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef PARAM_H #define PARAM_H +#include "common.h" + #ifdef OPTERON #define SNUMOPT 4 @@ -79,7 +81,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 256 -#define GEMM_DEFAULT_ALIGN 0x01ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x01ffffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -151,7 +153,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -231,7 +233,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL @@ -324,7 +326,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL @@ -416,7 +418,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL @@ -509,7 +511,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL @@ -601,7 +603,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -719,7 +721,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 384 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -767,7 +769,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 256 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -814,7 +816,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 256 -#define GEMM_DEFAULT_ALIGN 0x01ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x01ffffUL #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -883,7 +885,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #ifdef HAVE_SSE #define SGEMM_DEFAULT_UNROLL_M 8 @@ -938,7 +940,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #ifdef CORE_YONAH #define SGEMM_DEFAULT_UNROLL_M 4 @@ -1004,7 +1006,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 32 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SYMV_P 8 @@ -1061,7 +1063,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 256 #endif -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SYMV_P 8 @@ -1121,7 +1123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 448 #define GEMM_DEFAULT_OFFSET_B 128 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1194,7 +1196,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 128 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1265,7 +1267,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 128 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1337,7 +1339,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 32 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1410,7 +1412,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1503,7 +1505,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1623,7 +1625,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1746,7 +1748,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SYMV_P 8 @@ -1808,7 +1810,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 128 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -1862,7 +1864,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 512 #define GEMM_DEFAULT_OFFSET_B 512 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -1930,7 +1932,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 8192 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -1957,7 +1959,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef PPCG4 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 1024 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -1988,7 +1990,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 2688 #define GEMM_DEFAULT_OFFSET_B 3072 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2029,7 +2031,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A (32 * 0) #define GEMM_DEFAULT_OFFSET_B (32 * 0) -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2065,7 +2067,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A (32 * 0) #define GEMM_DEFAULT_OFFSET_B (32 * 0) -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2100,7 +2102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER3) || defined(POWER4) || defined(POWER5) #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2173,7 +2175,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 384 #define GEMM_DEFAULT_OFFSET_B 1024 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2205,7 +2207,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -2269,7 +2271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -2301,7 +2303,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2332,7 +2334,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -2368,7 +2370,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2409,7 +2411,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -2450,7 +2452,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL #ifdef HAVE_MSA #define SGEMM_DEFAULT_UNROLL_M 8 @@ -2502,7 +2504,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2543,7 +2545,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -2584,7 +2586,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 16 @@ -2750,7 +2752,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -2791,7 +2793,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2832,7 +2834,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2873,7 +2875,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -2912,7 +2914,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2994,7 +2996,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_N 2 From fdf71d66b3799f730bae282edf84345ccdf7c21b Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 19 Nov 2020 20:50:42 +1100 Subject: [PATCH 004/681] POWER10: Fix ld version detection LDVERSIONGTEQ35 needs to escape the '>' character. LDVERSIONGTEQ35 is checking the system ld version, which may be different from the toolchain being used to compile OpenBLAS. We don't have a path to the linker in our Makefiles, so (ab)use gcc -Wl,--version to get the version of ld in our toolchain. --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index aae7ba503..6ee8beff8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -672,7 +672,7 @@ DYNAMIC_CORE += POWER9 else $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif -LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` >= 35) +LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35) ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11) DYNAMIC_CORE += POWER10 CCOMMON_OPT += -DHAVE_P10_SUPPORT From 043f3d6faa797e0fe79c165b0a31acf0cf8f2b38 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 19 Nov 2020 21:04:10 +1100 Subject: [PATCH 005/681] POWER10: Use POWER9 as a fallback If the toolchain is too old, or the MMA feature isn't set on a POWER10, fall back to the POWER9 loops.
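The change that follows amounts to runtime dispatch on CPU features: take the POWER10 path only when the toolchain was able to build it (HAVE_P10_SUPPORT) and the running CPU advertises ISA 3.1 plus MMA; otherwise a POWER10 machine is handed the POWER9 kernels rather than nothing. Below is a minimal standalone sketch of that pattern, assuming a PowerPC target with a GCC and glibc recent enough to know the "arch_3_1", "mma" and "power10" names; kernel_power9/kernel_power10 are illustrative stand-ins for the gotoblas_POWER9/gotoblas_POWER10 kernel tables, not OpenBLAS symbols.

/* Minimal sketch of the fallback pattern, not OpenBLAS code. */
#include <stdio.h>

static void kernel_power9(void)  { puts("running POWER9 kernels");  }
static void kernel_power10(void) { puts("running POWER10 kernels"); }

typedef void (*kernel_fn)(void);

static kernel_fn pick_kernel(void) {
#ifdef HAVE_P10_SUPPORT
    /* The toolchain could compile POWER10 code: use it only when the
       CPU really implements ISA 3.1 and the MMA facility. */
    if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"))
        return kernel_power10;
#endif
    /* Old toolchain, or MMA switched off: a POWER10 machine still gets
       a working path by falling back to the POWER9 kernels. */
    if (__builtin_cpu_is("power10"))
        return kernel_power9;
    return kernel_power9; /* earlier per-CPU checks elided in this sketch */
}

int main(void) {
    pick_kernel()();
    return 0;
}

The ordering mirrors the patch: the guarded POWER10 test runs first, so the unguarded __builtin_cpu_is("power10") check is only reached when the fast path is unavailable.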
--- driver/others/dynamic_power.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 85fc5b3ba..d60ae68fc 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -52,6 +52,9 @@ static gotoblas_t *get_coretype(void) { if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) return &gotoblas_POWER10; #endif + /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ + if (__builtin_cpu_is("power10")) + return &gotoblas_POWER9; return NULL; } From 213c0e7abb6ab909479e8e956b159c040a1782f8 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Fri, 4 Dec 2020 17:07:06 -0600 Subject: [PATCH 006/681] Added special unrolled vectorized versions of "Solve" for specific sizes, in DTRSM and STRSM, to improve performance in Power9 and Power10. --- kernel/power/KERNEL.POWER10 | 18 +- kernel/power/KERNEL.POWER9 | 14 +- kernel/power/trsm_kernel_LN_power10.c | 1280 +++++++++++++++++++++++++ kernel/power/trsm_kernel_LT_power10.c | 1265 ++++++++++++++++++++++++ kernel/power/trsm_kernel_RN_power10.c | 828 ++++++++++++++++ kernel/power/trsm_kernel_RT_power10.c | 855 +++++++++++++++++ 6 files changed, 4244 insertions(+), 16 deletions(-) create mode 100644 kernel/power/trsm_kernel_LN_power10.c create mode 100644 kernel/power/trsm_kernel_LT_power10.c create mode 100644 kernel/power/trsm_kernel_RN_power10.c create mode 100644 kernel/power/trsm_kernel_RT_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index c25cd9f04..d61f5194a 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -63,15 +63,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_power10.c +STRSMKERNEL_LT = trsm_kernel_LT_power10.c +STRSMKERNEL_RN = trsm_kernel_RN_power10.c +STRSMKERNEL_RT = trsm_kernel_RT_power10.c + +DTRSMKERNEL_LN = trsm_kernel_LN_power10.c +DTRSMKERNEL_LT = trsm_kernel_LT_power10.c +DTRSMKERNEL_RN = trsm_kernel_RN_power10.c +DTRSMKERNEL_RT = trsm_kernel_RT_power10.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index ab8fbfcd9..2bd2516de 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -52,15 +52,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_power10.c +STRSMKERNEL_LT = trsm_kernel_LT_power10.c +STRSMKERNEL_RN = trsm_kernel_RN_power10.c +STRSMKERNEL_RT = trsm_kernel_RT_power10.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LN = trsm_kernel_LN_power10.c DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c 
-DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_RN = trsm_kernel_RN_power10.c +DTRSMKERNEL_RT = trsm_kernel_RT_power10.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/power/trsm_kernel_LN_power10.c b/kernel/power/trsm_kernel_LN_power10.c new file mode 100644 index 000000000..5ca1603a6 --- /dev/null +++ b/kernel/power/trsm_kernel_LN_power10.c @@ -0,0 +1,1280 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include <altivec.h> + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + b[56] = (c0[7] *= a[63]); + b[57] = (c1[7] *= a[63]); + b[58] = (c2[7] *= a[63]); + b[59] = (c3[7] *= a[63]); + b[60] = (c4[7] *= a[63]); + b[61] = (c5[7] *= a[63]); + b[62] = (c6[7] *= a[63]); + b[63] = (c7[7] *= a[63]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[29], 0); + VbS3 = vec_splat(Vb[29], 1); + VbS4 = vec_splat(Vb[30], 0); + VbS5 = vec_splat(Vb[30], 1); + VbS6 = vec_splat(Vb[31], 0); + VbS7 = vec_splat(Vb[31], 1); + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[28], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[29], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[30], Vc7[2]); + c0[6] -= c0[7] * a[62]; + c1[6] -= c1[7] * a[62]; + c2[6] -= c2[7] * a[62]; + c3[6] -= c3[7] * a[62]; + c4[6] -= c4[7] * a[62]; + c5[6] -= c5[7] * a[62]; +
c6[6] -= c6[7] * a[62]; + c7[6] -= c7[7] * a[62]; + + b[48] = (c0[6] *= a[54]); + b[49] = (c1[6] *= a[54]); + b[50] = (c2[6] *= a[54]); + b[51] = (c3[6] *= a[54]); + b[52] = (c4[6] *= a[54]); + b[53] = (c5[6] *= a[54]); + b[54] = (c6[6] *= a[54]); + b[55] = (c7[6] *= a[54]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[25], 0); + VbS3 = vec_splat(Vb[25], 1); + VbS4 = vec_splat(Vb[26], 0); + VbS5 = vec_splat(Vb[26], 1); + VbS6 = vec_splat(Vb[27], 0); + VbS7 = vec_splat(Vb[27], 1); + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[24], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[25], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[26], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[24], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[25], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[26], Vc7[2]); + + b[40] = (c0[5] *= a[45]); + b[41] = (c1[5] *= a[45]); + b[42] = (c2[5] *= a[45]); + b[43] = (c3[5] *= a[45]); + b[44] = (c4[5] *= a[45]); + b[45] = (c5[5] *= a[45]); + b[46] = (c6[5] *= a[45]); + b[47] = (c7[5] *= a[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + VbS5 = vec_splat(Vb[22], 1); + VbS6 = vec_splat(Vb[23], 0); + VbS7 = vec_splat(Vb[23], 1); + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[21], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[21], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[20], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[21], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[20], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[21], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[20], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[21], Vc7[1]); + c0[4] -= c0[5] * a[44]; + c1[4] -= c1[5] * a[44]; + c2[4] -= c2[5] * a[44]; + c3[4] -= c3[5] * a[44]; + c4[4] -= c4[5] * a[44]; + c5[4] -= c5[5] * a[44]; + c6[4] -= c6[5] * a[44]; + c7[4] -= c7[5] * a[44]; + + b[32] = (c0[4] *= a[36]); + b[33] = (c1[4] *= a[36]); + b[34] = (c2[4] *= a[36]); + b[35] = (c3[4] *= a[36]); + b[36] = (c4[4] *= a[36]); + b[37] = (c5[4] *= a[36]); + b[38] = (c6[4] *= a[36]); + b[39] = (c7[4] *= a[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + VbS4 = vec_splat(Vb[18], 0); + VbS5 = vec_splat(Vb[18], 1); + VbS6 = vec_splat(Vb[19], 0); + VbS7 = vec_splat(Vb[19], 1); + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, 
Va[16], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[16], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[17], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[16], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[17], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[16], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[17], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[16], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[17], Vc7[1]); + + b[24] = (c0[3] *= a[27]); + b[25] = (c1[3] *= a[27]); + b[26] = (c2[3] *= a[27]); + b[27] = (c3[3] *= a[27]); + b[28] = (c4[3] *= a[27]); + b[29] = (c5[3] *= a[27]); + b[30] = (c6[3] *= a[27]); + b[31] = (c7[3] *= a[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + VbS3 = vec_splat(Vb[13], 1); + VbS4 = vec_splat(Vb[14], 0); + VbS5 = vec_splat(Vb[14], 1); + VbS6 = vec_splat(Vb[15], 0); + VbS7 = vec_splat(Vb[15], 1); + Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[12], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[12], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[12], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[12], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[12], Vc7[0]); + c0[2] -= c0[3] * a[26]; + c1[2] -= c1[3] * a[26]; + c2[2] -= c2[3] * a[26]; + c3[2] -= c3[3] * a[26]; + c4[2] -= c4[3] * a[26]; + c5[2] -= c5[3] * a[26]; + c6[2] -= c6[3] * a[26]; + c7[2] -= c7[3] * a[26]; + + b[16] = (c0[2] *= a[18]); + b[17] = (c1[2] *= a[18]); + b[18] = (c2[2] *= a[18]); + b[19] = (c3[2] *= a[18]); + b[20] = (c4[2] *= a[18]); + b[21] = (c5[2] *= a[18]); + b[22] = (c6[2] *= a[18]); + b[23] = (c7[2] *= a[18]); + VbS0 = vec_splat(Vb[ 8], 0); + VbS1 = vec_splat(Vb[ 8], 1); + VbS2 = vec_splat(Vb[ 9], 0); + VbS3 = vec_splat(Vb[ 9], 1); + VbS4 = vec_splat(Vb[10], 0); + VbS5 = vec_splat(Vb[10], 1); + VbS6 = vec_splat(Vb[11], 0); + VbS7 = vec_splat(Vb[11], 1); + Vc0[0] = vec_nmsub(VbS0, Va[8], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[8], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[8], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[8], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[8], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[8], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[8], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[8], Vc7[0]); + + b[ 8] = (c0[1] *= a[9]); + b[ 9] = (c1[1] *= a[9]); + b[10] = (c2[1] *= a[9]); + b[11] = (c3[1] *= a[9]); + b[12] = (c4[1] *= a[9]); + b[13] = (c5[1] *= a[9]); + b[14] = (c6[1] *= a[9]); + b[15] = (c7[1] *= a[9]); + c0[0] -= c0[1] * a[8]; + c1[0] -= c1[1] * a[8]; + c2[0] -= c2[1] * a[8]; + c3[0] -= c3[1] * a[8]; + c4[0] -= c4[1] * a[8]; + c5[0] -= c5[1] * a[8]; + c6[0] -= c6[1] * a[8]; + c7[0] -= c7[1] * a[8]; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + 
vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + int j; + + b[120] = (c0[15] *= a[255]); + b[121] = (c1[15] *= a[255]); + b[122] = (c2[15] *= a[255]); + b[123] = (c3[15] *= a[255]); + b[124] = (c4[15] *= a[255]); + b[125] = (c5[15] *= a[255]); + b[126] = (c6[15] *= a[255]); + b[127] = (c7[15] *= a[255]); + VbS0 = vec_splat(Vb[30], 0); + VbS1 = vec_splat(Vb[30], 1); + VbS2 = vec_splat(Vb[30], 2); + VbS3 = vec_splat(Vb[30], 3); + VbS4 = vec_splat(Vb[31], 0); + VbS5 = vec_splat(Vb[31], 1); + VbS6 = vec_splat(Vb[31], 2); + VbS7 = vec_splat(Vb[31], 3); + Vc0[0] = vec_nmsub(VbS0, Va[60], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[61], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[62], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[60], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[61], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[62], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[60], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[61], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[62], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[60], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[61], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[62], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[60], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[61], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[62], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[60], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[61], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[62], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[60], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[61], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[62], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[60], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[61], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[62], Vc7[2]); + c0[12] -= b[120] * a[252]; + c0[13] -= b[120] * a[253]; + c0[14] -= b[120] * a[254]; + c1[12] -= b[121] * a[252]; + c1[13] -= b[121] * a[253]; + c1[14] -= b[121] * a[254]; + c2[12] -= b[122] * a[252]; + c2[13] -= b[122] * a[253]; + c2[14] -= b[122] * a[254]; + c3[12] -= b[123] * a[252]; + c3[13] -= b[123] * a[253]; + c3[14] -= b[123] * a[254]; + c4[12] -= b[124] * a[252]; + c4[13] -= b[124] * a[253]; + c4[14] -= b[124] * a[254]; + c5[12] -= b[125] * a[252]; + c5[13] -= b[125] * a[253]; + c5[14] -= b[125] * a[254]; + c6[12] -= b[126] * a[252]; + c6[13] -= b[126] * a[253]; + c6[14] -= b[126] * a[254]; + c7[12] -= b[127] * a[252]; + c7[13] -= b[127] * a[253]; + c7[14] -= b[127] * a[254]; + + b[112] = (c0[14] *= a[238]); + b[113] = (c1[14] *= a[238]); + b[114] = (c2[14] *= a[238]); + b[115] = (c3[14] *= a[238]); + b[116] = (c4[14] *= a[238]); + b[117] = (c5[14] *= a[238]); + b[118] = (c6[14] *= a[238]); + b[119] = (c7[14] *= a[238]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[28], 2); + VbS3 = vec_splat(Vb[28], 3); + VbS4 = vec_splat(Vb[29], 0); + VbS5 = vec_splat(Vb[29], 1); + VbS6 = vec_splat(Vb[29], 2); + VbS7 = vec_splat(Vb[29], 3); + Vc0[0] = vec_nmsub(VbS0, Va[56], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[57], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[58], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[56], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[57], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[58], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[56], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[57], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[58], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[56], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[57], 
Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[58], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[56], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[57], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[58], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[56], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[57], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[58], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[56], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[57], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[58], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[56], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[57], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[58], Vc7[2]); + c0[12] -= b[112] * a[236]; + c0[13] -= b[112] * a[237]; + c1[12] -= b[113] * a[236]; + c1[13] -= b[113] * a[237]; + c2[12] -= b[114] * a[236]; + c2[13] -= b[114] * a[237]; + c3[12] -= b[115] * a[236]; + c3[13] -= b[115] * a[237]; + c4[12] -= b[116] * a[236]; + c4[13] -= b[116] * a[237]; + c5[12] -= b[117] * a[236]; + c5[13] -= b[117] * a[237]; + c6[12] -= b[118] * a[236]; + c6[13] -= b[118] * a[237]; + c7[12] -= b[119] * a[236]; + c7[13] -= b[119] * a[237]; + + b[104] = (c0[13] *= a[221]); + b[105] = (c1[13] *= a[221]); + b[106] = (c2[13] *= a[221]); + b[107] = (c3[13] *= a[221]); + b[108] = (c4[13] *= a[221]); + b[109] = (c5[13] *= a[221]); + b[110] = (c6[13] *= a[221]); + b[111] = (c7[13] *= a[221]); + VbS0 = vec_splat(Vb[26], 0); + VbS1 = vec_splat(Vb[26], 1); + VbS2 = vec_splat(Vb[26], 2); + VbS3 = vec_splat(Vb[26], 3); + VbS4 = vec_splat(Vb[27], 0); + VbS5 = vec_splat(Vb[27], 1); + VbS6 = vec_splat(Vb[27], 2); + VbS7 = vec_splat(Vb[27], 3); + Vc0[0] = vec_nmsub(VbS0, Va[52], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[53], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[54], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[52], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[53], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[54], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[52], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[53], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[54], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[52], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[53], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[54], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[52], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[53], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[54], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[52], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[53], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[54], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[52], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[53], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[54], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[52], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[53], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[54], Vc7[2]); + c0[12] -= b[104] * a[220]; + c1[12] -= b[105] * a[220]; + c2[12] -= b[106] * a[220]; + c3[12] -= b[107] * a[220]; + c4[12] -= b[108] * a[220]; + c5[12] -= b[109] * a[220]; + c6[12] -= b[110] * a[220]; + c7[12] -= b[111] * a[220]; + + b[ 96] = (c0[12] *= a[204]); + b[ 97] = (c1[12] *= a[204]); + b[ 98] = (c2[12] *= a[204]); + b[ 99] = (c3[12] *= a[204]); + b[100] = (c4[12] *= a[204]); + b[101] = (c5[12] *= a[204]); + b[102] = (c6[12] *= a[204]); + b[103] = (c7[12] *= a[204]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[24], 2); + VbS3 = vec_splat(Vb[24], 3); + VbS4 = vec_splat(Vb[25], 0); + VbS5 = vec_splat(Vb[25], 1); + VbS6 = vec_splat(Vb[25], 2); + VbS7 = vec_splat(Vb[25], 3); + Vc0[0] = vec_nmsub(VbS0, Va[48], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[49], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[50], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[48], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[49], Vc1[1]); + 
Vc1[2] = vec_nmsub(VbS1, Va[50], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[48], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[49], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[50], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[48], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[49], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[50], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[48], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[49], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[50], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[48], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[49], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[50], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[48], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[49], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[50], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[48], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[49], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[50], Vc7[2]); + + b[88] = (c0[11] *= a[187]); + b[89] = (c1[11] *= a[187]); + b[90] = (c2[11] *= a[187]); + b[91] = (c3[11] *= a[187]); + b[92] = (c4[11] *= a[187]); + b[93] = (c5[11] *= a[187]); + b[94] = (c6[11] *= a[187]); + b[95] = (c7[11] *= a[187]); + VbS0 = vec_splat(Vb[22], 0); + VbS1 = vec_splat(Vb[22], 1); + VbS2 = vec_splat(Vb[22], 2); + VbS3 = vec_splat(Vb[22], 3); + VbS4 = vec_splat(Vb[23], 0); + VbS5 = vec_splat(Vb[23], 1); + VbS6 = vec_splat(Vb[23], 2); + VbS7 = vec_splat(Vb[23], 3); + Vc0[0] = vec_nmsub(VbS0, Va[44], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[45], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[44], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[45], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[44], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[45], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[44], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[45], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[44], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[45], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[44], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[45], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[44], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[45], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[44], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[45], Vc7[1]); + c0[ 8] -= b[88] * a[184]; + c0[ 9] -= b[88] * a[185]; + c0[10] -= b[88] * a[186]; + c1[ 8] -= b[89] * a[184]; + c1[ 9] -= b[89] * a[185]; + c1[10] -= b[89] * a[186]; + c2[ 8] -= b[90] * a[184]; + c2[ 9] -= b[90] * a[185]; + c2[10] -= b[90] * a[186]; + c3[ 8] -= b[91] * a[184]; + c3[ 9] -= b[91] * a[185]; + c3[10] -= b[91] * a[186]; + c4[ 8] -= b[92] * a[184]; + c4[ 9] -= b[92] * a[185]; + c4[10] -= b[92] * a[186]; + c5[ 8] -= b[93] * a[184]; + c5[ 9] -= b[93] * a[185]; + c5[10] -= b[93] * a[186]; + c6[ 8] -= b[94] * a[184]; + c6[ 9] -= b[94] * a[185]; + c6[10] -= b[94] * a[186]; + c7[ 8] -= b[95] * a[184]; + c7[ 9] -= b[95] * a[185]; + c7[10] -= b[95] * a[186]; + + b[80] = (c0[10] *= a[170]); + b[81] = (c1[10] *= a[170]); + b[82] = (c2[10] *= a[170]); + b[83] = (c3[10] *= a[170]); + b[84] = (c4[10] *= a[170]); + b[85] = (c5[10] *= a[170]); + b[86] = (c6[10] *= a[170]); + b[87] = (c7[10] *= a[170]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[20], 2); + VbS3 = vec_splat(Vb[20], 3); + VbS4 = vec_splat(Vb[21], 0); + VbS5 = vec_splat(Vb[21], 1); + VbS6 = vec_splat(Vb[21], 2); + VbS7 = vec_splat(Vb[21], 3); + Vc0[0] = vec_nmsub(VbS0, Va[40], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[41], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[40], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[41], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[40], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[41], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[40], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[41], Vc3[1]); + Vc4[0] = 
vec_nmsub(VbS4, Va[40], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[41], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[40], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[41], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[40], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[41], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[40], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[41], Vc7[1]); + c0[8] -= b[80] * a[168]; + c0[9] -= b[80] * a[169]; + c1[8] -= b[81] * a[168]; + c1[9] -= b[81] * a[169]; + c2[8] -= b[82] * a[168]; + c2[9] -= b[82] * a[169]; + c3[8] -= b[83] * a[168]; + c3[9] -= b[83] * a[169]; + c4[8] -= b[84] * a[168]; + c4[9] -= b[84] * a[169]; + c5[8] -= b[85] * a[168]; + c5[9] -= b[85] * a[169]; + c6[8] -= b[86] * a[168]; + c6[9] -= b[86] * a[169]; + c7[8] -= b[87] * a[168]; + c7[9] -= b[87] * a[169]; + + b[72] = (c0[9] *= a[153]); + b[73] = (c1[9] *= a[153]); + b[74] = (c2[9] *= a[153]); + b[75] = (c3[9] *= a[153]); + b[76] = (c4[9] *= a[153]); + b[77] = (c5[9] *= a[153]); + b[78] = (c6[9] *= a[153]); + b[79] = (c7[9] *= a[153]); + VbS0 = vec_splat(Vb[18], 0); + VbS1 = vec_splat(Vb[18], 1); + VbS2 = vec_splat(Vb[18], 2); + VbS3 = vec_splat(Vb[18], 3); + VbS4 = vec_splat(Vb[19], 0); + VbS5 = vec_splat(Vb[19], 1); + VbS6 = vec_splat(Vb[19], 2); + VbS7 = vec_splat(Vb[19], 3); + Vc0[0] = vec_nmsub(VbS0, Va[36], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[37], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[36], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[37], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[36], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[37], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[36], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[37], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[36], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[37], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[36], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[37], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[36], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[37], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[36], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[37], Vc7[1]); + c0[8] -= b[72] * a[152]; + c1[8] -= b[73] * a[152]; + c2[8] -= b[74] * a[152]; + c3[8] -= b[75] * a[152]; + c4[8] -= b[76] * a[152]; + c5[8] -= b[77] * a[152]; + c6[8] -= b[78] * a[152]; + c7[8] -= b[79] * a[152]; + + b[64] = (c0[8] *= a[136]); + b[65] = (c1[8] *= a[136]); + b[66] = (c2[8] *= a[136]); + b[67] = (c3[8] *= a[136]); + b[68] = (c4[8] *= a[136]); + b[69] = (c5[8] *= a[136]); + b[70] = (c6[8] *= a[136]); + b[71] = (c7[8] *= a[136]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[16], 2); + VbS3 = vec_splat(Vb[16], 3); + VbS4 = vec_splat(Vb[17], 0); + VbS5 = vec_splat(Vb[17], 1); + VbS6 = vec_splat(Vb[17], 2); + VbS7 = vec_splat(Vb[17], 3); + Vc0[0] = vec_nmsub(VbS0, Va[32], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[33], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[32], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[33], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[32], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[33], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[32], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[33], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[32], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[33], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[32], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[33], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[32], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[33], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[32], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[33], Vc7[1]); + + b[56] = (c0[7] *= a[119]); + b[57] = (c1[7] *= a[119]); + b[58] = (c2[7] *= a[119]); + b[59] = (c3[7] *= a[119]); + b[60] = (c4[7] *= a[119]); + b[61] = (c5[7] *= a[119]); + b[62] = (c6[7] 
*= a[119]); + b[63] = (c7[7] *= a[119]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[28], Vc7[0]); + c0[4] -= b[56] * a[116]; + c0[5] -= b[56] * a[117]; + c0[6] -= b[56] * a[118]; + c1[4] -= b[57] * a[116]; + c1[5] -= b[57] * a[117]; + c1[6] -= b[57] * a[118]; + c2[4] -= b[58] * a[116]; + c2[5] -= b[58] * a[117]; + c2[6] -= b[58] * a[118]; + c3[4] -= b[59] * a[116]; + c3[5] -= b[59] * a[117]; + c3[6] -= b[59] * a[118]; + c4[4] -= b[60] * a[116]; + c4[5] -= b[60] * a[117]; + c4[6] -= b[60] * a[118]; + c5[4] -= b[61] * a[116]; + c5[5] -= b[61] * a[117]; + c5[6] -= b[61] * a[118]; + c6[4] -= b[62] * a[116]; + c6[5] -= b[62] * a[117]; + c6[6] -= b[62] * a[118]; + c7[4] -= b[63] * a[116]; + c7[5] -= b[63] * a[117]; + c7[6] -= b[63] * a[118]; + + b[48] = (c0[6] *= a[102]); + b[49] = (c1[6] *= a[102]); + b[50] = (c2[6] *= a[102]); + b[51] = (c3[6] *= a[102]); + b[52] = (c4[6] *= a[102]); + b[53] = (c5[6] *= a[102]); + b[54] = (c6[6] *= a[102]); + b[55] = (c7[6] *= a[102]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + VbS7 = vec_splat(Vb[13], 3); + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[24], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[24], Vc7[0]); + c0[4] -= b[48] * a[100]; + c0[5] -= b[48] * a[101]; + c1[4] -= b[49] * a[100]; + c1[5] -= b[49] * a[101]; + c2[4] -= b[50] * a[100]; + c2[5] -= b[50] * a[101]; + c3[4] -= b[51] * a[100]; + c3[5] -= b[51] * a[101]; + c4[4] -= b[52] * a[100]; + c4[5] -= b[52] * a[101]; + c5[4] -= b[53] * a[100]; + c5[5] -= b[53] * a[101]; + c6[4] -= b[54] * a[100]; + c6[5] -= b[54] * a[101]; + c7[4] -= b[55] * a[100]; + c7[5] -= b[55] * a[101]; + + b[40] = (c0[5] *= a[85]); + b[41] = (c1[5] *= a[85]); + b[42] = (c2[5] *= a[85]); + b[43] = (c3[5] *= a[85]); + b[44] = (c4[5] *= a[85]); + b[45] = (c5[5] *= a[85]); + b[46] = (c6[5] *= a[85]); + b[47] = (c7[5] *= a[85]); + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + VbS6 = vec_splat(Vb[11], 2); + VbS7 = vec_splat(Vb[11], 3); + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[20], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[20], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[20], Vc7[0]); + c0[4] -= b[40] * a[84]; + c1[4] -= b[41] * a[84]; + c2[4] -= b[42] * a[84]; + c3[4] -= b[43] * a[84]; + c4[4] -= b[44] * a[84]; + c5[4] -= b[45] * a[84]; + c6[4] -= b[46] 
* a[84]; + c7[4] -= b[47] * a[84]; + + b[32] = (c0[4] *= a[68]); + b[33] = (c1[4] *= a[68]); + b[34] = (c2[4] *= a[68]); + b[35] = (c3[4] *= a[68]); + b[36] = (c4[4] *= a[68]); + b[37] = (c5[4] *= a[68]); + b[38] = (c6[4] *= a[68]); + b[39] = (c7[4] *= a[68]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + VbS5 = vec_splat(Vb[9], 1); + VbS6 = vec_splat(Vb[9], 2); + VbS7 = vec_splat(Vb[9], 3); + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[16], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[16], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[16], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[16], Vc7[0]); + + b[24] = (c0[3] *= a[51]); + b[25] = (c1[3] *= a[51]); + b[26] = (c2[3] *= a[51]); + b[27] = (c3[3] *= a[51]); + b[28] = (c4[3] *= a[51]); + b[29] = (c5[3] *= a[51]); + b[30] = (c6[3] *= a[51]); + b[31] = (c7[3] *= a[51]); + c0[0] -= b[24] * a[48]; + c0[1] -= b[24] * a[49]; + c0[2] -= b[24] * a[50]; + c1[0] -= b[25] * a[48]; + c1[1] -= b[25] * a[49]; + c1[2] -= b[25] * a[50]; + c2[0] -= b[26] * a[48]; + c2[1] -= b[26] * a[49]; + c2[2] -= b[26] * a[50]; + c3[0] -= b[27] * a[48]; + c3[1] -= b[27] * a[49]; + c3[2] -= b[27] * a[50]; + c4[0] -= b[28] * a[48]; + c4[1] -= b[28] * a[49]; + c4[2] -= b[28] * a[50]; + c5[0] -= b[29] * a[48]; + c5[1] -= b[29] * a[49]; + c5[2] -= b[29] * a[50]; + c6[0] -= b[30] * a[48]; + c6[1] -= b[30] * a[49]; + c6[2] -= b[30] * a[50]; + c7[0] -= b[31] * a[48]; + c7[1] -= b[31] * a[49]; + c7[2] -= b[31] * a[50]; + + b[16] = (c0[2] *= a[34]); + b[17] = (c1[2] *= a[34]); + b[18] = (c2[2] *= a[34]); + b[19] = (c3[2] *= a[34]); + b[20] = (c4[2] *= a[34]); + b[21] = (c5[2] *= a[34]); + b[22] = (c6[2] *= a[34]); + b[23] = (c7[2] *= a[34]); + c0[0] -= b[16] * a[32]; + c0[1] -= b[16] * a[33]; + c1[0] -= b[17] * a[32]; + c1[1] -= b[17] * a[33]; + c2[0] -= b[18] * a[32]; + c2[1] -= b[18] * a[33]; + c3[0] -= b[19] * a[32]; + c3[1] -= b[19] * a[33]; + c4[0] -= b[20] * a[32]; + c4[1] -= b[20] * a[33]; + c5[0] -= b[21] * a[32]; + c5[1] -= b[21] * a[33]; + c6[0] -= b[22] * a[32]; + c6[1] -= b[22] * a[33]; + c7[0] -= b[23] * a[32]; + c7[1] -= b[23] * a[33]; + + b[ 8] = (c0[1] *= a[17]); + b[ 9] = (c1[1] *= a[17]); + b[10] = (c2[1] *= a[17]); + b[11] = (c3[1] *= a[17]); + b[12] = (c4[1] *= a[17]); + b[13] = (c5[1] *= a[17]); + b[14] = (c6[1] *= a[17]); + b[15] = (c7[1] *= a[17]); + c0[0] -= b[ 8] * a[16]; + c1[0] -= b[ 9] * a[16]; + c2[0] -= b[10] * a[16]; + c3[0] -= b[11] * a[16]; + c4[0] -= b[12] * a[16]; + c5[0] -= b[13] * a[16]; + c6[0] -= b[14] * a[16]; + c7[0] -= b[15] * a[16]; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline __attribute__ ((always_inline)) void 
solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 
1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_LT_power10.c b/kernel/power/trsm_kernel_LT_power10.c new file mode 100644 index 000000000..14ff12fe4 --- /dev/null +++ b/kernel/power/trsm_kernel_LT_power10.c @@ -0,0 +1,1265 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include <altivec.h> + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[1], 0); + VbS3 = vec_splat(Vb[1], 1); + VbS4 = vec_splat(Vb[2], 0); + VbS5 = vec_splat(Vb[2], 1); + VbS6 = vec_splat(Vb[3], 0); + VbS7 = vec_splat(Vb[3], 1); + Vc0[1] = vec_nmsub(VbS0, Va[1], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[2], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[3], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + c0[1] -= c0[0] * a[1]; + c1[1] -= c1[0] * a[1]; + c2[1] -= c2[0] * a[1]; + c3[1] -= c3[0] * a[1]; + c4[1] -= c4[0] * a[1]; + c5[1] -= c5[0] * a[1]; + c6[1] -= c6[0] * a[1]; + c7[1] -= c7[0] * a[1]; + + 
b[ 8] = (c0[1] *= a[9]); + b[ 9] = (c1[1] *= a[9]); + b[10] = (c2[1] *= a[9]); + b[11] = (c3[1] *= a[9]); + b[12] = (c4[1] *= a[9]); + b[13] = (c5[1] *= a[9]); + b[14] = (c6[1] *= a[9]); + b[15] = (c7[1] *= a[9]); + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[5], 0); + VbS3 = vec_splat(Vb[5], 1); + VbS4 = vec_splat(Vb[6], 0); + VbS5 = vec_splat(Vb[6], 1); + VbS6 = vec_splat(Vb[7], 0); + VbS7 = vec_splat(Vb[7], 1); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[5], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[6], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[7], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[7], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[7], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[7], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[7], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[7], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[7], Vc7[3]); + + b[16] = (c0[2] *= a[18]); + b[17] = (c1[2] *= a[18]); + b[18] = (c2[2] *= a[18]); + b[19] = (c3[2] *= a[18]); + b[20] = (c4[2] *= a[18]); + b[21] = (c5[2] *= a[18]); + b[22] = (c6[2] *= a[18]); + b[23] = (c7[2] *= a[18]); + VbS0 = vec_splat(Vb[ 8], 0); + VbS1 = vec_splat(Vb[ 8], 1); + VbS2 = vec_splat(Vb[ 9], 0); + VbS3 = vec_splat(Vb[ 9], 1); + VbS4 = vec_splat(Vb[10], 0); + VbS5 = vec_splat(Vb[10], 1); + VbS6 = vec_splat(Vb[11], 0); + VbS7 = vec_splat(Vb[11], 1); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[10], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[11], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[11], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[11], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[11], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[11], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[11], Vc7[3]); + c0[3] -= c0[2] * a[19]; + c1[3] -= c1[2] * a[19]; + c2[3] -= c2[2] * a[19]; + c3[3] -= c3[2] * a[19]; + c4[3] -= c4[2] * a[19]; + c5[3] -= c5[2] * a[19]; + c6[3] -= c6[2] * a[19]; + c7[3] -= c7[2] * a[19]; + + b[24] = (c0[3] *= a[27]); + b[25] = (c1[3] *= a[27]); + b[26] = (c2[3] *= a[27]); + b[27] = (c3[3] *= a[27]); + b[28] = (c4[3] *= a[27]); + b[29] = (c5[3] *= a[27]); + b[30] = (c6[3] *= a[27]); + b[31] = (c7[3] *= a[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + VbS3 = vec_splat(Vb[13], 1); + VbS4 = vec_splat(Vb[14], 0); + VbS5 = vec_splat(Vb[14], 1); + VbS6 = vec_splat(Vb[15], 0); + VbS7 = vec_splat(Vb[15], 1); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, 
Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[14], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[15], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[15], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[15], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[15], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[15], Vc7[3]); + + b[32] = (c0[4] *= a[36]); + b[33] = (c1[4] *= a[36]); + b[34] = (c2[4] *= a[36]); + b[35] = (c3[4] *= a[36]); + b[36] = (c4[4] *= a[36]); + b[37] = (c5[4] *= a[36]); + b[38] = (c6[4] *= a[36]); + b[39] = (c7[4] *= a[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + VbS4 = vec_splat(Vb[18], 0); + VbS5 = vec_splat(Vb[18], 1); + VbS6 = vec_splat(Vb[19], 0); + VbS7 = vec_splat(Vb[19], 1); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[19], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[19], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[19], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[19], Vc7[3]); + c0[5] -= c0[4] * a[37]; + c1[5] -= c1[4] * a[37]; + c2[5] -= c2[4] * a[37]; + c3[5] -= c3[4] * a[37]; + c4[5] -= c4[4] * a[37]; + c5[5] -= c5[4] * a[37]; + c6[5] -= c6[4] * a[37]; + c7[5] -= c7[4] * a[37]; + + b[40] = (c0[5] *= a[45]); + b[41] = (c1[5] *= a[45]); + b[42] = (c2[5] *= a[45]); + b[43] = (c3[5] *= a[45]); + b[44] = (c4[5] *= a[45]); + b[45] = (c5[5] *= a[45]); + b[46] = (c6[5] *= a[45]); + b[47] = (c7[5] *= a[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + VbS5 = vec_splat(Vb[22], 1); + VbS6 = vec_splat(Vb[23], 0); + VbS7 = vec_splat(Vb[23], 1); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[23], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[23], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[23], Vc7[3]); + + b[48] = (c0[6] *= a[54]); + b[49] = (c1[6] *= a[54]); + b[50] = (c2[6] *= a[54]); + b[51] = (c3[6] *= a[54]); + b[52] = (c4[6] *= a[54]); + b[53] = (c5[6] *= a[54]); + b[54] = (c6[6] *= a[54]); + b[55] = (c7[6] *= a[54]); + c0[7] -= c0[6] * a[55]; + c1[7] -= c1[6] * a[55]; + c2[7] -= c2[6] * a[55]; + c3[7] -= c3[6] * a[55]; + c4[7] -= c4[6] * a[55]; + c5[7] -= c5[6] * a[55]; + c6[7] -= c6[6] * a[55]; + c7[7] -= c7[6] * a[55]; + + b[56] = (c0[7] *= a[63]); + b[57] = (c1[7] *= a[63]); + b[58] = (c2[7] *= a[63]); + b[59] = (c3[7] *= a[63]); + b[60] = (c4[7] *= a[63]); + b[61] = (c5[7] *= a[63]); + b[62] = (c6[7] *= a[63]); + b[63] = (c7[7] *= a[63]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + 
vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + int j; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[0], 2); + VbS3 = vec_splat(Vb[0], 3); + VbS4 = vec_splat(Vb[1], 0); + VbS5 = vec_splat(Vb[1], 1); + VbS6 = vec_splat(Vb[1], 2); + VbS7 = vec_splat(Vb[1], 3); + Vc0[1] = vec_nmsub(VbS0, Va[1], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[2], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[3], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + c0[1] -= b[0] * a[ 1]; + c0[2] -= b[0] * a[ 2]; + c0[3] -= b[0] * a[ 3]; + c1[1] -= b[1] * a[ 1]; + c1[2] -= b[1] * a[ 2]; + c1[3] -= b[1] * a[ 3]; + c2[1] -= b[2] * a[ 1]; + c2[2] -= b[2] * a[ 2]; + c2[3] -= b[2] * a[ 3]; + c3[1] -= b[3] * a[ 1]; + c3[2] -= b[3] * a[ 2]; + c3[3] -= b[3] * a[ 3]; + c4[1] -= b[4] * a[ 1]; + c4[2] -= b[4] * a[ 2]; + c4[3] -= b[4] * a[ 3]; + c5[1] -= b[5] * a[ 1]; + c5[2] -= b[5] * a[ 2]; + c5[3] -= b[5] * a[ 3]; + c6[1] -= b[6] * a[ 1]; + c6[2] -= b[6] * a[ 2]; + c6[3] -= b[6] * a[ 3]; + c7[1] -= b[7] * a[ 1]; + c7[2] -= b[7] * a[ 2]; + c7[3] -= b[7] * a[ 3]; + + b[ 8] = (c0[1] *= a[17]); + b[ 9] = (c1[1] *= a[17]); + b[10] = (c2[1] *= a[17]); + b[11] = (c3[1] *= a[17]); + b[12] = (c4[1] *= a[17]); + b[13] = (c5[1] *= a[17]); + b[14] = (c6[1] *= a[17]); + b[15] = (c7[1] *= a[17]); + VbS0 = vec_splat(Vb[2], 0); + VbS1 = vec_splat(Vb[2], 1); + VbS2 = vec_splat(Vb[2], 2); + VbS3 = vec_splat(Vb[2], 3); + VbS4 = vec_splat(Vb[3], 0); + VbS5 = vec_splat(Vb[3], 1); + VbS6 = vec_splat(Vb[3], 2); + VbS7 = vec_splat(Vb[3], 3); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[5], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[6], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[7], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[7], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[7], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[7], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[6], 
Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[7], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[7], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[7], Vc7[3]); + c0[2] -= b[ 8] * a[18]; + c0[3] -= b[ 8] * a[19]; + c1[2] -= b[ 9] * a[18]; + c1[3] -= b[ 9] * a[19]; + c2[2] -= b[10] * a[18]; + c2[3] -= b[10] * a[19]; + c3[2] -= b[11] * a[18]; + c3[3] -= b[11] * a[19]; + c4[2] -= b[12] * a[18]; + c4[3] -= b[12] * a[19]; + c5[2] -= b[13] * a[18]; + c5[3] -= b[13] * a[19]; + c6[2] -= b[14] * a[18]; + c6[3] -= b[14] * a[19]; + c7[2] -= b[15] * a[18]; + c7[3] -= b[15] * a[19]; + + b[16] = (c0[2] *= a[34]); + b[17] = (c1[2] *= a[34]); + b[18] = (c2[2] *= a[34]); + b[19] = (c3[2] *= a[34]); + b[20] = (c4[2] *= a[34]); + b[21] = (c5[2] *= a[34]); + b[22] = (c6[2] *= a[34]); + b[23] = (c7[2] *= a[34]); + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[4], 2); + VbS3 = vec_splat(Vb[4], 3); + VbS4 = vec_splat(Vb[5], 0); + VbS5 = vec_splat(Vb[5], 1); + VbS6 = vec_splat(Vb[5], 2); + VbS7 = vec_splat(Vb[5], 3); + Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[ 9], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[10], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[11], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[ 9], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[11], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[ 9], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[11], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[ 9], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[11], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[ 9], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[11], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[ 9], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[11], Vc7[3]); + c0[3] -= b[16] * a[35]; + c1[3] -= b[17] * a[35]; + c2[3] -= b[18] * a[35]; + c3[3] -= b[19] * a[35]; + c4[3] -= b[20] * a[35]; + c5[3] -= b[21] * a[35]; + c6[3] -= b[22] * a[35]; + c7[3] -= b[23] * a[35]; + + b[24] = (c0[3] *= a[51]); + b[25] = (c1[3] *= a[51]); + b[26] = (c2[3] *= a[51]); + b[27] = (c3[3] *= a[51]); + b[28] = (c4[3] *= a[51]); + b[29] = (c5[3] *= a[51]); + b[30] = (c6[3] *= a[51]); + b[31] = (c7[3] *= a[51]); + VbS0 = vec_splat(Vb[6], 0); + VbS1 = vec_splat(Vb[6], 1); + VbS2 = vec_splat(Vb[6], 2); + VbS3 = vec_splat(Vb[6], 3); + VbS4 = vec_splat(Vb[7], 0); + VbS5 = vec_splat(Vb[7], 1); + VbS6 = vec_splat(Vb[7], 2); + VbS7 = vec_splat(Vb[7], 3); + Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[13], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[14], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[15], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[13], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[14], Vc4[2]); + Vc4[3] = 
vec_nmsub(VbS4, Va[15], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[13], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[15], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[13], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[15], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[13], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[15], Vc7[3]); + + b[32] = (c0[4] *= a[68]); + b[33] = (c1[4] *= a[68]); + b[34] = (c2[4] *= a[68]); + b[35] = (c3[4] *= a[68]); + b[36] = (c4[4] *= a[68]); + b[37] = (c5[4] *= a[68]); + b[38] = (c6[4] *= a[68]); + b[39] = (c7[4] *= a[68]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + VbS5 = vec_splat(Vb[9], 1); + VbS6 = vec_splat(Vb[9], 2); + VbS7 = vec_splat(Vb[9], 3); + Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[18], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[19], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[18], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[19], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[18], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[19], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[18], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[19], Vc7[3]); + c0[5] -= b[32] * a[69]; + c0[6] -= b[32] * a[70]; + c0[7] -= b[32] * a[71]; + c1[5] -= b[33] * a[69]; + c1[6] -= b[33] * a[70]; + c1[7] -= b[33] * a[71]; + c2[5] -= b[34] * a[69]; + c2[6] -= b[34] * a[70]; + c2[7] -= b[34] * a[71]; + c3[5] -= b[35] * a[69]; + c3[6] -= b[35] * a[70]; + c3[7] -= b[35] * a[71]; + c4[5] -= b[36] * a[69]; + c4[6] -= b[36] * a[70]; + c4[7] -= b[36] * a[71]; + c5[5] -= b[37] * a[69]; + c5[6] -= b[37] * a[70]; + c5[7] -= b[37] * a[71]; + c6[5] -= b[38] * a[69]; + c6[6] -= b[38] * a[70]; + c6[7] -= b[38] * a[71]; + c7[5] -= b[39] * a[69]; + c7[6] -= b[39] * a[70]; + c7[7] -= b[39] * a[71]; + + b[40] = (c0[5] *= a[85]); + b[41] = (c1[5] *= a[85]); + b[42] = (c2[5] *= a[85]); + b[43] = (c3[5] *= a[85]); + b[44] = (c4[5] *= a[85]); + b[45] = (c5[5] *= a[85]); + b[46] = (c6[5] *= a[85]); + b[47] = (c7[5] *= a[85]); + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + VbS6 = vec_splat(Vb[11], 2); + VbS7 = vec_splat(Vb[11], 3); + Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[22], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[23], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[22], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[23], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[22], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[23], Vc7[3]); + c0[6] -= b[40] * a[86]; + c0[7] -= b[40] * a[87]; + c1[6] -= b[41] * a[86]; + c1[7] -= b[41] * a[87]; + c2[6] -= b[42] * a[86]; + c2[7] -= b[42] * 
a[87]; + c3[6] -= b[43] * a[86]; + c3[7] -= b[43] * a[87]; + c4[6] -= b[44] * a[86]; + c4[7] -= b[44] * a[87]; + c5[6] -= b[45] * a[86]; + c5[7] -= b[45] * a[87]; + c6[6] -= b[46] * a[86]; + c6[7] -= b[46] * a[87]; + c7[6] -= b[47] * a[86]; + c7[7] -= b[47] * a[87]; + + b[48] = (c0[6] *= a[102]); + b[49] = (c1[6] *= a[102]); + b[50] = (c2[6] *= a[102]); + b[51] = (c3[6] *= a[102]); + b[52] = (c4[6] *= a[102]); + b[53] = (c5[6] *= a[102]); + b[54] = (c6[6] *= a[102]); + b[55] = (c7[6] *= a[102]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + VbS7 = vec_splat(Vb[13], 3); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[26], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[27], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[26], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[27], Vc7[3]); + c0[7] -= b[48] * a[103]; + c1[7] -= b[49] * a[103]; + c2[7] -= b[50] * a[103]; + c3[7] -= b[51] * a[103]; + c4[7] -= b[52] * a[103]; + c5[7] -= b[53] * a[103]; + c6[7] -= b[54] * a[103]; + c7[7] -= b[55] * a[103]; + + b[56] = (c0[7] *= a[119]); + b[57] = (c1[7] *= a[119]); + b[58] = (c2[7] *= a[119]); + b[59] = (c3[7] *= a[119]); + b[60] = (c4[7] *= a[119]); + b[61] = (c5[7] *= a[119]); + b[62] = (c6[7] *= a[119]); + b[63] = (c7[7] *= a[119]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[30], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[31], Vc7[3]); + + b[64] = (c0[8] *= a[136]); + b[65] = (c1[8] *= a[136]); + b[66] = (c2[8] *= a[136]); + b[67] = (c3[8] *= a[136]); + b[68] = (c4[8] *= a[136]); + b[69] = (c5[8] *= a[136]); + b[70] = (c6[8] *= a[136]); + b[71] = (c7[8] *= a[136]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[16], 2); + VbS3 = vec_splat(Vb[16], 3); + VbS4 = vec_splat(Vb[17], 0); + VbS5 = vec_splat(Vb[17], 1); + VbS6 = vec_splat(Vb[17], 2); + VbS7 = vec_splat(Vb[17], 3); + Vc0[3] = vec_nmsub(VbS0, Va[35], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[35], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[35], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[35], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[35], 
Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[35], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[35], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[35], Vc7[3]); + c0[ 9] -= b[64] * a[137]; + c0[10] -= b[64] * a[138]; + c0[11] -= b[64] * a[139]; + c1[ 9] -= b[65] * a[137]; + c1[10] -= b[65] * a[138]; + c1[11] -= b[65] * a[139]; + c2[ 9] -= b[66] * a[137]; + c2[10] -= b[66] * a[138]; + c2[11] -= b[66] * a[139]; + c3[ 9] -= b[67] * a[137]; + c3[10] -= b[67] * a[138]; + c3[11] -= b[67] * a[139]; + c4[ 9] -= b[68] * a[137]; + c4[10] -= b[68] * a[138]; + c4[11] -= b[68] * a[139]; + c5[ 9] -= b[69] * a[137]; + c5[10] -= b[69] * a[138]; + c5[11] -= b[69] * a[139]; + c6[ 9] -= b[70] * a[137]; + c6[10] -= b[70] * a[138]; + c6[11] -= b[70] * a[139]; + c7[ 9] -= b[71] * a[137]; + c7[10] -= b[71] * a[138]; + c7[11] -= b[71] * a[139]; + + b[72] = (c0[9] *= a[153]); + b[73] = (c1[9] *= a[153]); + b[74] = (c2[9] *= a[153]); + b[75] = (c3[9] *= a[153]); + b[76] = (c4[9] *= a[153]); + b[77] = (c5[9] *= a[153]); + b[78] = (c6[9] *= a[153]); + b[79] = (c7[9] *= a[153]); + VbS0 = vec_splat(Vb[18], 0); + VbS1 = vec_splat(Vb[18], 1); + VbS2 = vec_splat(Vb[18], 2); + VbS3 = vec_splat(Vb[18], 3); + VbS4 = vec_splat(Vb[19], 0); + VbS5 = vec_splat(Vb[19], 1); + VbS6 = vec_splat(Vb[19], 2); + VbS7 = vec_splat(Vb[19], 3); + Vc0[3] = vec_nmsub(VbS0, Va[39], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[39], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[39], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[39], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[39], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[39], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[39], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[39], Vc7[3]); + c0[10] -= b[72] * a[154]; + c0[11] -= b[72] * a[155]; + c1[10] -= b[73] * a[154]; + c1[11] -= b[73] * a[155]; + c2[10] -= b[74] * a[154]; + c2[11] -= b[74] * a[155]; + c3[10] -= b[75] * a[154]; + c3[11] -= b[75] * a[155]; + c4[10] -= b[76] * a[154]; + c4[11] -= b[76] * a[155]; + c5[10] -= b[77] * a[154]; + c5[11] -= b[77] * a[155]; + c6[10] -= b[78] * a[154]; + c6[11] -= b[78] * a[155]; + c7[10] -= b[79] * a[154]; + c7[11] -= b[79] * a[155]; + + b[80] = (c0[10] *= a[170]); + b[81] = (c1[10] *= a[170]); + b[82] = (c2[10] *= a[170]); + b[83] = (c3[10] *= a[170]); + b[84] = (c4[10] *= a[170]); + b[85] = (c5[10] *= a[170]); + b[86] = (c6[10] *= a[170]); + b[87] = (c7[10] *= a[170]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[20], 2); + VbS3 = vec_splat(Vb[20], 3); + VbS4 = vec_splat(Vb[21], 0); + VbS5 = vec_splat(Vb[21], 1); + VbS6 = vec_splat(Vb[21], 2); + VbS7 = vec_splat(Vb[21], 3); + Vc0[3] = vec_nmsub(VbS0, Va[43], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[43], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[43], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[43], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[43], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[43], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[43], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[43], Vc7[3]); + c0[11] -= b[80] * a[171]; + c1[11] -= b[81] * a[171]; + c2[11] -= b[82] * a[171]; + c3[11] -= b[83] * a[171]; + c4[11] -= b[84] * a[171]; + c5[11] -= b[85] * a[171]; + c6[11] -= b[86] * a[171]; + c7[11] -= b[87] * a[171]; + + b[88] = (c0[11] *= a[187]); + b[89] = (c1[11] *= a[187]); + b[90] = (c2[11] *= a[187]); + b[91] = (c3[11] *= a[187]); + b[92] = (c4[11] *= a[187]); + b[93] = (c5[11] *= a[187]); + b[94] = (c6[11] *= a[187]); + b[95] = (c7[11] *= a[187]); + VbS0 = vec_splat(Vb[22], 0); + VbS1 = vec_splat(Vb[22], 1); + VbS2 = vec_splat(Vb[22], 2); + VbS3 = vec_splat(Vb[22], 3); + VbS4 = vec_splat(Vb[23], 
0); + VbS5 = vec_splat(Vb[23], 1); + VbS6 = vec_splat(Vb[23], 2); + VbS7 = vec_splat(Vb[23], 3); + Vc0[3] = vec_nmsub(VbS0, Va[47], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[47], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[47], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[47], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[47], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[47], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[47], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[47], Vc7[3]); + + b[ 96] = (c0[12] *= a[204]); + b[ 97] = (c1[12] *= a[204]); + b[ 98] = (c2[12] *= a[204]); + b[ 99] = (c3[12] *= a[204]); + b[100] = (c4[12] *= a[204]); + b[101] = (c5[12] *= a[204]); + b[102] = (c6[12] *= a[204]); + b[103] = (c7[12] *= a[204]); + c0[13] -= b[ 96] * a[205]; + c0[14] -= b[ 96] * a[206]; + c0[15] -= b[ 96] * a[207]; + c1[13] -= b[ 97] * a[205]; + c1[14] -= b[ 97] * a[206]; + c1[15] -= b[ 97] * a[207]; + c2[13] -= b[ 98] * a[205]; + c2[14] -= b[ 98] * a[206]; + c2[15] -= b[ 98] * a[207]; + c3[13] -= b[ 99] * a[205]; + c3[14] -= b[ 99] * a[206]; + c3[15] -= b[ 99] * a[207]; + c4[13] -= b[100] * a[205]; + c4[14] -= b[100] * a[206]; + c4[15] -= b[100] * a[207]; + c5[13] -= b[101] * a[205]; + c5[14] -= b[101] * a[206]; + c5[15] -= b[101] * a[207]; + c6[13] -= b[102] * a[205]; + c6[14] -= b[102] * a[206]; + c6[15] -= b[102] * a[207]; + c7[13] -= b[103] * a[205]; + c7[14] -= b[103] * a[206]; + c7[15] -= b[103] * a[207]; + + b[104] = (c0[13] *= a[221]); + b[105] = (c1[13] *= a[221]); + b[106] = (c2[13] *= a[221]); + b[107] = (c3[13] *= a[221]); + b[108] = (c4[13] *= a[221]); + b[109] = (c5[13] *= a[221]); + b[110] = (c6[13] *= a[221]); + b[111] = (c7[13] *= a[221]); + c0[14] -= b[104] * a[222]; + c0[15] -= b[104] * a[223]; + c1[14] -= b[105] * a[222]; + c1[15] -= b[105] * a[223]; + c2[14] -= b[106] * a[222]; + c2[15] -= b[106] * a[223]; + c3[14] -= b[107] * a[222]; + c3[15] -= b[107] * a[223]; + c4[14] -= b[108] * a[222]; + c4[15] -= b[108] * a[223]; + c5[14] -= b[109] * a[222]; + c5[15] -= b[109] * a[223]; + c6[14] -= b[110] * a[222]; + c6[15] -= b[110] * a[223]; + c7[14] -= b[111] * a[222]; + c7[15] -= b[111] * a[223]; + + b[112] = (c0[14] *= a[238]); + b[113] = (c1[14] *= a[238]); + b[114] = (c2[14] *= a[238]); + b[115] = (c3[14] *= a[238]); + b[116] = (c4[14] *= a[238]); + b[117] = (c5[14] *= a[238]); + b[118] = (c6[14] *= a[238]); + b[119] = (c7[14] *= a[238]); + c0[15] -= b[112] * a[239]; + c1[15] -= b[113] * a[239]; + c2[15] -= b[114] * a[239]; + c3[15] -= b[115] * a[239]; + c4[15] -= b[116] * a[239]; + c5[15] -= b[117] * a[239]; + c6[15] -= b[118] * a[239]; + c7[15] -= b[119] * a[239]; + + b[120] = (c0[15] *= a[255]); + b[121] = (c1[15] *= a[255]); + b[122] = (c2[15] *= a[255]); + b[123] = (c3[15] *= a[255]); + b[124] = (c4[15] *= a[255]); + b[125] = (c5[15] *= a[255]); + b[126] = (c6[15] *= a[255]); + b[127] = (c7[15] *= a[255]); +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a 
+ i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * 
COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_RN_power10.c b/kernel/power/trsm_kernel_RN_power10.c new file mode 100644 index 000000000..92c26fcc3 --- /dev/null +++ b/kernel/power/trsm_kernel_RN_power10.c @@ -0,0 +1,828 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include <altivec.h> + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6; + + a[0] = (c0[0] *= b[0]); + a[1] = (c0[1] *= b[0]); + a[2] = (c0[2] *= b[0]); + a[3] = (c0[3] *= b[0]); + a[4] = (c0[4] *= b[0]); + a[5] = (c0[5] *= b[0]); + a[6] = (c0[6] *= b[0]); + a[7] = (c0[7] *= b[0]); + VbS0 = vec_splat(Vb[0], 1); + VbS1 = vec_splat(Vb[1], 0); + VbS2 = vec_splat(Vb[1], 1); + VbS3 = vec_splat(Vb[2], 0); + VbS4 = vec_splat(Vb[2], 1); + VbS5 = vec_splat(Vb[3], 0); + VbS6 = vec_splat(Vb[3], 1); + Vc1[0] = vec_nmsub(Vc0[ 0], VbS0, Vc1[0]); + Vc1[1] = vec_nmsub(Vc0[ 1], VbS0, Vc1[1]); + Vc1[2] = vec_nmsub(Vc0[ 2], VbS0, Vc1[2]); + Vc1[3] = vec_nmsub(Vc0[ 3], VbS0, Vc1[3]); + Vc2[0] = vec_nmsub(Vc0[ 0], VbS1, Vc2[0]); + Vc2[1] = vec_nmsub(Vc0[ 1], VbS1, Vc2[1]); + Vc2[2] = vec_nmsub(Vc0[ 2], VbS1, Vc2[2]); + Vc2[3] = vec_nmsub(Vc0[ 3], VbS1, Vc2[3]); + Vc3[0] = vec_nmsub(Vc0[ 0], VbS2, Vc3[0]); + Vc3[1] = vec_nmsub(Vc0[ 1], VbS2, Vc3[1]); + Vc3[2] = vec_nmsub(Vc0[ 2], VbS2, Vc3[2]); + Vc3[3] = vec_nmsub(Vc0[ 3], VbS2, Vc3[3]); + Vc4[0] = vec_nmsub(Vc0[ 0], VbS3, Vc4[0]); + Vc4[1] = vec_nmsub(Vc0[ 1], VbS3, Vc4[1]); + Vc4[2] = vec_nmsub(Vc0[ 2], VbS3, Vc4[2]); + Vc4[3] = vec_nmsub(Vc0[ 3], VbS3, Vc4[3]); + Vc5[0] = vec_nmsub(Vc0[ 0], VbS4, Vc5[0]); + Vc5[1] = vec_nmsub(Vc0[ 1], VbS4, Vc5[1]); + Vc5[2] = vec_nmsub(Vc0[ 2], VbS4, Vc5[2]); + Vc5[3] = vec_nmsub(Vc0[ 3], VbS4, Vc5[3]); + Vc6[0] = vec_nmsub(Vc0[ 0], VbS5, Vc6[0]); + Vc6[1] = vec_nmsub(Vc0[ 1], VbS5, Vc6[1]); + Vc6[2] = vec_nmsub(Vc0[ 2], VbS5, Vc6[2]); + Vc6[3] = vec_nmsub(Vc0[ 3], VbS5, Vc6[3]); + Vc7[0] = vec_nmsub(Vc0[ 0], VbS6, Vc7[0]); + Vc7[1] = vec_nmsub(Vc0[ 1], VbS6, Vc7[1]); + Vc7[2] = vec_nmsub(Vc0[ 2], VbS6, Vc7[2]); + Vc7[3] = vec_nmsub(Vc0[ 3], VbS6, Vc7[3]); + + a[ 8] = (c1[0] *= b[9]); + a[ 9] = (c1[1] *= 
b[9]); + a[10] = (c1[2] *= b[9]); + a[11] = (c1[3] *= b[9]); + a[12] = (c1[4] *= b[9]); + a[13] = (c1[5] *= b[9]); + a[14] = (c1[6] *= b[9]); + a[15] = (c1[7] *= b[9]); + VbS0 = vec_splat(Vb[5], 0); + VbS1 = vec_splat(Vb[5], 1); + VbS2 = vec_splat(Vb[6], 0); + VbS3 = vec_splat(Vb[6], 1); + VbS4 = vec_splat(Vb[7], 0); + VbS5 = vec_splat(Vb[7], 1); + Vc2[0] = vec_nmsub(Vc1[0], VbS0, Vc2[0]); + Vc2[1] = vec_nmsub(Vc1[1], VbS0, Vc2[1]); + Vc2[2] = vec_nmsub(Vc1[2], VbS0, Vc2[2]); + Vc2[3] = vec_nmsub(Vc1[3], VbS0, Vc2[3]); + Vc3[0] = vec_nmsub(Vc1[0], VbS1, Vc3[0]); + Vc3[1] = vec_nmsub(Vc1[1], VbS1, Vc3[1]); + Vc3[2] = vec_nmsub(Vc1[2], VbS1, Vc3[2]); + Vc3[3] = vec_nmsub(Vc1[3], VbS1, Vc3[3]); + Vc4[0] = vec_nmsub(Vc1[0], VbS2, Vc4[0]); + Vc4[1] = vec_nmsub(Vc1[1], VbS2, Vc4[1]); + Vc4[2] = vec_nmsub(Vc1[2], VbS2, Vc4[2]); + Vc4[3] = vec_nmsub(Vc1[3], VbS2, Vc4[3]); + Vc5[0] = vec_nmsub(Vc1[0], VbS3, Vc5[0]); + Vc5[1] = vec_nmsub(Vc1[1], VbS3, Vc5[1]); + Vc5[2] = vec_nmsub(Vc1[2], VbS3, Vc5[2]); + Vc5[3] = vec_nmsub(Vc1[3], VbS3, Vc5[3]); + Vc6[0] = vec_nmsub(Vc1[0], VbS4, Vc6[0]); + Vc6[1] = vec_nmsub(Vc1[1], VbS4, Vc6[1]); + Vc6[2] = vec_nmsub(Vc1[2], VbS4, Vc6[2]); + Vc6[3] = vec_nmsub(Vc1[3], VbS4, Vc6[3]); + Vc7[0] = vec_nmsub(Vc1[0], VbS5, Vc7[0]); + Vc7[1] = vec_nmsub(Vc1[1], VbS5, Vc7[1]); + Vc7[2] = vec_nmsub(Vc1[2], VbS5, Vc7[2]); + Vc7[3] = vec_nmsub(Vc1[3], VbS5, Vc7[3]); + + a[16] = (c2[0] *= b[18]); + a[17] = (c2[1] *= b[18]); + a[18] = (c2[2] *= b[18]); + a[19] = (c2[3] *= b[18]); + a[20] = (c2[4] *= b[18]); + a[21] = (c2[5] *= b[18]); + a[22] = (c2[6] *= b[18]); + a[23] = (c2[7] *= b[18]); + VbS0 = vec_splat(Vb[ 9], 1); + VbS1 = vec_splat(Vb[10], 0); + VbS2 = vec_splat(Vb[10], 1); + VbS3 = vec_splat(Vb[11], 0); + VbS4 = vec_splat(Vb[11], 1); + Vc3[0] = vec_nmsub(Vc2[0], VbS0, Vc3[0]); + Vc3[1] = vec_nmsub(Vc2[1], VbS0, Vc3[1]); + Vc3[2] = vec_nmsub(Vc2[2], VbS0, Vc3[2]); + Vc3[3] = vec_nmsub(Vc2[3], VbS0, Vc3[3]); + Vc4[0] = vec_nmsub(Vc2[0], VbS1, Vc4[0]); + Vc4[1] = vec_nmsub(Vc2[1], VbS1, Vc4[1]); + Vc4[2] = vec_nmsub(Vc2[2], VbS1, Vc4[2]); + Vc4[3] = vec_nmsub(Vc2[3], VbS1, Vc4[3]); + Vc5[0] = vec_nmsub(Vc2[0], VbS2, Vc5[0]); + Vc5[1] = vec_nmsub(Vc2[1], VbS2, Vc5[1]); + Vc5[2] = vec_nmsub(Vc2[2], VbS2, Vc5[2]); + Vc5[3] = vec_nmsub(Vc2[3], VbS2, Vc5[3]); + Vc6[0] = vec_nmsub(Vc2[0], VbS3, Vc6[0]); + Vc6[1] = vec_nmsub(Vc2[1], VbS3, Vc6[1]); + Vc6[2] = vec_nmsub(Vc2[2], VbS3, Vc6[2]); + Vc6[3] = vec_nmsub(Vc2[3], VbS3, Vc6[3]); + Vc7[0] = vec_nmsub(Vc2[0], VbS4, Vc7[0]); + Vc7[1] = vec_nmsub(Vc2[1], VbS4, Vc7[1]); + Vc7[2] = vec_nmsub(Vc2[2], VbS4, Vc7[2]); + Vc7[3] = vec_nmsub(Vc2[3], VbS4, Vc7[3]); + + a[24] = (c3[0] *= b[27]); + a[25] = (c3[1] *= b[27]); + a[26] = (c3[2] *= b[27]); + a[27] = (c3[3] *= b[27]); + a[28] = (c3[4] *= b[27]); + a[29] = (c3[5] *= b[27]); + a[30] = (c3[6] *= b[27]); + a[31] = (c3[7] *= b[27]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[15], 0); + VbS3 = vec_splat(Vb[15], 1); + Vc4[0] = vec_nmsub(Vc3[0], VbS0, Vc4[0]); + Vc4[1] = vec_nmsub(Vc3[1], VbS0, Vc4[1]); + Vc4[2] = vec_nmsub(Vc3[2], VbS0, Vc4[2]); + Vc4[3] = vec_nmsub(Vc3[3], VbS0, Vc4[3]); + Vc5[0] = vec_nmsub(Vc3[0], VbS1, Vc5[0]); + Vc5[1] = vec_nmsub(Vc3[1], VbS1, Vc5[1]); + Vc5[2] = vec_nmsub(Vc3[2], VbS1, Vc5[2]); + Vc5[3] = vec_nmsub(Vc3[3], VbS1, Vc5[3]); + Vc6[0] = vec_nmsub(Vc3[0], VbS2, Vc6[0]); + Vc6[1] = vec_nmsub(Vc3[1], VbS2, Vc6[1]); + Vc6[2] = vec_nmsub(Vc3[2], VbS2, Vc6[2]); + Vc6[3] = vec_nmsub(Vc3[3], VbS2, Vc6[3]); + Vc7[0] = 
vec_nmsub(Vc3[0], VbS3, Vc7[0]); + Vc7[1] = vec_nmsub(Vc3[1], VbS3, Vc7[1]); + Vc7[2] = vec_nmsub(Vc3[2], VbS3, Vc7[2]); + Vc7[3] = vec_nmsub(Vc3[3], VbS3, Vc7[3]); + + a[32] = (c4[0] *= b[36]); + a[33] = (c4[1] *= b[36]); + a[34] = (c4[2] *= b[36]); + a[35] = (c4[3] *= b[36]); + a[36] = (c4[4] *= b[36]); + a[37] = (c4[5] *= b[36]); + a[38] = (c4[6] *= b[36]); + a[39] = (c4[7] *= b[36]); + VbS0 = vec_splat(Vb[18], 1); + VbS1 = vec_splat(Vb[19], 0); + VbS2 = vec_splat(Vb[19], 1); + Vc5[0] = vec_nmsub(Vc4[0], VbS0, Vc5[0]); + Vc5[1] = vec_nmsub(Vc4[1], VbS0, Vc5[1]); + Vc5[2] = vec_nmsub(Vc4[2], VbS0, Vc5[2]); + Vc5[3] = vec_nmsub(Vc4[3], VbS0, Vc5[3]); + Vc6[0] = vec_nmsub(Vc4[0], VbS1, Vc6[0]); + Vc6[1] = vec_nmsub(Vc4[1], VbS1, Vc6[1]); + Vc6[2] = vec_nmsub(Vc4[2], VbS1, Vc6[2]); + Vc6[3] = vec_nmsub(Vc4[3], VbS1, Vc6[3]); + Vc7[0] = vec_nmsub(Vc4[0], VbS2, Vc7[0]); + Vc7[1] = vec_nmsub(Vc4[1], VbS2, Vc7[1]); + Vc7[2] = vec_nmsub(Vc4[2], VbS2, Vc7[2]); + Vc7[3] = vec_nmsub(Vc4[3], VbS2, Vc7[3]); + + a[40] = (c5[0] *= b[45]); + a[41] = (c5[1] *= b[45]); + a[42] = (c5[2] *= b[45]); + a[43] = (c5[3] *= b[45]); + a[44] = (c5[4] *= b[45]); + a[45] = (c5[5] *= b[45]); + a[46] = (c5[6] *= b[45]); + a[47] = (c5[7] *= b[45]); + VbS0 = vec_splat(Vb[23], 0); + VbS1 = vec_splat(Vb[23], 1); + Vc6[0] = vec_nmsub(Vc5[0], VbS0, Vc6[0]); + Vc6[1] = vec_nmsub(Vc5[1], VbS0, Vc6[1]); + Vc6[2] = vec_nmsub(Vc5[2], VbS0, Vc6[2]); + Vc6[3] = vec_nmsub(Vc5[3], VbS0, Vc6[3]); + Vc7[0] = vec_nmsub(Vc5[0], VbS1, Vc7[0]); + Vc7[1] = vec_nmsub(Vc5[1], VbS1, Vc7[1]); + Vc7[2] = vec_nmsub(Vc5[2], VbS1, Vc7[2]); + Vc7[3] = vec_nmsub(Vc5[3], VbS1, Vc7[3]); + + a[48] = (c6[0] *= b[54]); + a[49] = (c6[1] *= b[54]); + a[50] = (c6[2] *= b[54]); + a[51] = (c6[3] *= b[54]); + a[52] = (c6[4] *= b[54]); + a[53] = (c6[5] *= b[54]); + a[54] = (c6[6] *= b[54]); + a[55] = (c6[7] *= b[54]); + VbS0 = vec_splat(Vb[27], 1); + Vc7[0] = vec_nmsub(Vc6[0], VbS0, Vc7[0]); + Vc7[1] = vec_nmsub(Vc6[1], VbS0, Vc7[1]); + Vc7[2] = vec_nmsub(Vc6[2], VbS0, Vc7[2]); + Vc7[3] = vec_nmsub(Vc6[3], VbS0, Vc7[3]); + + a[56] = (c7[0] *= b[63]); + a[57] = (c7[1] *= b[63]); + a[58] = (c7[2] *= b[63]); + a[59] = (c7[3] *= b[63]); + a[60] = (c7[4] *= b[63]); + a[61] = (c7[5] *= b[63]); + a[62] = (c7[6] *= b[63]); + a[63] = (c7[7] *= b[63]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[0], 2); + VbS3 = vec_splat(Vb[0], 3); + VbS4 = vec_splat(Vb[1], 0); + VbS5 = vec_splat(Vb[1], 1); + VbS6 = vec_splat(Vb[1], 2); + VbS7 = vec_splat(Vb[1], 3); + + Vc0[ 0] = vec_mul(VbS0, Vc0[ 0]); + Vc0[ 1] = vec_mul(VbS0, Vc0[ 1]); + Vc0[ 2] = vec_mul(VbS0, Vc0[ 2]); + Vc0[ 3] = vec_mul(VbS0, Vc0[ 3]); + Va[0] = Vc0[0]; + Va[1] = Vc0[1]; + Va[2] = 
Vc0[2]; + Va[3] = Vc0[3]; + Vc1[0] = vec_nmsub(VbS1, Va[0], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[0], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[0], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[0], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[0], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[0] = vec_nmsub(VbS6, Va[0], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[0] = vec_nmsub(VbS7, Va[0], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + + VbS0 = vec_splat(Vb[2], 1); + VbS1 = vec_splat(Vb[2], 2); + VbS2 = vec_splat(Vb[2], 3); + VbS3 = vec_splat(Vb[3], 0); + VbS4 = vec_splat(Vb[3], 1); + VbS5 = vec_splat(Vb[3], 2); + VbS6 = vec_splat(Vb[3], 3); + + Vc1[0] = vec_mul(VbS0, Vc1[0]); + Vc1[1] = vec_mul(VbS0, Vc1[1]); + Vc1[2] = vec_mul(VbS0, Vc1[2]); + Vc1[3] = vec_mul(VbS0, Vc1[3]); + Va[4] = Vc1[0]; + Va[5] = Vc1[1]; + Va[6] = Vc1[2]; + Va[7] = Vc1[3]; + Vc2[0] = vec_nmsub(VbS1, Va[4], Vc2[0]); + Vc2[1] = vec_nmsub(VbS1, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS1, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS1, Va[7], Vc2[3]); + Vc3[0] = vec_nmsub(VbS2, Va[4], Vc3[0]); + Vc3[1] = vec_nmsub(VbS2, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS2, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS2, Va[7], Vc3[3]); + Vc4[0] = vec_nmsub(VbS3, Va[4], Vc4[0]); + Vc4[1] = vec_nmsub(VbS3, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS3, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS3, Va[7], Vc4[3]); + Vc5[0] = vec_nmsub(VbS4, Va[4], Vc5[0]); + Vc5[1] = vec_nmsub(VbS4, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS4, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS4, Va[7], Vc5[3]); + Vc6[0] = vec_nmsub(VbS5, Va[4], Vc6[0]); + Vc6[1] = vec_nmsub(VbS5, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS5, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS5, Va[7], Vc6[3]); + Vc7[0] = vec_nmsub(VbS6, Va[4], Vc7[0]); + Vc7[1] = vec_nmsub(VbS6, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS6, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS6, Va[7], Vc7[3]); + + VbS0 = vec_splat(Vb[4], 2); + VbS1 = vec_splat(Vb[4], 3); + VbS2 = vec_splat(Vb[5], 0); + VbS3 = vec_splat(Vb[5], 1); + VbS4 = vec_splat(Vb[5], 2); + VbS5 = vec_splat(Vb[5], 3); + + Vc2[0] = vec_mul(VbS0, Vc2[0]); + Vc2[1] = vec_mul(VbS0, Vc2[1]); + Vc2[2] = vec_mul(VbS0, Vc2[2]); + Vc2[3] = vec_mul(VbS0, Vc2[3]); + Va[ 8] = Vc2[0]; + Va[ 9] = Vc2[1]; + Va[10] = Vc2[2]; + Va[11] = Vc2[3]; + Vc3[0] = vec_nmsub(VbS1, Va[ 8], Vc3[0]); + Vc3[1] = vec_nmsub(VbS1, Va[ 9], Vc3[1]); + Vc3[2] = vec_nmsub(VbS1, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS1, Va[11], Vc3[3]); + Vc4[0] = vec_nmsub(VbS2, Va[ 8], Vc4[0]); + Vc4[1] = vec_nmsub(VbS2, Va[ 9], Vc4[1]); + Vc4[2] = vec_nmsub(VbS2, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS2, Va[11], Vc4[3]); + Vc5[0] = vec_nmsub(VbS3, Va[ 8], Vc5[0]); + Vc5[1] = vec_nmsub(VbS3, Va[ 9], Vc5[1]); + Vc5[2] = vec_nmsub(VbS3, Va[10], 
Vc5[2]); + Vc5[3] = vec_nmsub(VbS3, Va[11], Vc5[3]); + Vc6[0] = vec_nmsub(VbS4, Va[ 8], Vc6[0]); + Vc6[1] = vec_nmsub(VbS4, Va[ 9], Vc6[1]); + Vc6[2] = vec_nmsub(VbS4, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS4, Va[11], Vc6[3]); + Vc7[0] = vec_nmsub(VbS5, Va[ 8], Vc7[0]); + Vc7[1] = vec_nmsub(VbS5, Va[ 9], Vc7[1]); + Vc7[2] = vec_nmsub(VbS5, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS5, Va[11], Vc7[3]); + + VbS0 = vec_splat(Vb[6], 3); + VbS1 = vec_splat(Vb[7], 0); + VbS2 = vec_splat(Vb[7], 1); + VbS3 = vec_splat(Vb[7], 2); + VbS4 = vec_splat(Vb[7], 3); + + Vc3[0] = vec_mul(VbS0, Vc3[0]); + Vc3[1] = vec_mul(VbS0, Vc3[1]); + Vc3[2] = vec_mul(VbS0, Vc3[2]); + Vc3[3] = vec_mul(VbS0, Vc3[3]); + Va[12] = Vc3[0]; + Va[13] = Vc3[1]; + Va[14] = Vc3[2]; + Va[15] = Vc3[3]; + Vc4[0] = vec_nmsub(VbS1, Va[12], Vc4[0]); + Vc4[1] = vec_nmsub(VbS1, Va[13], Vc4[1]); + Vc4[2] = vec_nmsub(VbS1, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS1, Va[15], Vc4[3]); + Vc5[0] = vec_nmsub(VbS2, Va[12], Vc5[0]); + Vc5[1] = vec_nmsub(VbS2, Va[13], Vc5[1]); + Vc5[2] = vec_nmsub(VbS2, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS2, Va[15], Vc5[3]); + Vc6[0] = vec_nmsub(VbS3, Va[12], Vc6[0]); + Vc6[1] = vec_nmsub(VbS3, Va[13], Vc6[1]); + Vc6[2] = vec_nmsub(VbS3, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS3, Va[15], Vc6[3]); + Vc7[0] = vec_nmsub(VbS4, Va[12], Vc7[0]); + Vc7[1] = vec_nmsub(VbS4, Va[13], Vc7[1]); + Vc7[2] = vec_nmsub(VbS4, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS4, Va[15], Vc7[3]); + + VbS0 = vec_splat(Vb[9], 0); + VbS1 = vec_splat(Vb[9], 1); + VbS2 = vec_splat(Vb[9], 2); + VbS3 = vec_splat(Vb[9], 3); + + Vc4[0] = vec_mul(VbS0, Vc4[0]); + Vc4[1] = vec_mul(VbS0, Vc4[1]); + Vc4[2] = vec_mul(VbS0, Vc4[2]); + Vc4[3] = vec_mul(VbS0, Vc4[3]); + Va[16] = Vc4[0]; + Va[17] = Vc4[1]; + Va[18] = Vc4[2]; + Va[19] = Vc4[3]; + Vc5[0] = vec_nmsub(VbS1, Va[16], Vc5[0]); + Vc5[1] = vec_nmsub(VbS1, Va[17], Vc5[1]); + Vc5[2] = vec_nmsub(VbS1, Va[18], Vc5[2]); + Vc5[3] = vec_nmsub(VbS1, Va[19], Vc5[3]); + Vc6[0] = vec_nmsub(VbS2, Va[16], Vc6[0]); + Vc6[1] = vec_nmsub(VbS2, Va[17], Vc6[1]); + Vc6[2] = vec_nmsub(VbS2, Va[18], Vc6[2]); + Vc6[3] = vec_nmsub(VbS2, Va[19], Vc6[3]); + Vc7[0] = vec_nmsub(VbS3, Va[16], Vc7[0]); + Vc7[1] = vec_nmsub(VbS3, Va[17], Vc7[1]); + Vc7[2] = vec_nmsub(VbS3, Va[18], Vc7[2]); + Vc7[3] = vec_nmsub(VbS3, Va[19], Vc7[3]); + + VbS0 = vec_splat(Vb[11], 1); + VbS1 = vec_splat(Vb[11], 2); + VbS2 = vec_splat(Vb[11], 3); + + Vc5[0] = vec_mul(VbS0, Vc5[0]); + Vc5[1] = vec_mul(VbS0, Vc5[1]); + Vc5[2] = vec_mul(VbS0, Vc5[2]); + Vc5[3] = vec_mul(VbS0, Vc5[3]); + Va[20] = Vc5[0]; + Va[21] = Vc5[1]; + Va[22] = Vc5[2]; + Va[23] = Vc5[3]; + Vc6[0] = vec_nmsub(VbS1, Va[20], Vc6[0]); + Vc6[1] = vec_nmsub(VbS1, Va[21], Vc6[1]); + Vc6[2] = vec_nmsub(VbS1, Va[22], Vc6[2]); + Vc6[3] = vec_nmsub(VbS1, Va[23], Vc6[3]); + Vc7[0] = vec_nmsub(VbS2, Va[20], Vc7[0]); + Vc7[1] = vec_nmsub(VbS2, Va[21], Vc7[1]); + Vc7[2] = vec_nmsub(VbS2, Va[22], Vc7[2]); + Vc7[3] = vec_nmsub(VbS2, Va[23], Vc7[3]); + + VbS0 = vec_splat(Vb[13], 2); + VbS1 = vec_splat(Vb[13], 3); + + Vc6[0] = vec_mul(VbS0, Vc6[0]); + Vc6[1] = vec_mul(VbS0, Vc6[1]); + Vc6[2] = vec_mul(VbS0, Vc6[2]); + Vc6[3] = vec_mul(VbS0, Vc6[3]); + Va[24] = Vc6[0]; + Va[25] = Vc6[1]; + Va[26] = Vc6[2]; + Va[27] = Vc6[3]; + Vc7[0] = vec_nmsub(VbS1, Va[24], Vc7[0]); + Vc7[1] = vec_nmsub(VbS1, Va[25], Vc7[1]); + Vc7[2] = vec_nmsub(VbS1, Va[26], Vc7[2]); + Vc7[3] = vec_nmsub(VbS1, Va[27], Vc7[3]); + + VbS0 = vec_splat(Vb[15], 3); + + Vc7[0] = vec_mul(VbS0, Vc7[0]); + Vc7[1] = vec_mul(VbS0, 
Vc7[1]); + Vc7[2] = vec_mul(VbS0, Vc7[2]); + Vc7[3] = vec_mul(VbS0, Vc7[3]); + Va[28] = Vc7[0]; + Va[29] = Vc7[1]; + Va[30] = Vc7[2]; + Va[31] = Vc7[3]; +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + while (j > 0) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + if (i > 0) { + do { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; 
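+ /* kk now counts the rows of B already solved; on the next pass the GEMM_KERNEL call at the top of the loop subtracts their contribution before solve() runs on the new panel */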
+ j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_RT_power10.c b/kernel/power/trsm_kernel_RT_power10.c new file mode 100644 index 000000000..529590f37 --- /dev/null +++ b/kernel/power/trsm_kernel_RT_power10.c @@ -0,0 +1,855 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6; + + a[56] = (c7[0] *= b[63]); + a[57] = (c7[1] *= b[63]); + a[58] = (c7[2] *= b[63]); + a[59] = (c7[3] *= b[63]); + a[60] = (c7[4] *= b[63]); + a[61] = (c7[5] *= b[63]); + a[62] = (c7[6] *= b[63]); + a[63] = (c7[7] *= b[63]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[29], 0); + VbS3 = vec_splat(Vb[29], 1); + VbS4 = vec_splat(Vb[30], 0); + VbS5 = vec_splat(Vb[30], 1); + VbS6 = vec_splat(Vb[31], 0); + Vc0[0] = vec_nmsub(Vc7[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc7[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc7[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc7[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc7[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc7[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc7[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc7[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc7[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc7[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc7[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc7[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc7[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc7[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc7[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc7[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc7[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc7[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc7[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc7[3], VbS4, Vc4[3]); + Vc5[0] = vec_nmsub(Vc7[0], VbS5, Vc5[0]); + Vc5[1] = vec_nmsub(Vc7[1], VbS5, Vc5[1]); + Vc5[2] = vec_nmsub(Vc7[2], VbS5, Vc5[2]); + Vc5[3] = vec_nmsub(Vc7[3], VbS5, Vc5[3]); + Vc6[0] = vec_nmsub(Vc7[0], VbS6, Vc6[0]); + Vc6[1] = vec_nmsub(Vc7[1], VbS6, Vc6[1]); + Vc6[2] = vec_nmsub(Vc7[2], VbS6, Vc6[2]); + Vc6[3] = vec_nmsub(Vc7[3], VbS6, Vc6[3]); + + a[48] = (c6[0] *= b[54]); + a[49] = (c6[1] *= b[54]); 
+ a[50] = (c6[2] *= b[54]); + a[51] = (c6[3] *= b[54]); + a[52] = (c6[4] *= b[54]); + a[53] = (c6[5] *= b[54]); + a[54] = (c6[6] *= b[54]); + a[55] = (c6[7] *= b[54]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[25], 0); + VbS3 = vec_splat(Vb[25], 1); + VbS4 = vec_splat(Vb[26], 0); + VbS5 = vec_splat(Vb[26], 1); + Vc0[0] = vec_nmsub(Vc6[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc6[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc6[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc6[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc6[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc6[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc6[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc6[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc6[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc6[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc6[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc6[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc6[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc6[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc6[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc6[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc6[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc6[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc6[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc6[3], VbS4, Vc4[3]); + Vc5[0] = vec_nmsub(Vc6[0], VbS5, Vc5[0]); + Vc5[1] = vec_nmsub(Vc6[1], VbS5, Vc5[1]); + Vc5[2] = vec_nmsub(Vc6[2], VbS5, Vc5[2]); + Vc5[3] = vec_nmsub(Vc6[3], VbS5, Vc5[3]); + + a[40] = (c5[0] *= b[45]); + a[41] = (c5[1] *= b[45]); + a[42] = (c5[2] *= b[45]); + a[43] = (c5[3] *= b[45]); + a[44] = (c5[4] *= b[45]); + a[45] = (c5[5] *= b[45]); + a[46] = (c5[6] *= b[45]); + a[47] = (c5[7] *= b[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + Vc0[0] = vec_nmsub(Vc5[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc5[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc5[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc5[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc5[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc5[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc5[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc5[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc5[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc5[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc5[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc5[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc5[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc5[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc5[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc5[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc5[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc5[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc5[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc5[3], VbS4, Vc4[3]); + + a[32] = (c4[0] *= b[36]); + a[33] = (c4[1] *= b[36]); + a[34] = (c4[2] *= b[36]); + a[35] = (c4[3] *= b[36]); + a[36] = (c4[4] *= b[36]); + a[37] = (c4[5] *= b[36]); + a[38] = (c4[6] *= b[36]); + a[39] = (c4[7] *= b[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + Vc0[0] = vec_nmsub(Vc4[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc4[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc4[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc4[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc4[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc4[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc4[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc4[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc4[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc4[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc4[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc4[3], VbS2, Vc2[3]); + 
Vc3[0] = vec_nmsub(Vc4[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc4[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc4[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc4[3], VbS3, Vc3[3]); + + a[24] = (c3[0] *= b[27]); + a[25] = (c3[1] *= b[27]); + a[26] = (c3[2] *= b[27]); + a[27] = (c3[3] *= b[27]); + a[28] = (c3[4] *= b[27]); + a[29] = (c3[5] *= b[27]); + a[30] = (c3[6] *= b[27]); + a[31] = (c3[7] *= b[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + Vc0[0] = vec_nmsub(Vc3[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc3[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc3[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc3[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc3[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc3[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc3[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc3[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc3[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc3[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc3[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc3[3], VbS2, Vc2[3]); + + a[16] = (c2[0] *= b[18]); + a[17] = (c2[1] *= b[18]); + a[18] = (c2[2] *= b[18]); + a[19] = (c2[3] *= b[18]); + a[20] = (c2[4] *= b[18]); + a[21] = (c2[5] *= b[18]); + a[22] = (c2[6] *= b[18]); + a[23] = (c2[7] *= b[18]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + Vc0[0] = vec_nmsub(Vc2[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc2[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc2[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc2[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc2[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc2[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc2[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc2[3], VbS1, Vc1[3]); + + a[ 8] = (c1[0] *= b[9]); + a[ 9] = (c1[1] *= b[9]); + a[10] = (c1[2] *= b[9]); + a[11] = (c1[3] *= b[9]); + a[12] = (c1[4] *= b[9]); + a[13] = (c1[5] *= b[9]); + a[14] = (c1[6] *= b[9]); + a[15] = (c1[7] *= b[9]); + VbS0 = vec_splat(Vb[4], 0); + Vc0[0] = vec_nmsub(Vc1[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc1[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc1[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc1[3], VbS0, Vc0[3]); + + a[0] = (c0[0] *= b[0]); + a[1] = (c0[1] *= b[0]); + a[2] = (c0[2] *= b[0]); + a[3] = (c0[3] *= b[0]); + a[4] = (c0[4] *= b[0]); + a[5] = (c0[5] *= b[0]); + a[6] = (c0[6] *= b[0]); + a[7] = (c0[7] *= b[0]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + + Vc7[0] = vec_mul(VbS7, Vc7[0]); + Vc7[1] = vec_mul(VbS7, Vc7[1]); + Vc7[2] = vec_mul(VbS7, Vc7[2]); + Vc7[3] = vec_mul(VbS7, Vc7[3]); + Va[28] = Vc7[0]; + Va[29] = Vc7[1]; + Va[30] = Vc7[2]; + Va[31] = 
Vc7[3]; + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); + + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + + Vc6[0] = vec_mul(VbS6, Vc6[0]); + Vc6[1] = vec_mul(VbS6, Vc6[1]); + Vc6[2] = vec_mul(VbS6, Vc6[2]); + Vc6[3] = vec_mul(VbS6, Vc6[3]); + Va[24] = Vc6[0]; + Va[25] = Vc6[1]; + Va[26] = Vc6[2]; + Va[27] = Vc6[3]; + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); + + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + + Vc5[0] = vec_mul(VbS5, Vc5[0]); + Vc5[1] = vec_mul(VbS5, Vc5[1]); + Vc5[2] = vec_mul(VbS5, Vc5[2]); + Vc5[3] = vec_mul(VbS5, Vc5[3]); + Va[20] = Vc5[0]; + Va[21] = Vc5[1]; + Va[22] = Vc5[2]; + Va[23] = Vc5[3]; + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[21], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, 
Va[21], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + + Vc4[0] = vec_mul(VbS4, Vc4[0]); + Vc4[1] = vec_mul(VbS4, Vc4[1]); + Vc4[2] = vec_mul(VbS4, Vc4[2]); + Vc4[3] = vec_mul(VbS4, Vc4[3]); + Va[16] = Vc4[0]; + Va[17] = Vc4[1]; + Va[18] = Vc4[2]; + Va[19] = Vc4[3]; + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + + VbS0 = vec_splat(Vb[6], 0); + VbS1 = vec_splat(Vb[6], 1); + VbS2 = vec_splat(Vb[6], 2); + VbS3 = vec_splat(Vb[6], 3); + + Vc3[0] = vec_mul(VbS3, Vc3[0]); + Vc3[1] = vec_mul(VbS3, Vc3[1]); + Vc3[2] = vec_mul(VbS3, Vc3[2]); + Vc3[3] = vec_mul(VbS3, Vc3[3]); + Va[12] = Vc3[0]; + Va[13] = Vc3[1]; + Va[14] = Vc3[2]; + Va[15] = Vc3[3]; + Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[4], 2); + + Vc2[0] = vec_mul(VbS2, Vc2[0]); + Vc2[1] = vec_mul(VbS2, Vc2[1]); + Vc2[2] = vec_mul(VbS2, Vc2[2]); + Vc2[3] = vec_mul(VbS2, Vc2[3]); + Va[ 8] = Vc2[0]; + Va[ 9] = Vc2[1]; + Va[10] = Vc2[2]; + Va[11] = Vc2[3]; + Vc0[0] = vec_nmsub(VbS0, Va[ 8], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[ 8], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + + VbS0 = vec_splat(Vb[2], 0); + VbS1 = vec_splat(Vb[2], 1); + + Vc1[0] = vec_mul(VbS1, Vc1[0]); + Vc1[1] = vec_mul(VbS1, Vc1[1]); + Vc1[2] = vec_mul(VbS1, Vc1[2]); + Vc1[3] = vec_mul(VbS1, Vc1[3]); + Va[4] = Vc1[0]; + Va[5] = Vc1[1]; + Va[6] = Vc1[2]; + Va[7] = Vc1[3]; + Vc0[0] = vec_nmsub(VbS0, Va[4], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + + VbS0 = vec_splat(Vb[0], 0); + + Vc0[0] = vec_mul(VbS0, 
Vc0[0]); + Vc0[1] = vec_mul(VbS0, Vc0[1]); + Vc0[2] = vec_mul(VbS0, Vc0[2]); + Vc0[3] = vec_mul(VbS0, Vc0[3]); + Va[0] = Vc0[0]; + Va[1] = Vc0[1]; + Va[2] = Vc0[2]; + Va[3] = Vc0[3]; +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + i >>= 1; + } while (i > 
0); + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } while (i > 0); + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + From 47b639cc9b4ff900f7b83751af9d1c4ff9dea3c1 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Mon, 7 Dec 2020 10:04:00 +0800 Subject: [PATCH 007/681] Fix failed sswap and dswap case by using msa optimization The swap test case calls the sswap_msa.c and dswap_msa.c files in the MIPS environment. When inc_x or inc_y is equal to zero, the calculation result of the two functions is wrong. This patch adds handling for inc_x or inc_y equal to zero, and the swap test case now passes.
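Stripped of the MSA-vectorized paths, the fallback this patch adds amounts to the scalar logic below. This is a standalone sketch: swap_scalar is an illustrative stand-in for the kernel's CNAME entry point, with double standing in for FLOAT.

/* Scalar swap used when inc_x or inc_y is zero.  If both increments
 * are equal (hence both zero), repeated exchanges of the same two
 * elements cancel pairwise, so only an odd n needs one real swap;
 * otherwise each index advances by its own increment. */
static void swap_scalar(long n, double *x, long inc_x, double *y, long inc_y)
{
    long i = 0, ix = 0, iy = 0;
    double tmp;

    if (inc_x == inc_y) {
        if (n & 1) {
            tmp = *x;
            *x  = *y;
            *y  = tmp;
        }
        return;
    }

    while (i < n) {
        tmp   = x[ix];
        x[ix] = y[iy];
        y[iy] = tmp;
        ix += inc_x;
        iy += inc_y;
        i++;
    }
}

When both increments are zero, the reference semantics degenerate to swapping x[0] and y[0] n times, so an even n is a no-op and an odd n is a single exchange; that is exactly the n & 1 test the patch introduces.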
--- kernel/mips/dswap_msa.c | 30 ++++++++++++++++++++++++++++-- kernel/mips/sswap_msa.c | 29 ++++++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/kernel/mips/dswap_msa.c b/kernel/mips/dswap_msa.c index 7b1f02477..67e97f710 100644 --- a/kernel/mips/dswap_msa.c +++ b/kernel/mips/dswap_msa.c @@ -184,7 +184,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - else + else if ((inc_x != 0) && (inc_y != 0)) { for (i = (n >> 3); i--;) { @@ -248,6 +248,32 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - + else + { + if (inc_x == inc_y) + { + if (n & 1) + { + x0 = *srcx; + *srcx = *srcy; + *srcy = x0; + } + else + return (0); + } + else + { + BLASLONG ix = 0, iy = 0; + while (i < n) + { + x0 = srcx[ix]; + srcx[ix] = srcy[iy]; + srcy[iy] = x0; + ix += inc_x; + iy += inc_y; + i++; + } + } + } return (0); } diff --git a/kernel/mips/sswap_msa.c b/kernel/mips/sswap_msa.c index 46fa8aa87..d412285b0 100644 --- a/kernel/mips/sswap_msa.c +++ b/kernel/mips/sswap_msa.c @@ -198,7 +198,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - else + else if ((inc_x != 0) && (inc_y != 0)) { for (i = (n >> 3); i--;) { @@ -262,6 +262,33 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } + else + { + if (inc_x == inc_y) + { + if (n & 1) + { + x0 = *srcx; + *srcx = *srcy; + *srcy = x0; + } + else + return (0); + } + else + { + BLASLONG ix = 0, iy = 0; + while (i < n) + { + x0 = srcx[ix]; + srcx[ix] = srcy[iy]; + srcy[iy] = x0; + ix += inc_x; + iy += inc_y; + i++; + } + } + } return (0); } From ad38bd0e89c4507476f1ad4ba566d27bb0dd6f9d Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Mon, 7 Dec 2020 10:18:51 +0800 Subject: [PATCH 008/681] Fix failed cgemv and zgemv test case after using msa optimization The cgemv and zgemv test case will call cgemv_n/t_msa.c zgemv_n/t_msa.c files in MIPS environment. When the macro CONJ is defined, the calculation result will be wrong due to the wrong definition of OP2. This patch updates the value of OP2 and passes the corresponding test. --- kernel/mips/cgemv_n_msa.c | 4 ++-- kernel/mips/cgemv_t_msa.c | 26 +++++++++++++++++++------- kernel/mips/zgemv_n_msa.c | 4 ++-- kernel/mips/zgemv_t_msa.c | 26 +++++++++++++++++++------- 4 files changed, 42 insertions(+), 18 deletions(-) diff --git a/kernel/mips/cgemv_n_msa.c b/kernel/mips/cgemv_n_msa.c index 12fa7ca02..c1eb9bbfd 100644 --- a/kernel/mips/cgemv_n_msa.c +++ b/kernel/mips/cgemv_n_msa.c @@ -56,11 +56,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(XCONJ) #define OP0 += #define OP1 -= - #define OP2 -= + #define OP2 += #else #define OP0 -= #define OP1 -= - #define OP2 += + #define OP2 -= #endif #endif diff --git a/kernel/mips/cgemv_t_msa.c b/kernel/mips/cgemv_t_msa.c index 584e3de75..800667b6e 100644 --- a/kernel/mips/cgemv_t_msa.c +++ b/kernel/mips/cgemv_t_msa.c @@ -32,14 +32,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
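/* OP0..OP2 pick the +=/-= signs used when accumulating the real and
   imaginary parts of the transposed gemv dot products.  The two-way
   test replaced below only distinguished CONJ XOR XCONJ; the nested
   form enumerates all four CONJ/XCONJ combinations, giving the
   CONJ-only case its correct signs. */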
#undef OP1 #undef OP2 -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - #define OP0 -= - #define OP1 += - #define OP2 += +#if !defined(CONJ) + #if !defined(XCONJ) + #define OP0 -= + #define OP1 += + #define OP2 += + #else + #define OP0 += + #define OP1 += + #define OP2 -= + #endif #else - #define OP0 += - #define OP1 += - #define OP2 -= + #if !defined(XCONJ) + #define OP0 += + #define OP1 -= + #define OP2 += + #else + #define OP0 -= + #define OP1 -= + #define OP2 -= + #endif #endif #define CGEMV_T_8x4() \ diff --git a/kernel/mips/zgemv_n_msa.c b/kernel/mips/zgemv_n_msa.c index 669c25758..97a80b4ba 100644 --- a/kernel/mips/zgemv_n_msa.c +++ b/kernel/mips/zgemv_n_msa.c @@ -56,11 +56,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(XCONJ) #define OP0 += #define OP1 -= - #define OP2 -= + #define OP2 += #else #define OP0 -= #define OP1 -= - #define OP2 += + #define OP2 -= #endif #endif diff --git a/kernel/mips/zgemv_t_msa.c b/kernel/mips/zgemv_t_msa.c index e6febb577..6492f90be 100644 --- a/kernel/mips/zgemv_t_msa.c +++ b/kernel/mips/zgemv_t_msa.c @@ -34,14 +34,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #undef OP3 #undef OP4 -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - #define OP0 -= - #define OP1 += - #define OP2 += +#if !defined(CONJ) + #if !defined(XCONJ) + #define OP0 -= + #define OP1 += + #define OP2 += + #else + #define OP0 += + #define OP1 += + #define OP2 -= + #endif #else - #define OP0 += - #define OP1 += - #define OP2 -= + #if !defined(XCONJ) + #define OP0 += + #define OP1 -= + #define OP2 += + #else + #define OP0 -= + #define OP1 -= + #define OP2 -= + #endif #endif #define ZGEMV_T_8x1() \ From d67babf34536ffd0cba4142aa1ea4496394438cd Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 8 Dec 2020 19:16:39 +0800 Subject: [PATCH 009/681] Remove gcc unrecognized option '-msched-weight' when check msa --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index fe9c53f0e..970d475d7 100644 --- a/c_check +++ b/c_check @@ -199,7 +199,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { } else { $tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); $code = '"addvi.b $w0, $w1, 1"'; - $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; + $msa_flags = "-mmsa -mfp64 -mload-store-pairs"; print $tmpf "#include \n\n"; print $tmpf "void main(void){ __asm__ volatile($code); }\n"; From 5d26223f4a91e14ec711168f6e4a40f21729be38 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Dec 2020 20:59:56 +0100 Subject: [PATCH 010/681] remove extra/intermediate size step of min_jj from PR747 --- driver/level3/level3.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index a38506585..9b44deb85 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -339,8 +339,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else - if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; +/* + if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else +*/ if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif From a5547124393a3ea7538998e98356cb052dc652d0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Dec 2020 21:01:36 +0100 Subject: [PATCH 011/681] remove extra/intermediate size step for min_jj introduced in PR747 --- 
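Patches 010 and 011 drop the same intermediate 2*GEMM_UNROLL_N clamp, once in level3.c and once in level3_thread.c. The surviving logic, as a standalone sketch under the assumption that it can be read in isolation (clamp_min_jj and unroll_n are illustrative names, not symbols in the tree):

/* min_jj is the width of the B panel copied per iteration.  With the
 * intermediate 2*GEMM_UNROLL_N step commented out, widths between
 * GEMM_UNROLL_N and 3*GEMM_UNROLL_N now collapse to GEMM_UNROLL_N. */
static long clamp_min_jj(long min_jj, long unroll_n)
{
    if (min_jj >= 3 * unroll_n)
        return 3 * unroll_n;
    /* removed step: if (min_jj >= 2 * unroll_n) return 2 * unroll_n; */
    if (min_jj > unroll_n)
        return unroll_n;
    return min_jj;
}

For example, with GEMM_UNROLL_N = 8 a requested width of 16 previously stayed at 16 and now collapses to 8; only widths of 24 or more keep the larger 3*unroll_n panel.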
driver/level3/level3_thread.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 6e1fd9e99..2b33c9589 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -373,8 +373,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else +/* if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else +*/ if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif /* Copy part of local region of B into workspace */ From d71fe4ed4eff491a9e6aae87fbd46cf9d2914d9e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Dec 2020 21:07:57 +0100 Subject: [PATCH 012/681] Remove GEMM_DEFAULT_UNROLL_MN parameters for Haswell and ZEN (introduced in PR747) --- param.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index a0d45c573..42f63b4b5 100644 --- a/param.h +++ b/param.h @@ -644,9 +644,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 - +/* #define SGEMM_DEFAULT_UNROLL_MN 32 #define DGEMM_DEFAULT_UNROLL_MN 32 +*/ #endif #ifdef ARCH_X86 @@ -1552,9 +1553,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 - +/* #define SGEMM_DEFAULT_UNROLL_MN 32 #define DGEMM_DEFAULT_UNROLL_MN 32 +*/ #endif #ifdef ARCH_X86 From 4b548857d64e6f0fb3aefbd0bd5bd4d14f2a22d7 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 26 Nov 2020 14:59:41 +0800 Subject: [PATCH 013/681] Add msa support for loongson 1. Using core loongson3r3 and loongson3r4 for loongson 2. 
Add DYNAMIC_ARCH for loongson Change-Id: I1c6b54dbeca3a0cc31d1222af36a7e9bd6ab54c1 --- Makefile.system | 27 +- common_linux.h | 8 - common_mips64.h | 9 +- cpuid_mips64.c | 91 +++---- driver/others/Makefile | 8 + driver/others/blas_server.c | 2 + driver/others/dynamic_mips64.c | 230 ++++++++++++++++++ driver/others/parameter.c | 16 +- getarch.c | 24 +- kernel/Makefile | 5 + kernel/Makefile.L3 | 4 - kernel/mips/cgemm_kernel_8x4_msa.c | 4 +- kernel/mips/crot_msa.c | 6 +- kernel/mips/cscal_msa.c | 6 +- kernel/mips/dscal_msa.c | 4 +- kernel/mips/dtrsm_kernel_LN_8x4_msa.c | 38 +-- kernel/mips/dtrsm_kernel_LT_8x4_msa.c | 36 +-- kernel/mips/dtrsm_kernel_RN_8x4_msa.c | 21 +- kernel/mips/dtrsm_kernel_RT_8x4_msa.c | 21 +- kernel/mips/macros_msa.h | 8 +- kernel/mips/srot_msa.c | 6 +- kernel/mips/sscal_msa.c | 6 +- kernel/mips/zscal_msa.c | 8 +- kernel/mips64/KERNEL.LOONGSON3B | 64 ----- .../{KERNEL.LOONGSON3A => KERNEL.LOONGSON3R3} | 27 +- kernel/mips64/KERNEL.LOONGSON3R4 | 192 +++++++++++++++ kernel/setparam-ref.c | 72 ++++++ param.h | 48 ++-- 28 files changed, 656 insertions(+), 335 deletions(-) create mode 100644 driver/others/dynamic_mips64.c delete mode 100644 kernel/mips64/KERNEL.LOONGSON3B rename kernel/mips64/{KERNEL.LOONGSON3A => KERNEL.LOONGSON3R3} (75%) create mode 100644 kernel/mips64/KERNEL.LOONGSON3R4 diff --git a/Makefile.system b/Makefile.system index c17cd3bd1..6377f66ea 100644 --- a/Makefile.system +++ b/Makefile.system @@ -625,6 +625,10 @@ DYNAMIC_CORE += EMAG8180 DYNAMIC_CORE += THUNDERX3T110 endif +ifeq ($(ARCH), mips64) +DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 +endif + ifeq ($(ARCH), zarch) DYNAMIC_CORE = ZARCH_GENERIC @@ -787,14 +791,9 @@ CCOMMON_OPT += -mabi=32 BINARY_DEFINED = 1 endif -ifeq ($(CORE), LOONGSON3A) -CCOMMON_OPT += -march=mips64 -FCOMMON_OPT += -march=mips64 -endif - -ifeq ($(CORE), LOONGSON3B) -CCOMMON_OPT += -march=mips64 -FCOMMON_OPT += -march=mips64 +ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) +CCOMMON_OPT += -march=loongson3a +FCOMMON_OPT += -march=loongson3a endif ifeq ($(CORE), MIPS24K) @@ -1078,11 +1077,11 @@ FCOMMON_OPT += -n32 else FCOMMON_OPT += -n64 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3R3) FCOMMON_OPT += -loongson3 -static endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3R4) FCOMMON_OPT += -loongson3 -static endif @@ -1108,11 +1107,11 @@ CCOMMON_OPT += -n32 else CCOMMON_OPT += -n64 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3R3) CCOMMON_OPT += -loongson3 -static endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3R4) CCOMMON_OPT += -loongson3 -static endif @@ -1223,10 +1222,8 @@ ifdef SMP CCOMMON_OPT += -DSMP_SERVER ifeq ($(ARCH), mips64) -ifneq ($(CORE), LOONGSON3B) USE_SIMPLE_THREADED_LEVEL3 = 1 endif -endif ifeq ($(USE_OPENMP), 1) # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -1342,11 +1339,9 @@ endif ifneq ($(ARCH), x86_64) ifneq ($(ARCH), x86) -ifneq ($(CORE), LOONGSON3B) NO_AFFINITY = 1 endif endif -endif ifdef NO_AFFINITY ifeq ($(NO_AFFINITY), 0) diff --git a/common_linux.h b/common_linux.h index 35f3fb658..5a1c4e150 100644 --- a/common_linux.h +++ b/common_linux.h @@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 return 0; #else -#if defined (LOONGSON3B) -#if defined (__64BIT__) - return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); -#else - return 0; //NULL Implementation on Loongson 3B 32bit. 
-#endif -#else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 // unsigned long null_nodemask=0; return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); #endif -#endif } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { diff --git a/common_mips64.h b/common_mips64.h index a06edfe08..287459e7d 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -229,12 +229,7 @@ REALNAME: ;\ #define BUFFER_SIZE ( 32 << 21) -#if defined(LOONGSON3A) -#define PAGESIZE (16UL << 10) -#define FIXED_PAGESIZE (16UL << 10) -#endif - -#if defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10) #endif @@ -250,7 +245,7 @@ REALNAME: ;\ #define MAP_ANONYMOUS MAP_ANON #endif -#if defined(LOONGSON3A) || defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PREFETCHD_(x) ld $0, x #define PREFETCHD(x) PREFETCHD_(x) #else diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 0c19ac1e7..674b65908 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#define CPU_UNKNOWN 0 -#define CPU_SICORTEX 1 -#define CPU_LOONGSON3A 2 -#define CPU_LOONGSON3B 3 -#define CPU_I6400 4 -#define CPU_P6600 5 -#define CPU_I6500 6 +#define CPU_UNKNOWN 0 +#define CPU_SICORTEX 1 +#define CPU_LOONGSON3R3 2 +#define CPU_LOONGSON3R4 3 +#define CPU_I6400 4 +#define CPU_P6600 5 +#define CPU_I6500 6 static char *cpuname[] = { "UNKNOWN", "SICORTEX", - "LOONGSON3A", - "LOONGSON3B", + "LOONGSON3R3", + "LOONGSON3R4", "I6400", "P6600", "I6500" @@ -90,48 +90,13 @@ static char *cpuname[] = { int detect(void){ -#ifdef __linux +#ifdef linux FILE *infile; char buffer[512], *p; p = (char *)NULL; - infile = fopen("/proc/cpuinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("cpu", buffer, 3)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - }else if (strstr(p, "Loongson-3")){ - infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("system type", buffer, 11)){ - p = strchr(buffer, ':') + 2; - break; - } - } - fclose(infile); - if (strstr(p, "loongson3a")) - return CPU_LOONGSON3A; - }else{ - return CPU_SICORTEX; - } - } //Check model name for Loongson3 infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("model name", buffer, 10)){ p = strchr(buffer, ':') + 2; @@ -140,14 +105,16 @@ int detect(void){ } fclose(infile); if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - } + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ + return CPU_LOONGSON3R3; + }else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ + return CPU_LOONGSON3R4; + } else{ + return CPU_SICORTEX; } #endif return CPU_UNKNOWN; + } } char *get_corename(void){ @@ -159,10 +126,10 @@ void get_architecture(void){ } void get_subarchitecture(void){ - if(detect()==CPU_LOONGSON3A) { - printf("LOONGSON3A"); - }else if(detect()==CPU_LOONGSON3B){ - 
printf("LOONGSON3B"); + if(detect()==CPU_LOONGSON3R3) { + printf("LOONGSON3R3"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("LOONGSON3R4"); }else if(detect()==CPU_I6400){ printf("I6400"); }else if(detect()==CPU_P6600){ @@ -179,8 +146,8 @@ void get_subdirname(void){ } void get_cpuconfig(void){ - if(detect()==CPU_LOONGSON3A) { - printf("#define LOONGSON3A\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("#define LOONGSON3R3\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -188,8 +155,8 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); - }else if(detect()==CPU_LOONGSON3B){ - printf("#define LOONGSON3B\n"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("#define LOONGSON3R4\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -237,10 +204,10 @@ void get_cpuconfig(void){ } void get_libname(void){ - if(detect()==CPU_LOONGSON3A) { - printf("loongson3a\n"); - }else if(detect()==CPU_LOONGSON3B) { - printf("loongson3b\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("loongson3r3\n"); + }else if(detect()==CPU_LOONGSON3R4) { + printf("loongson3r4\n"); }else if(detect()==CPU_I6400) { printf("i6400\n"); }else if(detect()==CPU_P6600) { diff --git a/driver/others/Makefile b/driver/others/Makefile index d09444f56..4a421ef31 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -24,10 +24,14 @@ else ifeq ($(ARCH),zarch) COMMONOBJS += dynamic_zarch.$(SUFFIX) else +ifeq ($(ARCH),mips64) +COMMONOBJS += dynamic_mips64.$(SUFFIX) +else COMMONOBJS += dynamic.$(SUFFIX) endif endif endif +endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -92,10 +96,14 @@ else ifeq ($(ARCH),zarch) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX) else +ifeq ($(ARCH),mips64) +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX) +else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) endif endif endif +endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 30e0cc6c2..5e0943c2e 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -967,9 +967,11 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; #if defined(ARCH_MIPS64) +#ifndef DYNAMIC_ARCH //set parameters for different number of threads. blas_set_parameter(); #endif +#endif } diff --git a/driver/others/dynamic_mips64.c b/driver/others/dynamic_mips64.c new file mode 100644 index 000000000..9fd19d739 --- /dev/null +++ b/driver/others/dynamic_mips64.c @@ -0,0 +1,230 @@ +/***************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. 
Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include "common.h" + +extern gotoblas_t gotoblas_LOONGSON3R3; +extern gotoblas_t gotoblas_LOONGSON3R4; + +extern void openblas_warning(int verbose, const char * msg); + +#define NUM_CORETYPES 2 + +static char *corename[] = { + "loongson3r3", + "loongson3r4", + "UNKNOWN" +}; + +char *gotoblas_corename(void) { + if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0]; + if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1]; + return corename[NUM_CORETYPES]; +} + +static gotoblas_t *force_coretype(char *coretype) { + int i; + int found = -1; + char message[128]; + + for ( i=0 ; i < NUM_CORETYPES; i++) + { + if (!strncasecmp(coretype, corename[i], 20)) + { + found = i; + break; + } + } + + switch (found) + { + case 0: return (&gotoblas_LOONGSON3R3); + case 1: return (&gotoblas_LOONGSON3R4); + } + snprintf(message, 128, "Core not found: %s\n", coretype); + openblas_warning(1, message); + return NULL; +} + +#define MMI_MASK 0x00000010 +#define MSA_MASK 0x00000020 + +int fd[2]; +int support_cpucfg; + +static void handler(int signum) +{ + close(fd[1]); + exit(1); +} + +/* Brief : Function to check if cpucfg supported on loongson + * Return: 1 supported + * 0 not supported + */ +static int cpucfg_test(void) { + pid_t pid; + int status = 0; + + support_cpucfg = 0; + pipe(fd); + pid = fork(); + if (pid == 0) { /* Subprocess */ + struct sigaction act; + close(fd[0]); + /* Set signal action for SIGILL. */ + act.sa_handler = handler; + sigaction(SIGILL,&act,NULL); + + /* Execute cpucfg in subprocess. 
*/ + __asm__ volatile( + ".insn \n\t" + ".word (0xc8080118) \n\t" + ::: + ); + support_cpucfg = 1; + write(fd[1],&support_cpucfg,sizeof(support_cpucfg)); + close(fd[1]); + exit(0); + } else if (pid > 0){ /* Parent process*/ + close(fd[1]); + if ((waitpid(pid,&status,0) <= 0) || + (read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0)) + support_cpucfg = 0; + close(fd[0]); + } else { + support_cpucfg = 0; + } + + return support_cpucfg; +} + +static gotoblas_t *get_coretype_from_cpucfg(void) { + int flag = 0; + __asm__ volatile( + ".insn \n\t" + "dli $8, 0x01 \n\t" + ".word (0xc9084918) \n\t" + "usw $9, 0x00(%0) \n\t" + : + : "r"(&flag) + : "memory" + ); + if (flag & MSA_MASK) + return (&gotoblas_LOONGSON3R4); + if (flag & MMI_MASK) + return (&gotoblas_LOONGSON3R3); + return NULL; +} + +static gotoblas_t *get_coretype_from_cpuinfo(void) { +#ifdef linux + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + //Check model name for Loongson3 + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("model name", buffer, 10)){ + p = strchr(buffer, ':') + 2; + break; + } + } + fclose(infile); + if(p != NULL){ + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")) + return (&gotoblas_LOONGSON3R3); + else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")) + return (&gotoblas_LOONGSON3R4); + else + return NULL; + } +#endif + return NULL; +} + +static gotoblas_t *get_coretype(void) { + int ret = 0; + + ret = cpucfg_test(); + if (ret == 1) + return get_coretype_from_cpucfg(); + else + return get_coretype_from_cpuinfo(); +} + +void gotoblas_dynamic_init(void) { + char coremsg[128]; + char coren[22]; + char *p; + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == NULL) + { + snprintf(coremsg, 128, "Falling back to loongson3r3 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_LOONGSON3R3; + } + + if (gotoblas && gotoblas->init) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 35fc0a253..36da13369 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -717,7 +717,7 @@ void blas_set_parameter(void){ #if defined(ARCH_MIPS64) void blas_set_parameter(void){ -#if defined(LOONGSON3A) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #ifdef SMP if(blas_num_threads == 1){ #endif @@ -731,20 +731,6 @@ void blas_set_parameter(void){ #endif #endif -#if defined(LOONGSON3B) -#ifdef SMP - if(blas_num_threads == 1 || blas_num_threads == 2){ -#endif - //single thread - dgemm_r = 640; -#ifdef SMP - }else{ - //multi thread - dgemm_r = 160; - } -#endif -#endif - } #endif diff --git a/getarch.c b/getarch.c index 9344defb5..e59a4e9b7 100644 --- a/getarch.c +++ b/getarch.c @@ -140,8 +140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* #define FORCE_PPC440FP2 */ /* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ -/* #define FORCE_LOONGSON3A */ -/* #define FORCE_LOONGSON3B */ +/* #define FORCE_LOONGSON3R3 */ +/* #define FORCE_LOONGSON3R4 */ /* #define FORCE_I6400 */ /* #define FORCE_P6600 */ /* #define FORCE_P5600 */ @@ -814,31 +814,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef FORCE_LOONGSON3A +#ifdef FORCE_LOONGSON3R3 #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3A" +#define SUBARCHITECTURE "LOONGSON3R3" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3A " \ +#define ARCHCONFIG "-DLOONGSON3R3 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3a" -#define CORENAME "LOONGSON3A" +#define LIBNAME "loongson3r3" +#define CORENAME "LOONGSON3R3" #else #endif -#ifdef FORCE_LOONGSON3B +#ifdef FORCE_LOONGSON3R4 #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3B" +#define SUBARCHITECTURE "LOONGSON3R4" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3B " \ +#define ARCHCONFIG "-DLOONGSON3R4 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3b" -#define CORENAME "LOONGSON3B" +#define LIBNAME "loongson3r4" +#define CORENAME "LOONGSON3R4" #else #endif diff --git a/kernel/Makefile b/kernel/Makefile index fb1d5d39a..4e86546b9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -58,6 +58,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) endif else ifeq ($(TARGET_CORE), HASWELL) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) +else ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif @@ -68,6 +70,9 @@ else TARGET_CORE = $(CORE) KDIR = TSUFFIX = +ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += $(MSA_FLAGS) +endif endif -include $(KERNELDIR)/KERNEL.$(TARGET_CORE) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 893713769..d8d739965 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -29,10 +29,6 @@ ifeq ($(ARCH), riscv64) USE_TRMM = 1 endif -ifeq ($(TARGET), LOONGSON3B) -USE_TRMM = 1 -endif - ifneq ($(DYNAMIC_ARCH), 1) ifeq ($(TARGET), GENERIC) USE_TRMM = 1 diff --git a/kernel/mips/cgemm_kernel_8x4_msa.c b/kernel/mips/cgemm_kernel_8x4_msa.c index 8b624be88..aa3f1dcfa 100644 --- a/kernel/mips/cgemm_kernel_8x4_msa.c +++ b/kernel/mips/cgemm_kernel_8x4_msa.c @@ -121,7 +121,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ - src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ @@ -200,7 +200,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP2_INC(pa0, 4, src_a0, src_a1); \ - src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ diff --git a/kernel/mips/crot_msa.c b/kernel/mips/crot_msa.c index 5273e38a3..84eb54d6d 100644 --- a/kernel/mips/crot_msa.c +++ b/kernel/mips/crot_msa.c @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, { if ((0 == c) && (0 == s)) { - v4f32 zero = __msa_cast_to_vector_float(0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + v4f32 zero = {0.0, 0.0, 0.0, 0.0}; /* process 2 elements */ for (j = (n >> 1); j--;) diff --git a/kernel/mips/cscal_msa.c b/kernel/mips/cscal_msa.c index 11a1450cf..451d0c921 100644 --- a/kernel/mips/cscal_msa.c +++ b/kernel/mips/cscal_msa.c @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dscal_msa.c b/kernel/mips/dscal_msa.c index 6ce0375ab..2e41d8bef 100644 --- a/kernel/mips/dscal_msa.c +++ b/kernel/mips/dscal_msa.c @@ -44,9 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c index 9fb5141ca..e2cd3aa4b 100644 --- a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -186,8 +186,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); - src_a54 = __msa_cast_to_vector_double(*(a + 54)); - src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a + 54)); src_a62 = LD_DP(a + 62); src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); @@ -200,8 +199,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a44 = LD_DP(a + 44); src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); - src_a36 = __msa_cast_to_vector_double(*(a + 36)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a + 36)); res_c7 *= src_a63; res_c6 -= res_c7 * src_a62; @@ -271,8 +269,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a26 = LD_DP(a + 26); src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); - src_a18 = __msa_cast_to_vector_double(*(a + 18)); - src_a18 = (v2f64) __msa_splati_d((v2i64) 
src_a18, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a + 18)); res_c3 -= res_c7 * src_a59; res_c2 -= res_c7 * src_a58; @@ -358,8 +355,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a8 = LD_DP(a + 8); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); res_c1 -= res_c2 * src_a17; res_c1 *= src_a9; @@ -488,8 +484,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a52 = LD_DP(a - 12); src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); - src_a54 = __msa_cast_to_vector_double(*(a - 10)); - src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a -10)); src_a40 = LD_DP(a - 24); src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); @@ -526,8 +521,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a34 = LD_DP(a - 30); src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); - src_a36 = __msa_cast_to_vector_double(*(a - 28)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a -28)); res_c4 *= src_a36; res_c3 -= res_c4 * src_a35; @@ -544,10 +538,8 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a16 = LD_DP(a - 48); src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); - src_a18 = __msa_cast_to_vector_double(*(a - 46)); - src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); - src_a0 = __msa_cast_to_vector_double(*(a - 64)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a - 46)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a - 64)); src_a8 = LD_DP(a - 56); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); @@ -785,11 +777,8 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); @@ -890,11 +879,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c index 525fc8585..74cc1278a 100644 --- a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c @@ -215,8 +215,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, 
BLASLONG ldc, BLASLONG bk) res_c14 -= res_c8 * src_a6; res_c15 -= res_c8 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -280,8 +279,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c10 * src_a22; res_c15 -= res_c10 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); @@ -326,8 +324,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c12 * src_a38; res_c15 -= res_c12 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -353,8 +350,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -478,8 +474,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c0 * src_a6; res_c7 -= res_c0 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -515,8 +510,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c2 * src_a22; res_c7 -= res_c2 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); @@ -553,8 +547,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a38; res_c7 -= res_c4 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -563,8 +556,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c5 * src_a46; res_c7 -= res_c5 * src_a47; - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -786,8 +778,7 @@ static void 
dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a2; res_c7 -= res_c4 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -803,8 +794,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; @@ -881,8 +871,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c2 -= res_c0 * src_a2; res_c3 -= res_c0 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -894,8 +883,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c index cb361c511..03036f1c7 100644 --- a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c @@ -161,16 +161,14 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -294,8 +292,7 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -347,8 +344,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) } } - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 *= src_b0; src_c1 *= src_b0; @@ -407,16 +403,14 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b2 = LD_DP(b + 2); src_b3 = 
(v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -490,8 +484,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c index 581a90f71..4c55a0f37 100644 --- a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -168,11 +168,9 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -298,8 +296,7 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 16; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); @@ -377,8 +374,7 @@ static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) a -= 8; b -= 1; - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 *= src_b0; src_c1 *= src_b0; @@ -445,11 +441,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -527,8 +521,7 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 8; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = 
COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index ee0dea0b7..b887800ed 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -63,16 +63,12 @@ inline static void prefetch_load_lf(unsigned char *src) #define ST_DP(...) ST_D(v2f64, __VA_ARGS__) #define COPY_FLOAT_TO_VECTOR(a) ( { \ - v4f32 out; \ - out = __msa_cast_to_vector_float(a); \ - out = (v4f32) __msa_splati_w((v4i32) out, 0); \ + v4f32 out = {a, a, a, a}; \ out; \ } ) #define COPY_DOUBLE_TO_VECTOR(a) ( { \ - v2f64 out; \ - out = __msa_cast_to_vector_double(a); \ - out = (v2f64) __msa_splati_d((v2i64) out, 0); \ + v2f64 out = {a, a}; \ out; \ } ) diff --git a/kernel/mips/srot_msa.c b/kernel/mips/srot_msa.c index 75730241a..79d921b7a 100644 --- a/kernel/mips/srot_msa.c +++ b/kernel/mips/srot_msa.c @@ -48,11 +48,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, { if ((0 == c) && (0 == s)) { - v4f32 zero = __msa_cast_to_vector_float(0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + v4f32 zero = {0.0, 0.0, 0.0, 0.0}; /* process 4 floats */ for (j = (n >> 2); j--;) diff --git a/kernel/mips/sscal_msa.c b/kernel/mips/sscal_msa.c index 64b62d659..66e17b844 100644 --- a/kernel/mips/sscal_msa.c +++ b/kernel/mips/sscal_msa.c @@ -44,11 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 6); i--;) { diff --git a/kernel/mips/zscal_msa.c b/kernel/mips/zscal_msa.c index 5a8766d3c..a45c3cecd 100644 --- a/kernel/mips/zscal_msa.c +++ b/kernel/mips/zscal_msa.c @@ -49,9 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { @@ -475,9 +473,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { diff --git a/kernel/mips64/KERNEL.LOONGSON3B b/kernel/mips64/KERNEL.LOONGSON3B deleted file mode 100644 index e476c631e..000000000 --- a/kernel/mips64/KERNEL.LOONGSON3B +++ /dev/null @@ -1,64 +0,0 @@ -SAXPYKERNEL=axpy_loongson3a.S -DAXPYKERNEL=daxpy_loongson3a_simd.S - -SGEMVNKERNEL = gemv_n_loongson3a.c -SGEMVTKERNEL = gemv_t_loongson3a.c -DGEMVNKERNEL = gemv_n_loongson3a.c -DGEMVTKERNEL = gemv_t_loongson3a.c -CGEMVNKERNEL = zgemv_n_loongson3a.c -CGEMVTKERNEL = zgemv_t_loongson3a.c -ZGEMVNKERNEL = zgemv_n_loongson3a.c -ZGEMVTKERNEL = zgemv_t_loongson3a.c - -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL 
= ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3R3 similarity index 75% rename from kernel/mips64/KERNEL.LOONGSON3A rename to kernel/mips64/KERNEL.LOONGSON3R3 index 0298faaad..904828d57 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3R3 @@ -16,32 +16,32 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) 
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -64,6 +64,3 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DSDOTKERNEL = ../mips/dot.c - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3R4 b/kernel/mips64/KERNEL.LOONGSON3R4 new file mode 100644 index 000000000..b81e5441d --- /dev/null +++ b/kernel/mips64/KERNEL.LOONGSON3R4 @@ -0,0 +1,192 @@ +ifdef HAVE_MSA +SAXPYKERNEL = ../mips/saxpy_msa.c +DAXPYKERNEL = ../mips/daxpy_msa.c +CAXPYKERNEL = ../mips/caxpy_msa.c +ZAXPYKERNEL = ../mips/zaxpy_msa.c +else +SAXPYKERNEL = axpy_loongson3a.S +DAXPYKERNEL = daxpy_loongson3a_simd.S +endif + +ifdef HAVE_MSA +SCOPYKERNEL = ../mips/scopy_msa.c +DCOPYKERNEL = ../mips/dcopy_msa.c +CCOPYKERNEL = ../mips/ccopy_msa.c +ZCOPYKERNEL = ../mips/zcopy_msa.c +endif + +ifdef HAVE_MSA +SDOTKERNEL = ../mips/sdot_msa.c +DDOTKERNEL = ../mips/ddot_msa.c +CDOTKERNEL = ../mips/cdot_msa.c +ZDOTKERNEL = ../mips/zdot_msa.c +endif +DSDOTKERNEL = ../mips/dot.c + +ifdef HAVE_MSA +SROTKERNEL = ../mips/srot_msa.c +DROTKERNEL = ../mips/drot_msa.c +CROTKERNEL = ../mips/crot_msa.c +ZROTKERNEL = ../mips/zrot_msa.c +endif + +ifdef HAVE_MSA +SSCALKERNEL = ../mips/sscal_msa.c +DSCALKERNEL = ../mips/dscal_msa.c +CSCALKERNEL = ../mips/cscal_msa.c +ZSCALKERNEL = ../mips/zscal_msa.c +endif + +ifdef HAVE_MSA +SGEMVNKERNEL = ../mips/sgemv_n_msa.c +DGEMVNKERNEL = ../mips/dgemv_n_msa.c +SGEMVTKERNEL = ../mips/sgemv_t_msa.c +DGEMVTKERNEL = ../mips/dgemv_t_msa.c +CGEMVNKERNEL = ../mips/cgemv_n_msa.c +CGEMVTKERNEL = ../mips/cgemv_t_msa.c +ZGEMVNKERNEL = ../mips/zgemv_n_msa.c +ZGEMVTKERNEL = ../mips/zgemv_t_msa.c +else +SGEMVNKERNEL = gemv_n_loongson3a.c +SGEMVTKERNEL = gemv_t_loongson3a.c +DGEMVNKERNEL = gemv_n_loongson3a.c +DGEMVTKERNEL = gemv_t_loongson3a.c +CGEMVNKERNEL = zgemv_n_loongson3a.c +CGEMVTKERNEL = zgemv_t_loongson3a.c +ZGEMVNKERNEL = zgemv_n_loongson3a.c +ZGEMVTKERNEL = zgemv_t_loongson3a.c +endif + +ifdef HAVE_MSA +SASUMKERNEL = ../mips/sasum_msa.c +DASUMKERNEL = ../mips/dasum_msa.c +CASUMKERNEL = ../mips/casum_msa.c +ZASUMKERNEL = ../mips/zasum_msa.c +endif + +ifdef HAVE_MSA +SSWAPKERNEL = ../mips/sswap_msa.c +DSWAPKERNEL = ../mips/dswap_msa.c +CSWAPKERNEL = ../mips/cswap_msa.c +ZSWAPKERNEL = ../mips/zswap_msa.c +endif + +ifdef HAVE_MSA +SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c +SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c +SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +SGEMMKERNEL = sgemm_kernel_8x4_ps.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c +DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c +DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c +DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c +DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S +DGEMMONCOPY = 
../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c +CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c +CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c +CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c +CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c +ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c +ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c +STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c +STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c +STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c +else +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c +DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c +DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c +DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c +else +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index d0317a745..1e846a61c 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -933,6 +933,77 @@ static void init_parameter(void) { } #else // (ARCH_ARM64) +#if defined(ARCH_MIPS64) +static void init_parameter(void) { + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = 
DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; + + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; + + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; + TABLE_NAME.dgemm_r = 640; + TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; + TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; + +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; + TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; + TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; + TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R; + TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R; +#endif + +#if defined(USE_GEMM3M) +#ifdef CGEMM3M_DEFAULT_P + TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; +#else + TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; +#endif + +#ifdef ZGEMM3M_DEFAULT_P + TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; +#else + TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; +#endif + +#ifdef CGEMM3M_DEFAULT_Q + TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q; +#endif + +#ifdef ZGEMM3M_DEFAULT_Q + TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q; +#endif + +#ifdef CGEMM3M_DEFAULT_R + TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R; +#else + TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r; +#endif + +#ifdef ZGEMM3M_DEFAULT_R + TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R; +#else + TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r; +#endif + +#ifdef EXPRECISION + TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; + TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q; + TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; +#endif +#endif +} +#else // (ARCH_MIPS64) #if (ARCH_POWER) static void init_parameter(void) { @@ -1780,4 +1851,5 @@ static void init_parameter(void) { } #endif //POWER #endif //ZARCH +#endif //(ARCH_MIPS64) #endif //(ARCH_ARM64) diff --git a/param.h b/param.h index a0d45c573..6946c2b41 100644 --- a/param.h +++ b/param.h @@ -2570,8 +2570,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSON3A -/*Copy from SICORTEX*/ +#if defined(LOONGSON3R4) #define SNUMOPT 2 #define DNUMOPT 2 @@ -2579,6 +2578,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL +#ifdef HAVE_MSA +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#else #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2590,6 +2602,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 +#endif #define SGEMM_DEFAULT_P 64 #define DGEMM_DEFAULT_P 44 @@ -2612,7 +2625,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSON3B +#if defined(LOONGSON3R3) +////Copy from SICORTEX #define SNUMOPT 2 #define DNUMOPT 2 @@ -2620,32 +2634,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
-#define SGEMM_DEFAULT_UNROLL_M 2
-#define SGEMM_DEFAULT_UNROLL_N 2
+#define SGEMM_DEFAULT_UNROLL_M 8
+#define SGEMM_DEFAULT_UNROLL_N 4
 
-#define DGEMM_DEFAULT_UNROLL_M 2
-#define DGEMM_DEFAULT_UNROLL_N 2
+#define DGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_N 4
 
-#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 4
 #define CGEMM_DEFAULT_UNROLL_N 2
 
 #define ZGEMM_DEFAULT_UNROLL_M 2
 #define ZGEMM_DEFAULT_UNROLL_N 2
 
 #define SGEMM_DEFAULT_P 64
-#define DGEMM_DEFAULT_P 24
-#define CGEMM_DEFAULT_P 24
-#define ZGEMM_DEFAULT_P 20
+#define DGEMM_DEFAULT_P 44
+#define CGEMM_DEFAULT_P 64
+#define ZGEMM_DEFAULT_P 32
 
 #define SGEMM_DEFAULT_Q 192
-#define DGEMM_DEFAULT_Q 128
+#define DGEMM_DEFAULT_Q 92
 #define CGEMM_DEFAULT_Q 128
-#define ZGEMM_DEFAULT_Q 64
+#define ZGEMM_DEFAULT_Q 80
 
-#define SGEMM_DEFAULT_R 512
-#define DGEMM_DEFAULT_R 512
-#define CGEMM_DEFAULT_R 512
-#define ZGEMM_DEFAULT_R 512
+#define SGEMM_DEFAULT_R 640
+#define DGEMM_DEFAULT_R dgemm_r
+#define CGEMM_DEFAULT_R 640
+#define ZGEMM_DEFAULT_R 640
 
 #define GEMM_OFFSET_A1 0x10000
 #define GEMM_OFFSET_B1 0x100000

From be24c66a7c3b746dd9c27db09e4b0e28785025f2 Mon Sep 17 00:00:00 2001
From: gxw
Date: Thu, 10 Dec 2020 10:48:53 +0800
Subject: [PATCH 014/681] Keep LOONGSON3A and LOONGSON3B for loongson

---
 getarch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/getarch.c b/getarch.c
index e59a4e9b7..29671736e 100644
--- a/getarch.c
+++ b/getarch.c
@@ -814,7 +814,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-#ifdef FORCE_LOONGSON3R3
+#if defined FORCE_LOONGSON3R3 || defined FORCE_LOONGSON3A || defined FORCE_LOONGSON3B
 #define FORCE
 #define ARCHITECTURE "MIPS"
 #define SUBARCHITECTURE "LOONGSON3R3"

From 346e30a46a4758eb4d9b8e5783c0b9c3c6b3ce6f Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan
Date: Thu, 10 Dec 2020 11:51:42 -0600
Subject: [PATCH 015/681] POWER10: Improve axpy performance

This patch aligns the stores to 32 byte boundary for saxpy and daxpy
before entering into vector pair loop. For caxpy, changed the store
instructions to stxv to improve performance of unaligned cases.
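A note on the alignment arithmetic this patch introduces: the peeling count
can be sketched in isolation roughly as follows (illustrative C only, not
part of the patch itself; the helper name is invented here).

#include <stdint.h>

/* Sketch: how many leading array elements to process with scalar code
 * so that y sits on a 32-byte boundary when the vector loop starts.
 * elem_shift is 3 for double (8-byte) and 2 for float (4-byte) elements. */
static inline long head_count_to_32b(const void *y, int elem_shift)
{
    /* bytes up to the next 32-byte boundary, converted to a count of
     * elements; the final mask wraps the count to 0 when y is already
     * aligned (32 bytes to go means nothing to peel) */
    return ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> elem_shift)
           & ((32 >> elem_shift) - 1);
}

With elem_shift = 3 this matches the ">> 3 ... & 0x3" expression added to
daxpy below, and with elem_shift = 2 the ">> 2 ... & 0x7" variant added to
saxpy; the vector pair loop that follows then issues only 32-byte-aligned
stores.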
--- kernel/power/caxpy_microk_power10.c | 24 ++++++++++++++++-------- kernel/power/daxpy_power10.c | 17 ++++++++++++----- kernel/power/saxpy_power10.c | 14 ++++++++++---- 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c index 0d13416b3..56a5ab47a 100644 --- a/kernel/power/caxpy_microk_power10.c +++ b/kernel/power/caxpy_microk_power10.c @@ -112,10 +112,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 34, 64(%4) \n\t" - "stxvp 38, 96(%4) \n\t" + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" "addi %4, %4, 128 \n\t" "xxperm 52, 40, %x10 \n\t" // exchange real and imag part @@ -159,10 +163,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 34, 64(%4) \n\t" - "stxvp 38, 96(%4) \n\t" + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" : diff --git a/kernel/power/daxpy_power10.c b/kernel/power/daxpy_power10.c index ebe91a80f..8640efcfd 100644 --- a/kernel/power/daxpy_power10.c +++ b/kernel/power/daxpy_power10.c @@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -16; + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 ) + daxpy_kernel_8(n1, &x[i], &y[i], da); + + i += n1; - if ( n1 ) - daxpy_kernel_8(n1, x, y, da); - - i = n1; while(i < n) { diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c index 8c7c22390..4a13c1f88 100644 --- a/kernel/power/saxpy_power10.c +++ b/kernel/power/saxpy_power10.c @@ -64,12 +64,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -64; - + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -64; if ( n1 ) - saxpy_kernel_64(n1, x, y, da); + saxpy_kernel_64(n1, &x[i], &y[i], da); - i = n1; + i += n1; while(i < n) { From 6232237dba7bdd7e185216f7bb0d733ba4c0486e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 11 Dec 2020 23:41:17 +0100 Subject: [PATCH 016/681] Make fallback from P10 to P9 conditional on suitable compiler --- driver/others/dynamic_power.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index d60ae68fc..a2f56d839 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -53,8 +53,10 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_POWER10; #endif /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ +#if (!defined __GNUC__) || ( __GNUC__ >= 6) if (__builtin_cpu_is("power10")) return &gotoblas_POWER9; +#endif return NULL; } From 
77edf82c7faf9af1412b0f0c9de7a7543341b2e2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 12 Dec 2020 01:25:20 +0100
Subject: [PATCH 017/681] Update Changelog.txt for 0.3.13

---
 Changelog.txt | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/Changelog.txt b/Changelog.txt
index edd3563ec..807c5ff20 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,54 @@ OpenBLAS ChangeLog
+====================================================================
+Version 0.3.13
+ 12-Dec-2020
+
+common:
+ * Added a generic bfloat16 SBGEMV kernel
+ * Fixed a potentially severe memory leak after fork in OpenMP builds
+   that was introduces in 0.3.12
+ * Added detection of the Fujitsu Fortran compiler
+ * Added detection of the (e)gfortran compiler on OpenBSD
+ * Added support for overriding the default name of the library independently
+   from symbol suffixing in the gmake builds (already supported in cmake)
+
+RISCV:
+ * Added a RISC V port optimized for C910V
+
+POWER:
+ * Added optimized POWER10 kernels for SAXPY, CAXPY, SDOT, DDOT and DGEMV_N
+ * Improved DGEMM performance on POWER10
+ * Improved STRSM and DTRSM performance on POWER9 and POWER10
+ * Fixed segmentation faults in DYNAMIC_ARCH builds
+ * Fixed compilation with the PGI compiler
+
+x86:
+ * Fixed compilation of kernels that require SSE2 intrinsics since 0.3.12
+
+x86_64:
+ * Added an optimized bfloat16 SBGEMV kernel for SkylakeX and Cooperlake
+ * Improved the performance of SASUM and DASUM kernels through parallelization
+ * Improved the performance of SROT and DROT kernels
+ * Improved the performance of multithreaded xSYRK
+ * Fixed OpenMP builds that use the LLVM Clang compiler together with GNU gfortran
+   (where linking of both the LLVM libomp and GNU libgomp could lead to lockups or
+   wrong results)
+ * Fixed miscompilations by old gcc 4.6
+ * Fixed misdetection of AVX2 capability in some Sandybridge cpus
+ * Fixed lockups in builds combining DYNAMIC_ARCH with TARGET=GENERIC on OpenBSD
+
+ARM64:
+ * Fixed segmentation faults in DYNAMIC_ARCH builds
+
+MIPS:
+ * Improved kernels for Loongson 3R3 ("3A") and 3R4 ("3B") models, including MSA
+ * Fixed bugs in the MSA kernels for CGEMM, CTRMM, CGEMV and ZGEMV
+ * Added handling of zero increments in the MSA kernels for SSWAP and DSWAP
+ * Added DYNAMIC_ARCH support for MIPS64 (currently Loongson3R3/3R4 only)
+
+SPARC:
+ * Fixed building 32 and 64 bit SPARC kernels with the SolarisStudio compilers
+
 ====================================================================
 Version 0.3.12
 24-Oct-2020

From 3dec81200cdac01651681a3e36f77179a0815eb4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 12 Dec 2020 14:27:37 +0100
Subject: [PATCH 018/681] Update Changelog.txt

Co-authored-by: h-vetinari
---
 Changelog.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Changelog.txt b/Changelog.txt
index 807c5ff20..cbc7007ac 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -6,7 +6,7 @@ Version 0.3.13
 common:
  * Added a generic bfloat16 SBGEMV kernel
  * Fixed a potentially severe memory leak after fork in OpenMP builds
-   that was introduces in 0.3.12
+   that was introduced in 0.3.12
 * Added detection of the Fujitsu Fortran compiler
 * Added detection of the (e)gfortran compiler on OpenBSD
 * Added support for overriding the default name of the library independently

From d3ec787f774bc678ec13f0ed87fe2f3d67af1a11 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 12 Dec 2020 18:14:49 +0100
Subject: [PATCH 019/681] Update version
to 0.3.13 for release --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 1a0965d08..e4b82104e 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.12.dev +VERSION = 0.3.13 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 7bc0e4a2e001117d7e51f0ef8ea1abc4b734d079 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 18:15:33 +0100 Subject: [PATCH 020/681] Update version to 0.3.13 for release --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aeb4399e4..12730e0e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 12.dev) +set(OpenBLAS_PATCH_VERSION 13) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 9031ebd7d50d903ad2372001f4d20908f0c0bf20 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 23:28:20 +0100 Subject: [PATCH 021/681] Update version to 0.3.13.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 12730e0e3..c5ba3ceed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 13) +set(OpenBLAS_PATCH_VERSION 13.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 87315e8a8d1f27684d886c31742d95d98886aa8a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 23:28:49 +0100 Subject: [PATCH 022/681] Update version to 0.3.13.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index e4b82104e..c68c20923 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.13 +VERSION = 0.3.13.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From ad63647446b88f747a058a65f375372434c8f2b0 Mon Sep 17 00:00:00 2001 From: Joshie Date: Sun, 13 Dec 2020 09:06:14 +0000 Subject: [PATCH 023/681] Define BLAS acronym in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 267df5358..6c6322c32 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/sta ## Introduction -OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. +OpenBLAS is an optimized Basic Linear Algebra Subprograms library based on GotoBLAS2 1.13 BSD version. Please read the documentation on the OpenBLAS wiki pages: . From 2fb11f873bfb5d690cbe096d81a837ede4cfa63f Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Sun, 13 Dec 2020 10:41:45 -0600 Subject: [PATCH 024/681] POWER10: Improve copy performance This patch aligns the stores to 32 byte boundary for scopy and dcopy before entering into vector pair loop. 
For ccopy, changed the store instructions to stxv to improve performance of unaligned cases. --- kernel/power/ccopy_microk_power10.c | 115 ++++++++++++++++++++++++++++ kernel/power/ccopy_power10.c | 4 +- kernel/power/copy_microk_power10.c | 25 +++--- kernel/power/dcopy_power10.c | 16 ++-- kernel/power/scopy_power10.c | 15 +++- 5 files changed, 152 insertions(+), 23 deletions(-) create mode 100644 kernel/power/ccopy_microk_power10.c diff --git a/kernel/power/ccopy_microk_power10.c b/kernel/power/ccopy_microk_power10.c new file mode 100644 index 000000000..6c80f9cd4 --- /dev/null +++ b/kernel/power/ccopy_microk_power10.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL 1 + +static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ + ( + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + "addi %2, %2, 256 \n\t" + "addic. %1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + + "stxv 41, 128(%3) \n\t" + "stxv 40, 144(%3) \n\t" + "stxv 43, 160(%3) \n\t" + "stxv 42, 176(%3) \n\t" + "stxv 45, 192(%3) \n\t" + "stxv 44, 208(%3) \n\t" + "stxv 47, 224(%3) \n\t" + "stxv 46, 240(%3) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + + "addi %3, %3, 256 \n\t" + "addi %2, %2, 256 \n\t" + + "addic. 
%1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" + "stxv 41, 128(%3) \n\t" + "stxv 40, 144(%3) \n\t" + "stxv 43, 160(%3) \n\t" + "stxv 42, 176(%3) \n\t" + "stxv 45, 192(%3) \n\t" + "stxv 44, 208(%3) \n\t" + "stxv 47, 224(%3) \n\t" + "stxv 46, 240(%3) \n\t" + + "#n=%1 x=%4=%2 y=%0=%3" + : + "=m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" + ); +} diff --git a/kernel/power/ccopy_power10.c b/kernel/power/ccopy_power10.c index a5877cd12..41c510460 100644 --- a/kernel/power/ccopy_power10.c +++ b/kernel/power/ccopy_power10.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(__VEC__) || defined(__ALTIVEC__) -#include "copy_microk_power10.c" +#include "ccopy_microk_power10.c" #endif #ifndef HAVE_KERNEL @@ -86,7 +86,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1 )) { - BLASLONG n1 = n & -64; + BLASLONG n1 = n & -32; if ( n1 > 0 ) { copy_kernel(n1, x, y); diff --git a/kernel/power/copy_microk_power10.c b/kernel/power/copy_microk_power10.c index c90dc3785..8bca1a1e7 100644 --- a/kernel/power/copy_microk_power10.c +++ b/kernel/power/copy_microk_power10.c @@ -62,38 +62,39 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "one%=: \n\t" "stxvp 32, 0(%3) \n\t" - "lxvp 32, 0(%2) \n\t" "stxvp 34, 32(%3) \n\t" - "lxvp 34, 32(%2) \n\t" "stxvp 36, 64(%3) \n\t" - "lxvp 36, 64(%2) \n\t" "stxvp 38, 96(%3) \n\t" + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" "lxvp 38, 96(%2) \n\t" "stxvp 40, 128(%3) \n\t" - "lxvp 40, 128(%2) \n\t" "stxvp 42, 160(%3) \n\t" - "lxvp 42, 160(%2) \n\t" "stxvp 44, 192(%3) \n\t" - "lxvp 44, 192(%2) \n\t" "stxvp 46, 224(%3) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" "lxvp 46, 224(%2) \n\t" "stxvp 48, 256(%3) \n\t" - "lxvp 48, 256(%2) \n\t" "stxvp 50, 288(%3) \n\t" - "lxvp 50, 288(%2) \n\t" "stxvp 52, 320(%3) \n\t" - "lxvp 52, 320(%2) \n\t" "stxvp 54, 352(%3) \n\t" + "lxvp 48, 256(%2) \n\t" + "lxvp 50, 288(%2) \n\t" + "lxvp 52, 320(%2) \n\t" "lxvp 54, 352(%2) \n\t" + "stxvp 56, 384(%3) \n\t" - "lxvp 56, 384(%2) \n\t" "stxvp 58, 416(%3) \n\t" - "lxvp 58, 416(%2) \n\t" "stxvp 60, 448(%3) \n\t" - "lxvp 60, 448(%2) \n\t" "stxvp 62, 480(%3) \n\t" + "lxvp 56, 384(%2) \n\t" + "lxvp 58, 416(%2) \n\t" + "lxvp 60, 448(%2) \n\t" "lxvp 62, 480(%2) \n\t" "addi %3, %3, 512 \n\t" diff --git a/kernel/power/dcopy_power10.c b/kernel/power/dcopy_power10.c index cd10b7136..6c5eb4d77 100644 --- a/kernel/power/dcopy_power10.c +++ b/kernel/power/dcopy_power10.c @@ -85,12 +85,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1 )) { - - BLASLONG n1 = n & -64; - if ( n1 > 0 ) + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + y[i] = x[i] ; + } + } + BLASLONG n1 = (n-i) & -64; + if ( n1 ) { - copy_kernel(n1, x, y); - i=n1; + copy_kernel(n1, &x[i], &y[i]); + i += n1; } while(i < n) diff --git a/kernel/power/scopy_power10.c b/kernel/power/scopy_power10.c index 298a8998a..3398ce827 100644 --- 
a/kernel/power/scopy_power10.c +++ b/kernel/power/scopy_power10.c @@ -86,11 +86,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1 )) { - BLASLONG n1 = n & -128; - if ( n1 > 0 ) + if ( n >= 128 ) { - copy_kernel (n1, x, y); - i=n1; + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + y[i] = x[i] ; + } + } + BLASLONG n1 = (n-i) & -128; + if ( n1 ) + { + copy_kernel(n1, &x[i], &y[i]); + i += n1; } while(i < n) From 00ce35336ee1eb1089f30d1e117a8a6a933f9654 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Dec 2020 21:28:01 +0100 Subject: [PATCH 025/681] Fix spurious removal of a trailing character from the hostarch string on x86_64 --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index 970d475d7..9c8b1abac 100644 --- a/c_check +++ b/c_check @@ -5,7 +5,7 @@ # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); -$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); +$hostarch = `uname -m | sed -e s/i.86/x86/`; $hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS"); chop($hostarch); $hostarch = "x86_64" if ($hostarch eq "amd64"); From b03dc011be97b1a841aff6aa644e51a223cb404b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Dec 2020 19:21:52 +0100 Subject: [PATCH 026/681] Fix undefined CC variable in clang check --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index 42241ae10..d20b96081 100644 --- a/f_check +++ b/f_check @@ -330,7 +330,7 @@ if ($link ne "") { $flags =~ s/\@/\,/g; $linker_L .= "-Wl,". $flags . " " ; } - if ($flags =~ /-lgomp/ && $CC =~ /clang/) { + if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) { $flags = "-lomp"; } From 0f7776af0b65134d18cdc0935b8591441741853b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Dec 2020 22:30:36 +0100 Subject: [PATCH 027/681] Add Intel Rocket Lake --- cpuid_x86.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 84c12ff43..aca37da45 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1436,6 +1436,15 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; + case 7: // Rocket Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; } break; } @@ -2014,6 +2023,19 @@ int get_coretype(void){ #endif else return CORE_NEHALEM; + case 7:// Rocket Lake +#ifndef NO_AVX512 + if(support_avx512()) + return CORE_SKYLAKEX; +#endif +#ifndef NO_AVX2 + if(support_avx2()) + return CORE_HASWELL; +#endif + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; } case 5: switch (model) { From 865676682dc0c249fc89ec5713bb9695df277ff2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Dec 2020 22:40:23 +0100 Subject: [PATCH 028/681] Add Intel Rocket Lake --- driver/others/dynamic.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 58f4d8b59..7845d6951 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -656,7 +656,7 @@ static gotoblas_t *get_coretype(void){ } } case 10: - if (model == 5 || model == 6) { + if (model == 5 || model == 6) { if(support_avx2()) return &gotoblas_HASWELL; if(support_avx()) { @@ -666,7 +666,20 @@ static gotoblas_t 
*get_coretype(void){ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } - } + } + if (model == 7) { + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } return NULL; } case 0xf: From abef2ea770ce54349195506db84a3d64f65676a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Dec 2020 11:32:27 +0100 Subject: [PATCH 029/681] Move -fma option setting to kernel/Makefile.L1 --- Makefile.x86_64 | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 00967bcb6..175db823d 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -32,12 +32,6 @@ CCOMMON_OPT += -mavx2 FCOMMON_OPT += -mavx2 endif endif -ifndef OLDGCC -ifdef HAVE_FMA3 -CCOMMON_OPT += -mfma -FCOMMON_OPT += -mfma -endif -endif ifeq ($(CORE), SKYLAKEX) ifndef DYNAMIC_ARCH From c73d8ee40ddd9c3f2cc311b7c45955a234a563c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Dec 2020 11:34:05 +0100 Subject: [PATCH 030/681] Conditionally add -mfma to compiler options where needed --- kernel/Makefile.L1 | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 7ad94118a..09337363d 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -1,3 +1,11 @@ +FMAFLAG= +ifndef OLDGCC +ifdef HAVE_FMA3 +FMAFLAG = -mfma +endif +endif + + ### AMAX ### ifndef SAMAXKERNEL @@ -828,10 +836,10 @@ $(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KE $(CC) $(CFLAGS) -DCOMPLEX -c -DXDOUBLE $< -o $@ $(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ $(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ From e40416567a1f58414a7221a0f013109b681307fc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Dec 2020 22:06:56 +0100 Subject: [PATCH 031/681] Add version printout for PGI/NVIDIA compiler --- Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile b/Makefile index 54dd3be41..de0735c4a 100644 --- a/Makefile +++ b/Makefile @@ -59,6 +59,9 @@ endif @$(CC) --version > /dev/null 2>&1;\ if [ $$? -eq 0 ]; then \ cverinfo=`$(CC) --version | sed -n '1p'`; \ + if [ -z "$${cverinfo}" ]; then \ + cverinfo=`$(CC) --version | sed -n '2p'`; \ + fi; \ echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\ else \ echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\ @@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) @$(FC) --version > /dev/null 2>&1;\ if [ $$? -eq 0 ]; then \ fverinfo=`$(FC) --version | sed -n '1p'`; \ + if [ -z "$${fverinfo}" ]; then \ + fverinfo=`$(FC) --version | sed -n '2p'`; \ + fi; \ echo " Fortran compiler ... 
$(F_COMPILER) (cmd & version : $${fverinfo})";\ else \ echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ From b212a2fb9f956b56f7a55d9019f61ffa8bb56092 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Dec 2020 22:08:37 +0100 Subject: [PATCH 032/681] Add/modify "PGI" compiler options for NVIDIA SDK 20.11 --- Makefile.system | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/Makefile.system b/Makefile.system index 5adde36d8..45d02ba5c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -181,7 +181,7 @@ endif # On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch. ifeq ($(HOSTARCH), x86_64) -ifeq ($(findstring pgcc,$(HOSTCC)),) +ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),) GETARCH_FLAGS += -march=native endif endif @@ -847,9 +847,19 @@ endif endif ifeq ($(C_COMPILER), PGI) +PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20) +PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20) +PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11) +PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11) +ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011)) +NEWPGI := 1 +endif ifdef BINARY64 ifeq ($(ARCH), x86_64) -CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm +CCOMMON_OPT += -tp p7-64 +ifneq ($(NEWPGI),1) +CCOMMON_OPT += -D__MMX__ -Mnollvm +endif else ifeq ($(ARCH), power) ifeq ($(CORE), POWER8) @@ -1040,7 +1050,7 @@ endif else FCOMMON_OPT += -tp p7 endif -FCOMMON_OPT += -Mrecursive +FCOMMON_OPT += -Mrecursive -Kieee ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -mp endif From b859b6e79dc16907c4fd614a9857cc97e66f05ff Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Dec 2020 22:09:57 +0100 Subject: [PATCH 033/681] Add nvfortran --- f_check | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/f_check b/f_check index d20b96081..e9aca4ff9 100644 --- a/f_check +++ b/f_check @@ -32,7 +32,7 @@ if ($compiler eq "") { "xlf95", "xlf90", "xlf", "ppuf77", "ppuf95", "ppuf90", "ppuxlf", "pathf90", "pathf95", - "pgf95", "pgf90", "pgf77", + "pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran", "flang", "egfortran", "ifort"); @@ -64,7 +64,6 @@ if ($compiler eq "") { if (!$?) { $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`; - if ($data =~ /zhoge_/) { $bu = "_"; } @@ -87,7 +86,7 @@ if ($compiler eq "") { if ($compiler =~ /flang/) { $vendor = FLANG; $openmp = "-fopenmp"; - } elsif ($compiler =~ /pgf/) { + } elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) { $vendor = PGI; $openmp = "-mp"; } else { @@ -123,7 +122,7 @@ if ($compiler eq "") { $openmp = "-mp"; } - if ($data =~ /PGF/) { + if ($data =~ /PGF/ || $data =~ /NVF/) { $vendor = PGI; $openmp = "-mp"; } @@ -177,7 +176,7 @@ if ($compiler eq "") { $openmp = "-mp"; } - if ($compiler =~ /pgf/) { + if ($compiler =~ /pgf/ || $compiler =~ /nvf/) { $vendor = PGI; $bu = "_"; $openmp = "-mp"; @@ -330,7 +329,7 @@ if ($link ne "") { $flags =~ s/\@/\,/g; $linker_L .= "-Wl,". $flags . 
" " ; } - if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) { + if ($flags =~ /-lgomp/ && $CC =~ /clang/) { $flags = "-lomp"; } From 005cce5507c39b70ba040cd9c44a54bef17368c3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Dec 2020 22:11:49 +0100 Subject: [PATCH 034/681] Amend SkylakeX options to support the NVIDIA compiler --- kernel/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index 4e86546b9..1a6c9413f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -36,7 +36,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE) ifeq ($(GCCVERSIONGTEQ10), 1) override CFLAGS += -march=cooperlake else - override CFLAGS += -march=skylake-avx512 + override CFLAGS += -march=skylake-avx512 -mavx512f endif ifeq ($(OSNAME), CYGWIN_NT) override CFLAGS += -fno-asynchronous-unwind-tables @@ -47,7 +47,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE) endif endif else ifeq ($(TARGET_CORE), SKYLAKEX) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 -mavx512f ifeq ($(OSNAME), CYGWIN_NT) override CFLAGS += -fno-asynchronous-unwind-tables endif From 114eb159a4b0d83a76ab837952516e7fadc21a30 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Dec 2020 22:15:58 +0100 Subject: [PATCH 035/681] Disable FMA intrinsics in the srot kernel when the compiler is PGI/NVIDIA --- kernel/x86_64/srot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c index 3de586cb8..3264d251a 100644 --- a/kernel/x86_64/srot.c +++ b/kernel/x86_64/srot.c @@ -13,7 +13,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) { BLASLONG i = 0; -#if V_SIMD && (defined(HAVE_FMA3) || V_SIMD > 128) +#if V_SIMD && !defined(C_PGI) && (defined(HAVE_FMA3) || V_SIMD > 128) const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; From 75b1f3becc236f269a332e6233f2eab35d46f683 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Dec 2020 23:17:40 +0100 Subject: [PATCH 036/681] Limit POWERPC DYNAMIC_CORE list to P8 and P9 for NVIDIA compilers --- Makefile.system | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Makefile.system b/Makefile.system index 45d02ba5c..ce3a819a8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -663,6 +663,7 @@ endif endif # ARCH zarch ifeq ($(ARCH), power) +ifneq ($(C_COMPILER), PGI) DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 ifneq ($(C_COMPILER), GCC) @@ -689,6 +690,10 @@ else $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) endif endif +else +DYNAMIC_CORE = POWER8 +DYNAMIC_CORE += POWER9 +endif endif # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty @@ -1039,12 +1044,18 @@ ifeq ($(ARCH), x86_64) FCOMMON_OPT += -tp p7-64 else ifeq ($(ARCH), power) +ifeq ($(CORE), POWER6) +$(warning NVIDIA HPC compilers do not support POWER6.) +endif ifeq ($(CORE), POWER8) FCOMMON_OPT += -tp pwr8 endif ifeq ($(CORE), POWER9) FCOMMON_OPT += -tp pwr9 endif +ifeq ($(CORE), POWER10) +$(warning NVIDIA HPC compilers do not support POWER10.) 
+endif endif endif else From 91c3f86c2bc47a8ebecbcea8af5cca6e38d5295b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Dec 2020 23:19:05 +0100 Subject: [PATCH 037/681] NVIDIA compiler does not yet support POWER10 --- Makefile.power | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.power b/Makefile.power index c7e972290..946f55232 100644 --- a/Makefile.power +++ b/Makefile.power @@ -10,9 +10,11 @@ USE_OPENMP = 1 endif ifeq ($(CORE), POWER10) +ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif +endif ifeq ($(CORE), POWER9) ifneq ($(C_COMPILER), PGI) From 17c16f2a71cf957f4a4c74050da0825f6ebe203f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Dec 2020 23:21:22 +0100 Subject: [PATCH 038/681] Implement builtin_cpu_is and limit cpu choices to P8 and P9 for NVIDIA compilers --- driver/others/dynamic_power.c | 151 ++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index a2f56d839..f9feeb6e8 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -27,7 +27,9 @@ static char *corename[] = { #define NUM_CORETYPES 4 char *gotoblas_corename(void) { +#ifndef C_PGI if (gotoblas == &gotoblas_POWER6) return corename[1]; +#endif if (gotoblas == &gotoblas_POWER8) return corename[2]; #if (!defined __GNUC__) || ( __GNUC__ >= 6) if (gotoblas == &gotoblas_POWER9) return corename[3]; @@ -38,10 +40,157 @@ char *gotoblas_corename(void) { return corename[0]; } +#ifdef C_PGI +/* + * NV HPC compilers do not yet implement __builtin_cpu_is(). + * Fake a version here for use in the CPU detection code below. + * + * Strategy here is to first check the CPU to see what it actually is, + * and then test the input to see if what the CPU actually is matches + * what was requested. + */ + +#include + +/* + * Define POWER processor version table. 
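 * Each entry below pairs a PVR (Processor Version Register) mask/value
 * with a cpu_type; the detection code reads the live PVR with mfpvr and
 * returns the first table entry where (pvr & pvr_mask) == pvr_value.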
+ * + * NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time + */ + +#define CPU_UNKNOWN 0 +#define CPU_POWER5 5 +#define CPU_POWER6 6 +#define CPU_POWER8 8 +#define CPU_POWER9 9 +#define CPU_POWER10 10 + +static struct { + uint32_t pvr_mask; + uint32_t pvr_value; + const char* cpu_name; + uint32_t cpu_type; +} pvrPOWER [] = { + + { /* POWER6 in P5+ mode; 2.04-compliant processor */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000001, + .cpu_name = "POWER5+", + .cpu_type = CPU_POWER5, + }, + + { /* Power6 aka POWER6X*/ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003e0000, + .cpu_name = "POWER6 (raw)", + .cpu_type = CPU_POWER6, + }, + + { /* Power7 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003f0000, + .cpu_name = "POWER7 (raw)", + .cpu_type = CPU_POWER6, + }, + + { /* Power7+ */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004A0000, + .cpu_name = "POWER7+ (raw)", + .cpu_type = CPU_POWER6, + }, + + { /* Power8E */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004b0000, + .cpu_name = "POWER8E (raw)", + .cpu_type = CPU_POWER8, + }, + + { /* Power8NVL */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004c0000, + .cpu_name = "POWER8NVL (raw)", + .cpu_type = CPU_POWER8, + }, + + { /* Power8 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004d0000, + .cpu_name = "POWER8 (raw)", + .cpu_type = CPU_POWER8, + }, + + { /* Power9 DD2.0 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0200, + .cpu_name = "POWER9 (raw)", + .cpu_type = CPU_POWER9, + }, + + { /* Power9 DD 2.1 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0201, + .cpu_name = "POWER9 (raw)", + .cpu_type = CPU_POWER9, + }, + + { /* Power9 DD2.2 or later */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004e0000, + .cpu_name = "POWER9 (raw)", + .cpu_type = CPU_POWER9, + }, + + { /* Power10 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00800000, + .cpu_name = "POWER10 (raw)", + .cpu_type = CPU_POWER10, + }, + + { /* End of table, pvr_mask and pvr_value must be zero */ + .pvr_mask = 0x0, + .pvr_value = 0x0, + .cpu_name = "Unknown", + .cpu_type = CPU_UNKNOWN, + }, +}; + +static int __builtin_cpu_is(const char *cpu) { + int i; + uint32_t pvr; + uint32_t cpu_type; + + asm("mfpvr %0" : "=r"(pvr)); + + for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) { + if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) { + break; + } + } + +#if defined(DEBUG) + printf("%s: returning CPU=%s, cpu_type=%p\n", __func__, + pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type); +#endif + cpu_type = pvrPOWER[i].cpu_type; + + if (!strcmp(cpu, "power8")) + return cpu_type == CPU_POWER8; + if (!strcmp(cpu, "power9")) + return cpu_type == CPU_POWER9; + return 0; +} + +#endif /* C_PGI */ + static gotoblas_t *get_coretype(void) { +#ifndef C_PGI if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) return &gotoblas_POWER6; +#endif if (__builtin_cpu_is("power8")) return &gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) @@ -77,7 +226,9 @@ static gotoblas_t *force_coretype(char * coretype) { switch (found) { +#ifndef C_PGI case 1: return (&gotoblas_POWER6); +#endif case 2: return (&gotoblas_POWER8); #if (!defined __GNUC__) || ( __GNUC__ >= 6) case 3: return (&gotoblas_POWER9); From 6f4698ee1fda9b569ed51c214dc51aed4774b21a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Dec 2020 07:41:18 +0100 Subject: [PATCH 039/681] Temporarily revert to the old nrm2 kernel --- kernel/arm64/KERNEL.NEOVERSEN1 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1 
index ea010db42..074d72153 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN1 +++ b/kernel/arm64/KERNEL.NEOVERSEN1 @@ -91,10 +91,10 @@ IDAMAXKERNEL = iamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c -SNRM2KERNEL = scnrm2_thunderx2t99.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot_thunderx2t99.c SDOTKERNEL = dot_thunderx2t99.c From 2768bc1764fe61fcebb6a0e5f906811f7460ed07 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Dec 2020 07:42:51 +0100 Subject: [PATCH 040/681] Temporarily revert to the old nrm2 kernels --- kernel/arm64/KERNEL.THUNDERX2T99 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99 index a20d0d4a6..8333f60e6 100644 --- a/kernel/arm64/KERNEL.THUNDERX2T99 +++ b/kernel/arm64/KERNEL.THUNDERX2T99 @@ -153,12 +153,12 @@ IDAMAXKERNEL = iamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c -SNRM2KERNEL = scnrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c +SNRM2KERNEL = nrm2.S +CNRM2KERNEL = nrm2.S #DNRM2KERNEL = dznrm2_thunderx2t99_fast.c #ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c +DNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot_thunderx2t99.c From 8631e2976a01d074b207db0c58618c01c9998d35 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Dec 2020 07:45:13 +0100 Subject: [PATCH 041/681] Temporarily revert to the old nrm2 kernels --- kernel/arm64/KERNEL.THUNDERX3T110 | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110 index a20d0d4a6..4cdd8769f 100644 --- a/kernel/arm64/KERNEL.THUNDERX3T110 +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -153,13 +153,16 @@ IDAMAXKERNEL = iamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c -SNRM2KERNEL = scnrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c -#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c - +#SNRM2KERNEL = scnrm2_thunderx2t99.c +#CNRM2KERNEL = scnrm2_thunderx2t99.c +##DNRM2KERNEL = dznrm2_thunderx2t99_fast.c +##ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c +#DNRM2KERNEL = dznrm2_thunderx2t99.c +#ZNRM2KERNEL = dznrm2_thunderx2t99.c +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot_thunderx2t99.c SDOTKERNEL = dot_thunderx2t99.c From 9a38592c79ee4e4b3a38e18092e880e4e92481c7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 27 Dec 2020 21:55:08 +0100 Subject: [PATCH 042/681] Add pointers to the netlib documentation and Gilbert Strang's linear algebra primers --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6c6322c32..fed3936ee 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,14 @@ Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/sta ## Introduction -OpenBLAS is an optimized Basic Linear Algebra Subprograms library based on GotoBLAS2 1.13 BSD version. +OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version. 
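As a minimal sketch of what calling a BLAS routine through the CBLAS
interface looks like (assuming cblas.h is on the include path and you link
with -lopenblas), here is a double-precision dot product:

```c
#include <cblas.h>
#include <stdio.h>

int main(void) {
    double x[3] = {1.0, 2.0, 3.0}, y[3] = {4.0, 5.0, 6.0};
    /* ddot returns sum(x[i] * y[i]) = 1*4 + 2*5 + 3*6 = 32 */
    printf("%f\n", cblas_ddot(3, x, 1, y, 1));
    return 0;
}
```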
Please read the documentation on the OpenBLAS wiki pages: . +For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib: +. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six +20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare or Youtube may be helpful. + ## Binary Packages We provide official binary packages for the following platform: From 0a535e58d857cb3b6d2cd73db7b4197c64c82836 Mon Sep 17 00:00:00 2001 From: Aurelien Jarno Date: Tue, 29 Dec 2020 12:06:39 +0000 Subject: [PATCH 043/681] getarch.c: define OPENBLAS_SUPPORTED for riscv64 --- getarch.c | 1 + 1 file changed, 1 insertion(+) diff --git a/getarch.c b/getarch.c index 29671736e..f48944f36 100644 --- a/getarch.c +++ b/getarch.c @@ -1375,6 +1375,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef __riscv #include "cpuid_riscv64.c" +#define OPENBLAS_SUPPORTED #endif #ifdef __arm__ From 1b2508362b9033468eb98ea4146e31ab50d14fa3 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 1 Jan 2021 02:09:40 -0800 Subject: [PATCH 044/681] arm64: Fix nrm2 for input vectors with Inf Fix double precision nrm2 kernels returning NaN when the input vectors contain Inf/-Inf. --- kernel/arm64/KERNEL.NEOVERSEN1 | 8 ++++---- kernel/arm64/KERNEL.THUNDERX2T99 | 8 ++++---- kernel/arm64/KERNEL.THUNDERX3T110 | 17 +++++++---------- kernel/arm64/dznrm2_thunderx2t99.c | 28 +++++++++++++++++++++++++++- 4 files changed, 42 insertions(+), 19 deletions(-) diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1 index 074d72153..ea010db42 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN1 +++ b/kernel/arm64/KERNEL.NEOVERSEN1 @@ -91,10 +91,10 @@ IDAMAXKERNEL = iamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c DDOTKERNEL = dot_thunderx2t99.c SDOTKERNEL = dot_thunderx2t99.c diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99 index 8333f60e6..a20d0d4a6 100644 --- a/kernel/arm64/KERNEL.THUNDERX2T99 +++ b/kernel/arm64/KERNEL.THUNDERX2T99 @@ -153,12 +153,12 @@ IDAMAXKERNEL = iamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c -SNRM2KERNEL = nrm2.S -CNRM2KERNEL = nrm2.S +SNRM2KERNEL = scnrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c #DNRM2KERNEL = dznrm2_thunderx2t99_fast.c #ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c -DNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S +DNRM2KERNEL = dznrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c DDOTKERNEL = dot_thunderx2t99.c diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110 index 4cdd8769f..a20d0d4a6 100644 --- a/kernel/arm64/KERNEL.THUNDERX3T110 +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -153,16 +153,13 @@ IDAMAXKERNEL = iamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c -#SNRM2KERNEL = scnrm2_thunderx2t99.c -#CNRM2KERNEL = scnrm2_thunderx2t99.c -##DNRM2KERNEL = dznrm2_thunderx2t99_fast.c -##ZNRM2KERNEL = 
dznrm2_thunderx2t99_fast.c -#DNRM2KERNEL = dznrm2_thunderx2t99.c -#ZNRM2KERNEL = dznrm2_thunderx2t99.c -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S +SNRM2KERNEL = scnrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c +#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + DDOTKERNEL = dot_thunderx2t99.c SDOTKERNEL = dot_thunderx2t99.c diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index b94f0cffc..b021a2832 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -58,6 +58,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n #define CUR_MAXINV "d8" #define CUR_MAXINV_V "v8.2d" #define CUR_MAX_V "v8.2d" +#define REGINF "d9" static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, double *ssq, double *scale) @@ -79,8 +80,10 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ble 9f //nrm2_kernel_L999 \n" "1: //nrm2_kernel_F_BEGIN: \n" + " mov x6, #0x7FF0000000000000 //+Infinity \n" " fmov "REGZERO", xzr \n" " fmov "REGONE", #1.0 \n" + " fmov "REGINF", x6 \n" " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" " mov "J", "N" \n" " cmp "J", xzr \n" @@ -104,6 +107,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d4, ["X"] \n" " fabs d4, d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -116,6 +121,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d3, ["X", #8] \n" " fabs d3, d3 \n" " fmax "CUR_MAX", "SCALE", d3 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -158,6 +165,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " fmaxp v24.2d, v24.2d, v26.2d \n" " fmaxp v24.2d, v24.2d, v24.2d \n" " fmax "CUR_MAX", "SCALE", d24 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" " //dup "CUR_MAX_V", v7.d[0] \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" @@ -217,6 +226,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " fmaxp v24.2d, v24.2d, v26.2d \n" " fmaxp v24.2d, v24.2d, v24.2d \n" " fmax "CUR_MAX", "SCALE", d24 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" " //dup "CUR_MAX_V", v7.d[0] \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" @@ -265,6 +276,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d4, ["X"] \n" " fabs d4, d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -276,6 +289,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d3, ["X", #8] \n" " fabs d3, d3 \n" " fmax "CUR_MAX", "SCALE", d3 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -291,6 +306,11 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, "9: //nrm2_kernel_L999: \n" " str "SSQ", [%[SSQ_]] \n" " str "SCALE", [%[SCALE_]] \n" + " b 11f \n" + "10: \n" + " str "REGINF", [%[SSQ_]] \n" + " str "REGINF", 
[%[SCALE_]] \n" + "11: \n" : : [SSQ_] "r" (ssq), //%0 @@ -300,7 +320,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, [INCX_] "r" (inc_x) //%4 : "cc", "memory", - "x0", "x1", "x2", "x3", "x4", "x5", + "x0", "x1", "x2", "x3", "x4", "x5", "x6", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8" ); @@ -359,6 +379,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) cur_ssq = *ptr; cur_scale = *(ptr + 1); + if (cur_ssq == INFINITY) { + ssq = INFINITY; + scale = INFINITY; + break; + } + if (cur_scale != 0) { if (cur_scale > scale) { scale = (scale / cur_scale); From 7aa1ff8ff6d3f151292eeb86c629e4077b867ae0 Mon Sep 17 00:00:00 2001 From: pkubaj Date: Fri, 1 Jan 2021 21:19:57 +0000 Subject: [PATCH 045/681] Fix build on FreeBSD/powerpc64le --- Makefile.system | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.system b/Makefile.system index ce3a819a8..ca0879fe6 100644 --- a/Makefile.system +++ b/Makefile.system @@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64) override ARCH=x86_64 else ifeq ($(ARCH), powerpc64) override ARCH=power +else ifeq ($(ARCH), powerpc64le) +override ARCH=power else ifeq ($(ARCH), powerpc) override ARCH=power else ifeq ($(ARCH), i386) From 601b711c78a4a652820edacc16c6791a7f120c7d Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 8 Jan 2021 08:01:36 -0600 Subject: [PATCH 046/681] Optimize swap function for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. --- kernel/power/cswap.c | 4 +- kernel/power/cswap_microk_power10.c | 127 ++++++++++++++++++++++++++++ kernel/power/dswap.c | 22 ++++- kernel/power/sswap.c | 22 ++++- kernel/power/swap_microk_power10.c | 105 +++++++++++++++++++++++ kernel/power/zswap.c | 4 +- 6 files changed, 280 insertions(+), 4 deletions(-) create mode 100644 kernel/power/cswap_microk_power10.c create mode 100644 kernel/power/swap_microk_power10.c diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index 5144a2e93..4d9b9ccd6 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "cswap_microk_power8.c" +#elif defined(POWER10) +#include "cswap_microk_power10.c" #endif #endif diff --git a/kernel/power/cswap_microk_power10.c b/kernel/power/cswap_microk_power10.c new file mode 100644 index 000000000..2a44a9e30 --- /dev/null +++ b/kernel/power/cswap_microk_power10.c @@ -0,0 +1,127 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#if defined(DOUBLE) +#define HAVE_KERNEL_16 1 +static void zswap_kernel_16 (long n, double *x, double *y) +#else +#define HAVE_KERNEL_32 1 +static void cswap_kernel_32 (long n, float *x, float *y) +#endif +{ + __asm__ + ( + ".align 5 \n" + "one%=: \n\t" + "lxvp 32, 0(%4) \n\t" + "lxvp 34, 32(%4) \n\t" + "lxvp 36, 64(%4) \n\t" + "lxvp 38, 96(%4) \n\t" + + "lxvp 40, 128(%4) \n\t" + "lxvp 42, 160(%4) \n\t" + "lxvp 44, 192(%4) \n\t" + "lxvp 46, 224(%4) \n\t" + + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "lxvp 56, 128(%3) \n\t" + "lxvp 58, 160(%3) \n\t" + "lxvp 60, 192(%3) \n\t" + "lxvp 62, 224(%3) \n\t" + + + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" + + "addi %3, %3, 128 \n\t" + + "stxv 41, 0(%3) \n\t" + "stxv 40, 16(%3) \n\t" + "stxv 43, 32(%3) \n\t" + "stxv 42, 48(%3) \n\t" + "stxv 45, 64(%3) \n\t" + "stxv 44, 80(%3) \n\t" + "stxv 47, 96(%3) \n\t" + "stxv 46, 112(%3) \n\t" + + "addi %3, %3, 128 \n\t" + + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 53, 64(%4) \n\t" + "stxv 52, 80(%4) \n\t" + "stxv 55, 96(%4) \n\t" + "stxv 54, 112(%4) \n\t" + + "addi %4, %4, 128 \n\t" + + "stxv 57, 0(%4) \n\t" + "stxv 56, 16(%4) \n\t" + "stxv 59, 32(%4) \n\t" + "stxv 58, 48(%4) \n\t" + "stxv 61, 64(%4) \n\t" + "stxv 60, 80(%4) \n\t" + "stxv 63, 96(%4) \n\t" + "stxv 62, 112(%4) \n\t" + + "addi %4, %4, 128 \n\t" + +#if defined(DOUBLE) + "addic. %2, %2, -16 \n\t" +#else + "addic. %2, %2, -32 \n\t" +#endif + "bgt one%= \n" + + "#n=%2 x=%0=%3 y=%1=%4" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index ff3f95c79..9e6229c6a 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
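/*
 * The POWER10 path added below peels up to three leading elements so that
 * y is 32-byte aligned before entering the lxvp/stxvp kernel: with 8-byte
 * doubles, ((32 - ((uintptr_t)y & 0x1F)) >> 3) & 0x3 gives, for example,
 * (32 - 8) / 8 = 3 scalar swaps when y % 32 == 8.
 */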
#include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "dswap_microk_power8.c" +#elif defined(POWER10) +#include "swap_microk_power10.c" #endif #endif @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } + BLASLONG n1 = (n-i) & -32; + if ( n1 > 0 ) + { + dswap_kernel_32(n1,&x[i], &y[i]); + i+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { dswap_kernel_32(n1, x, y); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 44522f0a0..dd249fd36 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "sswap_microk_power8.c" +#elif defined(POWER10) +#include "swap_microk_power10.c" #endif #endif @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { +#if defined(POWER10) + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } + BLASLONG n1 = (n-i) & -64; + if ( n1 > 0 ) + { + sswap_kernel_32(n1,&x[i], &y[i]); + i+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { sswap_kernel_32(n1, x, y); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/swap_microk_power10.c b/kernel/power/swap_microk_power10.c new file mode 100644 index 000000000..f9c1fee52 --- /dev/null +++ b/kernel/power/swap_microk_power10.c @@ -0,0 +1,105 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define HAVE_KERNEL_32 1 + +#if defined(DOUBLE) +static void dswap_kernel_32 (long n, double *x, double *y) +#else +static void sswap_kernel_32 (long n, float *x, float *y) +#endif +{ + __asm__ + ( + ".align 5 \n" + "one%=: \n\t" + + "lxvp 32, 0(%4) \n\t" + "lxvp 34, 32(%4) \n\t" + "lxvp 36, 64(%4) \n\t" + "lxvp 38, 96(%4) \n\t" + + "lxvp 40, 128(%4) \n\t" + "lxvp 42, 160(%4) \n\t" + "lxvp 44, 192(%4) \n\t" + "lxvp 46, 224(%4) \n\t" + + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "lxvp 56, 128(%3) \n\t" + "lxvp 58, 160(%3) \n\t" + "lxvp 60, 192(%3) \n\t" + "lxvp 62, 224(%3) \n\t" + + "stxvp 32, 0(%3) \n\t" + "stxvp 34, 32(%3) \n\t" + "stxvp 36, 64(%3) \n\t" + "stxvp 38, 96(%3) \n\t" + + "stxvp 40, 128(%3) \n\t" + "stxvp 42, 160(%3) \n\t" + "stxvp 44, 192(%3) \n\t" + "stxvp 46, 224(%3) \n\t" + + "stxvp 48, 0(%4) \n\t" + "stxvp 50, 32(%4) \n\t" + "stxvp 52, 64(%4) \n\t" + "stxvp 54, 96(%4) \n\t" + + "stxvp 56, 128(%4) \n\t" + "stxvp 58, 160(%4) \n\t" + "stxvp 60, 192(%4) \n\t" + "stxvp 62, 224(%4) \n\t" + + "addi %4, %4, 256 \n\t" + "addi %3, %3, 256 \n\t" + +#if defined(DOUBLE) + "addic. %2, %2, -32 \n\t" +#else + "addic. %2, %2, -64 \n\t" +#endif + "bgt one%= \n" + + "#n=%2 x=%0=%3 y=%1=%4" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 3a5a8eb83..6cd3d9664 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
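/*
 * zswap reuses cswap_microk_power10.c, which compiles to zswap_kernel_16
 * when DOUBLE is defined and to cswap_kernel_32 otherwise, so a single
 * source file covers both complex precisions.
 */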
#include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "zswap_microk_power8.c" +#elif defined(POWER10) +#include "cswap_microk_power10.c" #endif #endif From b0beb0b1ca6469286dd69cdbeeb2c79d96ac66d0 Mon Sep 17 00:00:00 2001 From: "Chen, Guobing" Date: Mon, 11 Jan 2021 02:15:21 +0800 Subject: [PATCH 047/681] Initial code for Cooperlake BF16 GEMM kernel --- .../x86_64/sbgemm_block_microk_cooperlake.c | 426 ++++++++++++ .../sbgemm_microk_cooperlake_template.c | 625 ++++++++++++++++++ 2 files changed, 1051 insertions(+) create mode 100644 kernel/x86_64/sbgemm_block_microk_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_microk_cooperlake_template.c diff --git a/kernel/x86_64/sbgemm_block_microk_cooperlake.c b/kernel/x86_64/sbgemm_block_microk_cooperlake.c new file mode 100644 index 000000000..2376fed02 --- /dev/null +++ b/kernel/x86_64/sbgemm_block_microk_cooperlake.c @@ -0,0 +1,426 @@ +#include "sbgemm.h" + +#include +// Walk around those intrinsics that missed by compiler +#define MM256_LOADU_EPI16(addr) \ + _mm256_maskz_loadu_epi16(~0, (addr)) +#define MM256_STOREU_EPI16(addr, reg) \ + _mm256_mask_storeu_epi16((addr), ~0, (reg)) + +#include +void print_block(BLASLONG m, BLASLONG n, bfloat16 * mat) +{ + printf("---- BLOCK %ld x %ld ----\n", m, n); + for (BLASLONG i=0; i> (32-m)); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + + __m512i array512_0, array512_1, array512_2, array512_3; + + BLASLONG idx_src_base0, idx_src_base1; + BLASLONG idx_target_base0, idx_target_base1; + + BLASLONG LDA_2x = 2*lda; + BLASLONG BF16_BLOCK_T_M_2x = 2*32; + idx_src_base0 = 0; + idx_src_base1 = lda; + idx_target_base0 = 0; + idx_target_base1 = 32; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base1]); + array512_2 = _mm512_unpacklo_epi16(array512_0, array512_1); + array512_3 = _mm512_unpackhi_epi16(array512_0, array512_1); + _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); + _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); + + idx_src_base0 += LDA_2x; + idx_src_base1 += LDA_2x; + idx_target_base0 += BF16_BLOCK_T_M_2x; + idx_target_base1 += BF16_BLOCK_T_M_2x; + } + + if (tag_k_2x != k) { + __m512i ZERO512 = _mm512_setzero_si512(); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); + array512_2 = _mm512_unpacklo_epi16(array512_0, ZERO512); + array512_3 = _mm512_unpackhi_epi16(array512_0, ZERO512); + _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); + _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); + } + +#ifdef DEBUG_PROFILE + print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); +#endif +} + +void COL_MAJOR_INCOPY_KERNEL_Kx16(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + BLASLONG tag_k_2x = k & (~1); + + __m256i array256_0, array256_1, array256_2, array256_3; + + BLASLONG idx_src_base0, idx_src_base1; + BLASLONG idx_target_base0; + + BLASLONG LDA_2x = 2*lda; + idx_src_base0 = 0; + idx_src_base1 = lda; + idx_target_base0 = 0; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array256_0 = MM256_LOADU_EPI16(&A[idx_src_base0]); + array256_1 = MM256_LOADU_EPI16(&A[idx_src_base1]); + array256_2 = _mm256_unpacklo_epi16(array256_0, array256_1); + array256_3 = _mm256_unpackhi_epi16(array256_0, array256_1); + // Store in one row of 
block_B + MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); + MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + + idx_src_base0 += LDA_2x; + idx_src_base1 += LDA_2x; + idx_target_base0 += 32; + } + + if (tag_k_2x != k) { + __m256i ZERO256 = _mm256_setzero_si256(); + array256_0 = MM256_LOADU_EPI16(&A[idx_src_base0]); + array256_2 = _mm256_unpacklo_epi16(array256_0, ZERO256); + array256_3 = _mm256_unpackhi_epi16(array256_0, ZERO256); + // Store in one row of block_B + MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); + MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + } + +#ifdef DEBUG_PROFILE + print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); +#endif +} + +void COL_MAJOR_INCOPY_KERNEL_Kx16m(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + BLASLONG tag_k_2x = k & (~1); + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + + __m256i array256_0, array256_1, array256_2, array256_3; + + BLASLONG idx_src_base0, idx_src_base1; + BLASLONG idx_target_base0; + + BLASLONG LDA_2x = 2*lda; + idx_src_base0 = 0; + idx_src_base1 = lda; + idx_target_base0 = 0; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array256_0 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); + array256_1 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base1]); + array256_2 = _mm256_unpacklo_epi16(array256_0, array256_1); + array256_3 = _mm256_unpackhi_epi16(array256_0, array256_1); + // Store in one row of block_B + MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); + MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + + idx_src_base0 += LDA_2x; + idx_src_base1 += LDA_2x; + idx_target_base0 += 32; + } + + if (tag_k_2x != k) { + __m256i ZERO256 = _mm256_setzero_si256(); + array256_0 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); + array256_2 = _mm256_unpacklo_epi16(array256_0, ZERO256); + array256_3 = _mm256_unpackhi_epi16(array256_0, ZERO256); + // Store in one row of block_B + MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); + MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + } + +#ifdef DEBUG_PROFILE + print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); +#endif +} + +void COL_MAJOR_ONCOPY_KERNEL_8x32(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_32x = k & (~31); + BLASLONG idx_src_base0, idx_src_base1, idx_src_base2, idx_src_base3, idx_src_base4, idx_src_base5, idx_src_base6, idx_src_base7; + BLASLONG idx_target_base0; + + idx_src_base0 = 0; + idx_src_base1 = 1*ldb; + idx_src_base2 = 2*ldb; + idx_src_base3 = 3*ldb; + idx_src_base4 = 4*ldb; + idx_src_base5 = 5*ldb; + idx_src_base6 = 6*ldb; + idx_src_base7 = 7*ldb; + idx_target_base0 = 0; + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_loadu_si512(&B[idx_src_base0+idx_k])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_loadu_si512(&B[idx_src_base1+idx_k])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*2], _mm512_loadu_si512(&B[idx_src_base2+idx_k])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*3], _mm512_loadu_si512(&B[idx_src_base3+idx_k])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*4], _mm512_loadu_si512(&B[idx_src_base4+idx_k])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*5], _mm512_loadu_si512(&B[idx_src_base5+idx_k])); + 
_mm512_storeu_si512(&block_B[idx_target_base0+ 32*6], _mm512_loadu_si512(&B[idx_src_base6+idx_k])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*7], _mm512_loadu_si512(&B[idx_src_base7+idx_k])); + idx_target_base0 += 32*8; + } + + if (tag_k_32x != k) { + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0+tag_k_32x])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base1+tag_k_32x])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*2], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base2+tag_k_32x])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*3], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base3+tag_k_32x])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*4], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base4+tag_k_32x])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*5], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base5+tag_k_32x])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*6], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base6+tag_k_32x])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*7], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base7+tag_k_32x])); + } + +#ifdef DEBUG_PROFILE + print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B); +#endif +} + +void COL_MAJOR_ONCOPY_KERNEL_Nx32(BLASLONG n, BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_32x = k & (~31); + BLASLONG tag_n_2x = n & (~1); + BLASLONG idx_src_base0; + BLASLONG idx_target_base0; + + BLASLONG LDB_2x = 2*ldb; + + idx_target_base0 = 0; + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + idx_src_base0 = 0; + for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_loadu_si512(&B[idx_src_base0 + idx_k])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_loadu_si512(&B[idx_src_base0 + ldb + idx_k])); + idx_src_base0 += LDB_2x; + idx_target_base0 += 64; + } + + if (tag_n_2x != n) { + _mm512_storeu_si512(&block_B[idx_target_base0], _mm512_loadu_si512(&B[idx_src_base0 + idx_k])); + idx_target_base0 += 32; + } + } + + if (tag_k_32x != k) { + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + idx_src_base0 = 0; + for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + tag_k_32x])); + _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + ldb + tag_k_32x])); + idx_src_base0 += LDB_2x; + idx_target_base0 += 64; + } + + if (tag_n_2x != n) { + _mm512_storeu_si512(&block_B[idx_target_base0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + tag_k_32x])); + } + } + +#ifdef DEBUG_PROFILE + print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B); +#endif +} + +// Scale matrix C while beta is not ZERO or ONE +void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc) +{ + BLASLONG tag_n_Nx = N & (~3); + BLASLONG tag_n_Mx = M & (~15); + + BLASLONG LDC4x = ldc*4; + BLASLONG idx_base_0 = 0; + BLASLONG idx_base_1 = ldc; + 
BLASLONG idx_base_2 = ldc*2; + BLASLONG idx_base_3 = ldc*3; + + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + + __m512 array_512_0, array_512_1, array_512_2, array_512_3; + + __m512 BETAVECTOR = _mm512_set1_ps(beta); + + if (Order == CblasColMajor) { + for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + array_512_0 = _mm512_loadu_ps(&C[idx_base_0+idx_m]); + array_512_1 = _mm512_loadu_ps(&C[idx_base_1+idx_m]); + array_512_2 = _mm512_loadu_ps(&C[idx_base_2+idx_m]); + array_512_3 = _mm512_loadu_ps(&C[idx_base_3+idx_m]); + + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); + array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); + array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); + + _mm512_storeu_ps(&C[idx_base_0+idx_m], array_512_0); + _mm512_storeu_ps(&C[idx_base_1+idx_m], array_512_1); + _mm512_storeu_ps(&C[idx_base_2+idx_m], array_512_2); + _mm512_storeu_ps(&C[idx_base_3+idx_m], array_512_3); + } + + if (tag_n_Mx != M) { + array_512_0 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_0+tag_n_Mx]); + array_512_1 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_1+tag_n_Mx]); + array_512_2 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_2+tag_n_Mx]); + array_512_3 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_3+tag_n_Mx]); + + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); + array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); + array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); + + _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, array_512_0); + _mm512_mask_storeu_ps(&C[idx_base_1+tag_n_Mx], tail_mask, array_512_1); + _mm512_mask_storeu_ps(&C[idx_base_2+tag_n_Mx], tail_mask, array_512_2); + _mm512_mask_storeu_ps(&C[idx_base_3+tag_n_Mx], tail_mask, array_512_3); + } + + idx_base_0 += LDC4x; + idx_base_1 += LDC4x; + idx_base_2 += LDC4x; + idx_base_3 += LDC4x; + } + + if (tag_n_Nx != N) { + for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + array_512_0 = _mm512_loadu_ps(&C[idx_base_0+idx_m]); + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + _mm512_storeu_ps(&C[idx_base_0+idx_m], array_512_0); + } + + if (tag_n_Mx != M) { + array_512_0 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_0+tag_n_Mx]); + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, array_512_0); + } + idx_base_0 += ldc; + } + } + } else { + + } +} + +// Scale matrix C while beta is not ZERO or ONE +void sbgemm_zero_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, float *C, OPENBLAS_CONST blasint ldc) +{ + BLASLONG tag_n_Nx = N & (~3); + BLASLONG tag_n_Mx = M & (~15); + + BLASLONG LDC4x = ldc*4; + BLASLONG idx_base_0 = 0; + BLASLONG idx_base_1 = ldc; + BLASLONG idx_base_2 = ldc*2; + BLASLONG idx_base_3 = ldc*3; + + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + + __m512 ZEROVECTOR = _mm512_setzero_ps(); + + if (Order == CblasColMajor) { + for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + _mm512_storeu_ps(&C[idx_base_0+idx_m], ZEROVECTOR); + _mm512_storeu_ps(&C[idx_base_1+idx_m], ZEROVECTOR); + 
_mm512_storeu_ps(&C[idx_base_2+idx_m], ZEROVECTOR); + _mm512_storeu_ps(&C[idx_base_3+idx_m], ZEROVECTOR); + } + + if (tag_n_Mx != M) { + _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(&C[idx_base_1+tag_n_Mx], tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(&C[idx_base_2+tag_n_Mx], tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(&C[idx_base_3+tag_n_Mx], tail_mask, ZEROVECTOR); + } + + idx_base_0 += LDC4x; + idx_base_1 += LDC4x; + idx_base_2 += LDC4x; + idx_base_3 += LDC4x; + } + + if (tag_n_Nx != N) { + for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + _mm512_storeu_ps(&C[idx_base_0+idx_m], ZEROVECTOR); + } + + if (tag_n_Mx != M) { + _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, ZEROVECTOR); + } + idx_base_0 += ldc; + } + } + } else { + + } +} \ No newline at end of file diff --git a/kernel/x86_64/sbgemm_microk_cooperlake_template.c b/kernel/x86_64/sbgemm_microk_cooperlake_template.c new file mode 100644 index 000000000..dd4cb440b --- /dev/null +++ b/kernel/x86_64/sbgemm_microk_cooperlake_template.c @@ -0,0 +1,625 @@ +#include "sbgemm.h" +#include "bf16_common_macros.h" +#include + +#undef STORE16_COMPLETE_RESULT +#undef STORE16_MASK_COMPLETE_RESULT +#undef SBGEMM_BLOCK_KERNEL_32x8x32 +#undef SBGEMM_BLOCK_KERNEL_16x8x32 +#undef SBGEMM_BLOCK_KERNEL_32xNx32 +#undef SBGEMM_BLOCK_KERNEL_16xNx32 +#undef SBGEMM_BLOCKING_KERNEL_2 + +#ifndef ONE_ALPHA // ALPHA is not ONE + #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE + #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE + #define SBGEMM_BLOCK_KERNEL_32x8x32 sbgemm_block_kernel_32x8x32_alpha + #define SBGEMM_BLOCK_KERNEL_16x8x32 sbgemm_block_kernel_16x8x32_alpha + #define SBGEMM_BLOCK_KERNEL_32xNx32 sbgemm_block_kernel_32xNx32_alpha + #define SBGEMM_BLOCK_KERNEL_16xNx32 sbgemm_block_kernel_16xNx32_alpha + #define SBGEMM_BLOCKING_KERNEL_2 sbgemm_blocking_kernel_2_alpha +#else // ALPHA is ONE + #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ONE_ONE + #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ONE_ONE + #define SBGEMM_BLOCK_KERNEL_32x8x32 sbgemm_block_kernel_32x8x32_one + #define SBGEMM_BLOCK_KERNEL_16x8x32 sbgemm_block_kernel_16x8x32_one + #define SBGEMM_BLOCK_KERNEL_32xNx32 sbgemm_block_kernel_32xNx32_one + #define SBGEMM_BLOCK_KERNEL_16xNx32 sbgemm_block_kernel_16xNx32_one + #define SBGEMM_BLOCKING_KERNEL_2 sbgemm_blocking_kernel_2_one +#endif + + +// SBGEMM Kernel for 16> (32-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base0, result_512_8); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base1, result_512_8); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base0, result_512_9); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base1, result_512_9); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*0])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*0+16]), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*1])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*1+16]), tail_mask) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base0, result_512_10); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base1, result_512_10); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base0, result_512_11); + 
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base1, result_512_11); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*2])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*2+16]), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*3])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*3+16]), tail_mask) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base0, result_512_12); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base1, result_512_12); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base0, result_512_13); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base1, result_512_13); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*4])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*4+16]), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*5])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*5+16]), tail_mask) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base0, result_512_14); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base1, result_512_14); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base0, result_512_15); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base1, result_512_15); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*6])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*6+16]), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*7])) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*7+16]), tail_mask) + } else { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base0, result_512_8); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base1, result_512_8); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base0, result_512_9); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base1, result_512_9); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*0])) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*0+16])) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*1])) + STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*1+16])) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base0, result_512_10); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base1, result_512_10); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base0, result_512_11); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base1, result_512_11); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*2])) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*2+16])) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*3])) + STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*3+16])) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base0, result_512_12); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base1, result_512_12); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base0, result_512_13); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base1, result_512_13); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*4])) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*4+16])) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*5])) + STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*5+16])) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base0, result_512_14); + 
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base1, result_512_14); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base0, result_512_15); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base1, result_512_15); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*6])) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*6+16])) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*7])) + STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*7+16])) + } +} + +// SBGEMM Kernel for M<=16, N=8, K can be any number, but the processing will take 32 as a base +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_16x8x32_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_16x8x32_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + int SHUFFLE_MAGIC_NO = 0x39; + BLASLONG tag_k_32x = k & (~31); + BLASLONG idxB_base = 0; + BLASLONG width = 32; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7; + __m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7; + + result_512_0 = _mm512_setzero_ps(); + result_512_1 = _mm512_setzero_ps(); + result_512_2 = _mm512_setzero_ps(); + result_512_3 = _mm512_setzero_ps(); + result_512_4 = _mm512_setzero_ps(); + result_512_5 = _mm512_setzero_ps(); + result_512_6 = _mm512_setzero_ps(); + result_512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) { + // Load B with unroll 8 + idxB_base = idx_k << 3; + arrayB_512_0 = _mm512_loadu_si512(&B[idxB_base + 32*0]); + arrayB_512_1 = _mm512_loadu_si512(&B[idxB_base + 32*1]); + arrayB_512_2 = _mm512_loadu_si512(&B[idxB_base + 32*2]); + arrayB_512_3 = _mm512_loadu_si512(&B[idxB_base + 32*3]); + arrayB_512_4 = _mm512_loadu_si512(&B[idxB_base + 32*4]); + arrayB_512_5 = _mm512_loadu_si512(&B[idxB_base + 32*5]); + arrayB_512_6 = _mm512_loadu_si512(&B[idxB_base + 32*6]); + arrayB_512_7 = _mm512_loadu_si512(&B[idxB_base + 32*7]); + + if (idx_k == tag_k_32x) {width = k - tag_k_32x;} + + for (BLASLONG idx = 0; idx < width;) { + // Each two rows are a group for 32-pair bf16 elements + // Load two rows into a 512 register + arrayA_512_0 = _mm512_loadu_si512(&A[idx<<4]); + + result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_0))); + result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_1))); + result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_2))); + result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_3))); + result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_4))); + result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_5))); + result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_6))); 
+ result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_7))); + + arrayB_512_0 = _mm512_shuffle_epi32(arrayB_512_0, SHUFFLE_MAGIC_NO); + arrayB_512_1 = _mm512_shuffle_epi32(arrayB_512_1, SHUFFLE_MAGIC_NO); + arrayB_512_2 = _mm512_shuffle_epi32(arrayB_512_2, SHUFFLE_MAGIC_NO); + arrayB_512_3 = _mm512_shuffle_epi32(arrayB_512_3, SHUFFLE_MAGIC_NO); + arrayB_512_4 = _mm512_shuffle_epi32(arrayB_512_4, SHUFFLE_MAGIC_NO); + arrayB_512_5 = _mm512_shuffle_epi32(arrayB_512_5, SHUFFLE_MAGIC_NO); + arrayB_512_6 = _mm512_shuffle_epi32(arrayB_512_6, SHUFFLE_MAGIC_NO); + arrayB_512_7 = _mm512_shuffle_epi32(arrayB_512_7, SHUFFLE_MAGIC_NO); + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + arrayB_512_0 = _mm512_shuffle_i32x4(arrayB_512_0, arrayB_512_0, SHUFFLE_MAGIC_NO); + arrayB_512_1 = _mm512_shuffle_i32x4(arrayB_512_1, arrayB_512_1, SHUFFLE_MAGIC_NO); + arrayB_512_2 = _mm512_shuffle_i32x4(arrayB_512_2, arrayB_512_2, SHUFFLE_MAGIC_NO); + arrayB_512_3 = _mm512_shuffle_i32x4(arrayB_512_3, arrayB_512_3, SHUFFLE_MAGIC_NO); + arrayB_512_4 = _mm512_shuffle_i32x4(arrayB_512_4, arrayB_512_4, SHUFFLE_MAGIC_NO); + arrayB_512_5 = _mm512_shuffle_i32x4(arrayB_512_5, arrayB_512_5, SHUFFLE_MAGIC_NO); + arrayB_512_6 = _mm512_shuffle_i32x4(arrayB_512_6, arrayB_512_6, SHUFFLE_MAGIC_NO); + arrayB_512_7 = _mm512_shuffle_i32x4(arrayB_512_7, arrayB_512_7, SHUFFLE_MAGIC_NO); + } + } + } + + if (m != 16) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + + result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8); + result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); + result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); + result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); + STORE16_MASK_COMPLETE_RESULT(result_512_0, (&C[ldc*0]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_1, (&C[ldc*1]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_2, (&C[ldc*2]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_3, (&C[ldc*3]), tail_mask) + result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); + result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); + result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); + result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); + STORE16_MASK_COMPLETE_RESULT(result_512_4, (&C[ldc*4]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_5, (&C[ldc*5]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_6, (&C[ldc*6]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_7, (&C[ldc*7]), tail_mask) + } else { + result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8); + result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); + result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); + result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); + STORE16_COMPLETE_RESULT(result_512_0, (&C[ldc*0])) + STORE16_COMPLETE_RESULT(result_512_1, (&C[ldc*1])) + STORE16_COMPLETE_RESULT(result_512_2, (&C[ldc*2])) + STORE16_COMPLETE_RESULT(result_512_3, (&C[ldc*3])) + result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); + result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); + result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); + 
result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8);
+        STORE16_COMPLETE_RESULT(result_512_4, (&C[ldc*4]))
+        STORE16_COMPLETE_RESULT(result_512_5, (&C[ldc*5]))
+        STORE16_COMPLETE_RESULT(result_512_6, (&C[ldc*6]))
+        STORE16_COMPLETE_RESULT(result_512_7, (&C[ldc*7]))
+    }
+}
+
+// SBGEMM Kernel for 16<M<=32, N<8, K can be any number, but the processing will take 32 as a base
+#ifndef ONE_ALPHA      // ALPHA is not ONE
+void sbgemm_block_kernel_32xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
+#else                  // ALPHA is ONE
+void sbgemm_block_kernel_32xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
+#endif
[the body of this function, which accumulates the result_512[] array with _mm512_dpbf16_ps, was lost in extraction; the surviving store phase resumes below]
+    if (m != 32) {
+        unsigned short tail_mask_value = (((unsigned short)0xffff) >> (32-m));
+        __mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
+        for (int i = 0; i < n; i++) {
+            result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]);
+            result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]);
+            STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*i]))
+            STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*i+16]), tail_mask)
+        }
+    } else {
+        for (int i = 0; i < n; i++) {
+            result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]);
+            result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]);
+            STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*i]))
+            STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*i+16]))
+        }
+    }
+}
+
+// SBGEMM Kernel for M<=16, N<8, K can be any number, but the processing will take 32 as a base
+#ifndef ONE_ALPHA      // ALPHA is not ONE
+void sbgemm_block_kernel_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
+#else                  // ALPHA is ONE
+void sbgemm_block_kernel_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
+#endif
+{
+    int SHUFFLE_MAGIC_NO = 0x39;
+    BLASLONG tag_k_32x = k & (~31);
+    BLASLONG idxB_base = 0;
+    BLASLONG width = 32;
+
+#ifndef ONE_ALPHA
+    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+
+    __m512i arrayA_512;
+    __m512i arrayB_512[8];
+    __m512  result_512[8];
+
+    for (int i = 0; i < 8; i += 2) {
+        result_512[i]   = _mm512_setzero_ps();
+        result_512[i+1] = _mm512_setzero_ps();
+    }
+
+    for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) {
+        // Load B with unroll n
+        for (int i = 0; i < n; i ++) {
+            arrayB_512[i] = _mm512_loadu_si512(&B[idxB_base]);
+            idxB_base += 32;
+        }
+
+        if (idx_k == tag_k_32x) {width = k - tag_k_32x;}
+
+        for (BLASLONG idx = 0; idx < width;) {
+            // Each two rows are a group for 32-pair bf16 elements
+            // Load two rows into a 512 register
+            arrayA_512 = _mm512_loadu_si512(&A[idx<<4]);
+
+            for (int i = 0; i < n; i ++) {
+                result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i])));
+                arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO);
+            }
+
+            idx += 2;
+            // Every 4 loops we need to switch to next 128 bits of arrayB registers
+            if ((idx & (~7)) == idx) {
+                for (int i = 0; i < n; i++) {
+                    arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO);
+                }
+            }
+        }
+    }
+
+    if (m != 16) {
+        unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m));
+        __mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
+        for (int i = 0; i < n; i++) {
+            result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8);
+            STORE16_MASK_COMPLETE_RESULT(result_512[i], (&C[ldc*i]), tail_mask)
+        }
+    } else {
+        for (int i = 0; i < n; i++) {
+            result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8);
+            STORE16_COMPLETE_RESULT(result_512[i], (&C[ldc*i]))
+        }
+    }
+}
+#ifndef ONE_ALPHA      // ALPHA is not ONE
+void sbgemm_blocking_kernel_2_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B,
blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, 0), lda, block_A); + // TODO: MT + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, idx_m), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else if (m_step == 16) { + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_INCOPY_KERNEL_Kx16m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = 
n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + m_step = M - tag_m_Nx; + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, 0), lda, block_A); + // TODO: MT + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_internal_kernel_alpha(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) +#else // ALPHA is ONE +void sbgemm_internal_kernel_one(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) +#endif +{ + bfloat16 block_A[BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M]; + bfloat16 block_B[BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K]; + + // TODO: assume no trans for both A and B, to complement these scenarios later + if (Order == CblasColMajor) { + SBGEMM_BLOCKING_KERNEL_2(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else { + + } +} \ No newline at end of file From 6fe0f1fab9d6a7f46d71d37ebb210fbf56924fbc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 11 Jan 2021 19:05:29 +0100 Subject: [PATCH 048/681] Label get_cpu_ftr as volatile to keep gcc from rearranging the code --- driver/others/dynamic_arm64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 4f1b12f27..37c0694b6 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -68,7 +68,7 @@ extern void openblas_warning(int verbose, const char * msg); #endif #define get_cpu_ftr(id, var) ({ \ - __asm__("mrs %0, "#id : "=r" (var)); \ + __asm__ __volatile__("mrs %0, "#id : "=r" (var)); \ }) static char *corename[] = { From ed652d81365e14ac5db62f2abf9db0efa2ff193d Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Mon, 11 Jan 2021 21:13:53 -0500 Subject: [PATCH 049/681] Added definitions for GEMM_PREFERED_SIZE and SWITCH_RATIO to the POWER9 and POWER10 specific sections of param.h. --- param.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/param.h b/param.h index 6c5e0f107..6a790ab61 100644 --- a/param.h +++ b/param.h @@ -2399,6 +2399,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 16 @@ -2435,6 +2438,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 8 From 2d45a262d999f3ff2121b9fb3898c170a01c4cce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Jan 2021 16:32:29 +0100 Subject: [PATCH 050/681] Support compilation with nvfortran --- common_arm64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_arm64.h b/common_arm64.h index 9cdded305..2270ffba7 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define INLINE inline -#ifdef F_INTERFACE_FLANG +#if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI) #define RETURN_BY_STACK #else #define RETURN_BY_COMPLEX From bff2b7c94d7a1cfa687da0693289c78e44eecc8e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Jan 2021 16:34:18 +0100 Subject: [PATCH 051/681] Support compilation with NVIDIA HPC compilers (which do not take gcc-style arch options) --- Makefile.arm64 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index 62a877fff..c3fe583e4 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -1,4 +1,4 @@ - +ifneq ($(C_COMPILER), PGI) ifeq ($(CORE), ARMV8) CCOMMON_OPT += -march=armv8-a FCOMMON_OPT += -march=armv8-a @@ -77,4 +77,4 @@ CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 endif endif - +endif From 43aac5bacc7f8f55fa981f990715f914ef739254 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Jan 2021 16:36:12 +0100 Subject: [PATCH 052/681] Support NVIDIA HPC compiler --- kernel/arm/zdot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index 9249b54f8..79baa61b1 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; -#if !defined(__PPC__) && !defined(__SunOS) +#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; #else @@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } -#if !defined(__PPC__) && !defined(__SunOS) +#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) CREAL(result) = dot[0]; CIMAG(result) = dot[1]; #else From c2a8ebfe695fda904ce2ae2153680d0c3810f2ce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Jan 2021 16:38:51 +0100 Subject: [PATCH 053/681] Add workaround for NVIDIA HPC mishandling of the asm DOT kernels --- kernel/arm64/KERNEL.ARMV8 | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 603e47d87..c8a53c86b 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -97,9 +97,18 @@ CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S From 0f27a036071501664d8c4ee491e02345d9bde115 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Jan 2021 16:39:35 +0100 Subject: [PATCH 054/681] Add workaround for NVIDIA HPC mishandling of the asm DOT kernels --- 
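
(Note after the patch separator, not part of the original commit: patches 052-057 all route
the complex DOT kernels to the generic C implementation when the compiler is PGI/NVIDIA HPC.
A minimal sketch of the portability issue the __PGI guards in kernel/arm/zdot.c work around --
composing a C99 complex return value. The function name below is hypothetical; the two
branches mirror the guarded code paths, on the assumption that PGI rejects the GNU
__real__/__imag__ lvalue extension that the CREAL/CIMAG macros expand to there:

    #include <complex.h>

    static float _Complex compose_result(float re, float im) {
    #if !defined(__PGI)
        float _Complex result;
        __real__ result = re;    /* GNU C extension: set the parts in place */
        __imag__ result = im;
        return result;
    #else
        return re + im * _Complex_I;  /* portable arithmetic composition */
    #endif
    }
)
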
kernel/arm64/KERNEL.CORTEXA57 | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index dcf2383a9..0be334893 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -70,10 +70,19 @@ DCOPYKERNEL = copy.S CCOPYKERNEL = copy.S ZCOPYKERNEL = copy.S +ifneq ($(C_COMPILER), PGI) SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S SNRM2KERNEL = nrm2.S From 49959d4f1cf79e2945cf40e3da5964ee2df13710 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Jan 2021 16:47:15 +0100 Subject: [PATCH 055/681] Add workaround for NVIDIA HPC --- kernel/arm64/KERNEL.CORTEXA53 | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index e23133e52..db322dd0d 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -96,11 +96,20 @@ DNRM2KERNEL = nrm2.S CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S -DDOTKERNEL = dot.S -SDOTKERNEL = ../generic/dot.c -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -DSDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S SGEMM_BETA = sgemm_beta.S From 2efa3b70dcd90fb15be39f121b91105218b718c1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Jan 2021 16:49:39 +0100 Subject: [PATCH 056/681] Add workaround for NVIDIA HPC --- kernel/arm64/KERNEL.THUNDERX | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/arm64/KERNEL.THUNDERX b/kernel/arm64/KERNEL.THUNDERX index cb02c7bc5..669f62698 100644 --- a/kernel/arm64/KERNEL.THUNDERX +++ b/kernel/arm64/KERNEL.THUNDERX @@ -47,8 +47,13 @@ ZCOPYKERNEL = copy.S SDOTKERNEL = dot_thunderx.c DDOTKERNEL = ddot_thunderx.c +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S SNRM2KERNEL = nrm2.S From b716c0ef010af184fec8d5d33aa9c5cc2fc767b7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Jan 2021 16:51:35 +0100 Subject: [PATCH 057/681] Add workaround for NVIDIA HPC --- kernel/arm64/KERNEL.TSV110 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/arm64/KERNEL.TSV110 b/kernel/arm64/KERNEL.TSV110 index 1ce7bb7c0..54d016e17 100644 --- a/kernel/arm64/KERNEL.TSV110 +++ b/kernel/arm64/KERNEL.TSV110 @@ -72,8 +72,13 @@ ZCOPYKERNEL = copy.S SDOTKERNEL = dot.S DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S SNRM2KERNEL = nrm2.S From 9ccb12b03179b13eedc97eb75ca3dfc7ea406a70 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Jan 2021 23:20:07 +0100 Subject: [PATCH 058/681] Add prototypes for cblas_csrot and cblas_zdrot --- cblas.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cblas.h b/cblas.h index da00d46d6..8aafdb186 100644 --- a/cblas.h +++ b/cblas.h @@ -125,6 +125,8 @@ void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float 
*Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); +void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); +void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); void cblas_srotg(float *a, float *b, float *c, float *s); void cblas_drotg(double *a, double *b, double *c, double *s); From ac3e2a3fdd2f2e430ff7b6a58aeb8252afc935de Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Jan 2021 23:22:00 +0100 Subject: [PATCH 059/681] Add CBLAS interfaces for csrot and zdrot --- interface/Makefile | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 597956fdb..1a440c9c3 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -316,7 +316,7 @@ CCBLAS1OBJS = \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ cblas_caxpby.$(SUFFIX) \ - cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) + cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) CCBLAS2OBJS = \ cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ @@ -346,7 +346,7 @@ CZBLAS1OBJS = \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ cblas_zaxpby.$(SUFFIX) \ - cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) + cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) CZBLAS2OBJS = \ @@ -1664,6 +1664,12 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) +cblas_csrot.$(SUFFIX) cblas_csrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + ifeq ($(BUILD_BFLOAT16),1) cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) From 930aff2c2e58f6ffbd0b8a09e1e7029d562749dd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 13 Jan 2021 00:27:42 +0100 Subject: [PATCH 060/681] Build CBLAS interfaces for CROTG and ZROTG as well --- Makefile | 2613 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 2274 insertions(+), 339 deletions(-) diff --git a/Makefile b/Makefile index de0735c4a..fab403c82 100644 --- a/Makefile +++ b/Makefile @@ -1,402 +1,2337 @@ -TOPDIR = . -include ./Makefile.system +TOPDIR = .. 
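+# Interface-level build: TOPDIR points one directory up, and Makefile.system
+# supplies the toolchain and target configuration used by the rules below.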
+include $(TOPDIR)/Makefile.system -BLASDIRS = interface driver/level2 driver/level3 driver/others +SUPPORT_GEMM3M = 0 -ifneq ($(DYNAMIC_ARCH), 1) -BLASDIRS += kernel +ifeq ($(ARCH), x86) +SUPPORT_GEMM3M = 1 endif -ifdef SANITY_CHECK -BLASDIRS += reference +ifeq ($(ARCH), x86_64) +SUPPORT_GEMM3M = 1 endif -SUBDIRS = $(BLASDIRS) -ifneq ($(NO_LAPACK), 1) -SUBDIRS += lapack +ifeq ($(ARCH), ia64) +SUPPORT_GEMM3M = 1 endif -RELA = -ifeq ($(BUILD_RELAPACK), 1) -RELA = re_lapack +ifeq ($(ARCH), MIPS) +SUPPORT_GEMM3M = 1 endif -ifeq ($(NO_FORTRAN), 1) -define NOFORTRAN -1 -endef -define NO_LAPACK -1 -endef -export NOFORTRAN -export NO_LAPACK -endif +ifneq ($(NO_FBLAS), 1) -LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) +SBLAS1OBJS = \ + saxpy.$(SUFFIX) sswap.$(SUFFIX) \ + scopy.$(SUFFIX) sscal.$(SUFFIX) \ + sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \ + sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \ + smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \ + smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \ + srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \ + saxpby.$(SUFFIX) -SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test +SBLAS2OBJS = \ + sgemv.$(SUFFIX) sger.$(SUFFIX) \ + strsv.$(SUFFIX) strmv.$(SUFFIX) ssymv.$(SUFFIX) \ + ssyr.$(SUFFIX) ssyr2.$(SUFFIX) sgbmv.$(SUFFIX) \ + ssbmv.$(SUFFIX) sspmv.$(SUFFIX) \ + sspr.$(SUFFIX) sspr2.$(SUFFIX) \ + stbsv.$(SUFFIX) stbmv.$(SUFFIX) \ + stpsv.$(SUFFIX) stpmv.$(SUFFIX) -.PHONY : all libs netlib $(RELA) test ctest shared install -.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test +SBLAS3OBJS = \ + sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \ + strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \ + somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ + sgeadd.$(SUFFIX) -all :: libs netlib $(RELA) tests shared - @echo - @echo " OpenBLAS build complete. ($(LIB_COMPONENTS))" - @echo - @echo " OS ... $(OSNAME) " - @echo " Architecture ... $(ARCH) " -ifndef BINARY64 - @echo " BINARY ... 32bit " -else - @echo " BINARY ... 64bit " -endif - -ifdef INTERFACE64 -ifneq ($(INTERFACE64), 0) - @echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) " -endif -endif - @$(CC) --version > /dev/null 2>&1;\ - if [ $$? -eq 0 ]; then \ - cverinfo=`$(CC) --version | sed -n '1p'`; \ - if [ -z "$${cverinfo}" ]; then \ - cverinfo=`$(CC) --version | sed -n '2p'`; \ - fi; \ - echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\ - else \ - echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\ - fi -ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) - @$(FC) --version > /dev/null 2>&1;\ - if [ $$? -eq 0 ]; then \ - fverinfo=`$(FC) --version | sed -n '1p'`; \ - if [ -z "$${fverinfo}" ]; then \ - fverinfo=`$(FC) --version | sed -n '2p'`; \ - fi; \ - echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\ - else \ - echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ - fi -endif -ifneq ($(OSNAME), AIX) - @echo -n " Library Name ... $(LIBNAME)" -else - @echo " Library Name ... 
$(LIBNAME)" +ifeq ($(BUILD_BFLOAT16),1) +SBBLAS1OBJS = sbdot.$(SUFFIX) +SBBLAS2OBJS = sbgemv.$(SUFFIX) +SBBLAS3OBJS = sbgemm.$(SUFFIX) +SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif -ifndef SMP - @echo " (Single-threading) " -else - @echo " (Multi-threading; Max num-threads is $(NUM_THREADS))" -endif +DBLAS1OBJS = \ + daxpy.$(SUFFIX) dswap.$(SUFFIX) \ + dcopy.$(SUFFIX) dscal.$(SUFFIX) \ + ddot.$(SUFFIX) \ + dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \ + dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \ + dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \ + drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \ + daxpby.$(SUFFIX) + +DBLAS2OBJS = \ + dgemv.$(SUFFIX) dger.$(SUFFIX) \ + dtrsv.$(SUFFIX) dtrmv.$(SUFFIX) dsymv.$(SUFFIX) \ + dsyr.$(SUFFIX) dsyr2.$(SUFFIX) dgbmv.$(SUFFIX) \ + dsbmv.$(SUFFIX) dspmv.$(SUFFIX) \ + dspr.$(SUFFIX) dspr2.$(SUFFIX) \ + dtbsv.$(SUFFIX) dtbmv.$(SUFFIX) \ + dtpsv.$(SUFFIX) dtpmv.$(SUFFIX) + +DBLAS3OBJS = \ + dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \ + dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \ + domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\ + dgeadd.$(SUFFIX) + +CBLAS1OBJS = \ + caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ + ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \ + cdotc.$(SUFFIX) cdotu.$(SUFFIX) \ + scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \ + scamax.$(SUFFIX) icamax.$(SUFFIX) \ + scamin.$(SUFFIX) icamin.$(SUFFIX) \ + csrot.$(SUFFIX) crotg.$(SUFFIX) \ + caxpby.$(SUFFIX) + +CBLAS2OBJS = \ + cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ + ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \ + csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ + csbmv.$(SUFFIX) \ + cspr2.$(SUFFIX) \ + ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ + ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ + chemv.$(SUFFIX) chbmv.$(SUFFIX) \ + cher.$(SUFFIX) cher2.$(SUFFIX) \ + chpmv.$(SUFFIX) chpr.$(SUFFIX) chpr2.$(SUFFIX) + +CBLAS3OBJS = \ + cgemm.$(SUFFIX) csymm.$(SUFFIX) ctrmm.$(SUFFIX) \ + ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \ + chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \ + comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\ + cgeadd.$(SUFFIX) + +ZBLAS1OBJS = \ + zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ + zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \ + zdotc.$(SUFFIX) zdotu.$(SUFFIX) \ + dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \ + dzamax.$(SUFFIX) izamax.$(SUFFIX) \ + dzamin.$(SUFFIX) izamin.$(SUFFIX) \ + zdrot.$(SUFFIX) zrotg.$(SUFFIX) \ + zaxpby.$(SUFFIX) + +ZBLAS2OBJS = \ + zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ + ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \ + zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ + zsbmv.$(SUFFIX) \ + zspr2.$(SUFFIX) \ + ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ + ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ + zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \ + zher.$(SUFFIX) zher2.$(SUFFIX) \ + zhpmv.$(SUFFIX) zhpr.$(SUFFIX) zhpr2.$(SUFFIX) + +ZBLAS3OBJS = \ + zgemm.$(SUFFIX) zsymm.$(SUFFIX) ztrmm.$(SUFFIX) \ + ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \ + zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \ + zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\ + zgeadd.$(SUFFIX) + +ifeq ($(SUPPORT_GEMM3M), 1) + +# CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX) +CBLAS3OBJS += cgemm3m.$(SUFFIX) + +# ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX) +ZBLAS3OBJS += zgemm3m.$(SUFFIX) -ifeq ($(DYNAMIC_ARCH), 1) - @echo " Supporting multiple $(ARCH) cpu models with minimum requirement for the common code being $(CORE)" endif -ifeq ($(USE_OPENMP), 1) - @echo - @echo 
" Use OpenMP in the multithreading. Because of ignoring OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS flags, " - @echo " you should use OMP_NUM_THREADS environment variable to control the number of threads." - @echo +ifeq ($(EXPRECISION), 1) + +QBLAS1OBJS = \ + qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ + qcopy.$(SUFFIX) qscal.$(SUFFIX) \ + qdot.$(SUFFIX) \ + qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ + qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ + qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ + qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ + +QBLAS2OBJS = \ + qgemv.$(SUFFIX) qger.$(SUFFIX) \ + qtrsv.$(SUFFIX) qtrmv.$(SUFFIX) qsymv.$(SUFFIX) \ + qsyr.$(SUFFIX) qsyr2.$(SUFFIX) qgbmv.$(SUFFIX) \ + qsbmv.$(SUFFIX) qspmv.$(SUFFIX) \ + qspr.$(SUFFIX) qspr2.$(SUFFIX) \ + qtbsv.$(SUFFIX) qtbmv.$(SUFFIX) \ + qtpsv.$(SUFFIX) qtpmv.$(SUFFIX) + +QBLAS3OBJS = \ + qgemm.$(SUFFIX) qsymm.$(SUFFIX) qtrmm.$(SUFFIX) \ + qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX) + +XBLAS1OBJS = \ + xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ + xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ + xdotc.$(SUFFIX) xdotu.$(SUFFIX) \ + qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ + qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ + qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ + xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ + +XBLAS2OBJS = \ + xgemv.$(SUFFIX) xgeru.$(SUFFIX) xgerc.$(SUFFIX) \ + xtrsv.$(SUFFIX) xtrmv.$(SUFFIX) xsymv.$(SUFFIX) \ + xsyr.$(SUFFIX) xsyr2.$(SUFFIX) xgbmv.$(SUFFIX) \ + xsbmv.$(SUFFIX) xspmv.$(SUFFIX) \ + xspr.$(SUFFIX) xspr2.$(SUFFIX) \ + xtbsv.$(SUFFIX) xtbmv.$(SUFFIX) \ + xtpsv.$(SUFFIX) xtpmv.$(SUFFIX) \ + xhemv.$(SUFFIX) xhbmv.$(SUFFIX) \ + xher.$(SUFFIX) xher2.$(SUFFIX) \ + xhpmv.$(SUFFIX) xhpr.$(SUFFIX) xhpr2.$(SUFFIX) + +XBLAS3OBJS = \ + xgemm.$(SUFFIX) xsymm.$(SUFFIX) xtrmm.$(SUFFIX) \ + xtrsm.$(SUFFIX) xsyrk.$(SUFFIX) xsyr2k.$(SUFFIX) \ + xhemm.$(SUFFIX) xherk.$(SUFFIX) xher2k.$(SUFFIX) + +ifeq ($(SUPPORT_GEMM3M), 1) + +XBLAS3OBJS += xgemm3m.$(SUFFIX) xsymm3m.$(SUFFIX) xhemm3m.$(SUFFIX) + endif -ifeq ($(OSNAME), Darwin) - @echo "WARNING: If you plan to use the dynamic library $(LIBDYNNAME), you must run:" - @echo - @echo "\"make PREFIX=/your_installation_path/ install\"." - @echo - @echo "(or set PREFIX in Makefile.rule and run make install." - @echo "If you want to move the .dylib to a new location later, make sure you change" - @echo "the internal name of the dylib with:" - @echo - @echo "install_name_tool -id /new/absolute/path/to/$(LIBDYNNAME) $(LIBDYNNAME)" endif - @echo - @echo "To install the library, you can run \"make PREFIX=/path/to/your/installation install\"." 
- @echo -shared : -ifneq ($(NO_SHARED), 1) -ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) - @$(MAKE) -C exports so - @ln -fs $(LIBSONAME) $(LIBPREFIX).so - @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) +ifdef QUAD_PRECISION + +QBLAS1OBJS = \ + qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ + qcopy.$(SUFFIX) qscal.$(SUFFIX) \ + qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ + qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ + qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ + qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ + +QBLAS2OBJS = \ + qgemv.$(SUFFIX) qger.$(SUFFIX) \ + qtrsv.$(SUFFIX) qtrmv.$(SUFFIX) qsymv.$(SUFFIX) \ + qsyr.$(SUFFIX) qsyr2.$(SUFFIX) qgbmv.$(SUFFIX) \ + qsbmv.$(SUFFIX) qspmv.$(SUFFIX) \ + qspr.$(SUFFIX) qspr2.$(SUFFIX) \ + qtbsv.$(SUFFIX) qtbmv.$(SUFFIX) \ + qtpsv.$(SUFFIX) qtpmv.$(SUFFIX) + +QBLAS3OBJS = \ + qgemm.$(SUFFIX) qsymm.$(SUFFIX) qtrmm.$(SUFFIX) \ + qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX) + +XBLAS1OBJS = \ + xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ + xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ + qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ + qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ + qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ + xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ + +XBLAS2OBJS = \ + xgemv.$(SUFFIX) xgeru.$(SUFFIX) xgerc.$(SUFFIX) \ + xtrsv.$(SUFFIX) xtrmv.$(SUFFIX) xsymv.$(SUFFIX) \ + xsyr.$(SUFFIX) xsyr2.$(SUFFIX) xgbmv.$(SUFFIX) \ + xsbmv.$(SUFFIX) xspmv.$(SUFFIX) \ + xspr.$(SUFFIX) xspr2.$(SUFFIX) \ + xtbsv.$(SUFFIX) xtbmv.$(SUFFIX) \ + xtpsv.$(SUFFIX) xtpmv.$(SUFFIX) \ + xhemv.$(SUFFIX) xhbmv.$(SUFFIX) \ + xher.$(SUFFIX) xher2.$(SUFFIX) \ + xhpmv.$(SUFFIX) xhpr.$(SUFFIX) xhpr2.$(SUFFIX) + +XBLAS3OBJS = \ + xgemm.$(SUFFIX) xsymm.$(SUFFIX) xtrmm.$(SUFFIX) \ + xtrsm.$(SUFFIX) xsyrk.$(SUFFIX) xsyr2k.$(SUFFIX) \ + xhemm.$(SUFFIX) xherk.$(SUFFIX) xher2k.$(SUFFIX) + +ifeq ($(SUPPORT_GEMM3M), 1) + +XBLAS3OBJS += xgemm3m.$(SUFFIX) xsymm3m.$(SUFFIX) xhemm3m.$(SUFFIX) + endif -ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) - @$(MAKE) -C exports so - @ln -fs $(LIBSONAME) $(LIBPREFIX).so endif -ifeq ($(OSNAME), Darwin) - @$(MAKE) -C exports dyn - @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib - @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib + endif -ifeq ($(OSNAME), WINNT) - @$(MAKE) -C exports dll + +HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \ + dgemv.$(SUFFIX) dtrsv.$(SUFFIX) dger.$(SUFFIX) \ + idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) + +CSBLAS1OBJS = \ + cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ + cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ + cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ + cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ + cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) + +CSBLAS2OBJS = \ + cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ + cblas_strsv.$(SUFFIX) cblas_ssyr.$(SUFFIX) cblas_ssyr2.$(SUFFIX) cblas_sgbmv.$(SUFFIX) \ + cblas_ssbmv.$(SUFFIX) cblas_sspmv.$(SUFFIX) cblas_sspr.$(SUFFIX) cblas_sspr2.$(SUFFIX) \ + cblas_stbmv.$(SUFFIX) cblas_stbsv.$(SUFFIX) cblas_stpmv.$(SUFFIX) cblas_stpsv.$(SUFFIX) + +CSBLAS3OBJS = \ + cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ + cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) 
cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ + cblas_sgeadd.$(SUFFIX) + +ifeq ($(BUILD_BFLOAT16),1) +CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) +CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX) +CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) +CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif -ifeq ($(OSNAME), CYGWIN_NT) - @$(MAKE) -C exports dll + +CDBLAS1OBJS = \ + cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ + cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ + cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ + cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ + cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) + +CDBLAS2OBJS = \ + cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ + cblas_dtrsv.$(SUFFIX) cblas_dsyr.$(SUFFIX) cblas_dsyr2.$(SUFFIX) cblas_dgbmv.$(SUFFIX) \ + cblas_dsbmv.$(SUFFIX) cblas_dspmv.$(SUFFIX) cblas_dspr.$(SUFFIX) cblas_dspr2.$(SUFFIX) \ + cblas_dtbmv.$(SUFFIX) cblas_dtbsv.$(SUFFIX) cblas_dtpmv.$(SUFFIX) cblas_dtpsv.$(SUFFIX) + +CDBLAS3OBJS += \ + cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ + cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \ + cblas_dgeadd.$(SUFFIX) + +CCBLAS1OBJS = \ + cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ + cblas_ccopy.$(SUFFIX) \ + cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ + cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ + cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ + cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ + cblas_caxpby.$(SUFFIX) \ + cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) + +CCBLAS2OBJS = \ + cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ + cblas_cgbmv.$(SUFFIX) cblas_chbmv.$(SUFFIX) cblas_chemv.$(SUFFIX) \ + cblas_cher.$(SUFFIX) cblas_cher2.$(SUFFIX) cblas_chpmv.$(SUFFIX) \ + cblas_chpr.$(SUFFIX) cblas_chpr2.$(SUFFIX) cblas_ctbmv.$(SUFFIX) \ + cblas_ctbsv.$(SUFFIX) cblas_ctpmv.$(SUFFIX) cblas_ctpsv.$(SUFFIX) \ + cblas_ctrmv.$(SUFFIX) cblas_ctrsv.$(SUFFIX) + +CCBLAS3OBJS = \ + cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \ + cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ + cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ + cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ + cblas_cgeadd.$(SUFFIX) + +CXERBLAOBJ = \ + cblas_xerbla.$(SUFFIX) + + + +CZBLAS1OBJS = \ + cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ + cblas_zcopy.$(SUFFIX) \ + cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ + cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ + cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ + cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ + cblas_zaxpby.$(SUFFIX) \ + cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) + + +CZBLAS2OBJS = \ + cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \ + cblas_zgbmv.$(SUFFIX) cblas_zhbmv.$(SUFFIX) cblas_zhemv.$(SUFFIX) \ + cblas_zher.$(SUFFIX) cblas_zher2.$(SUFFIX) cblas_zhpmv.$(SUFFIX) \ + cblas_zhpr.$(SUFFIX) cblas_zhpr2.$(SUFFIX) cblas_ztbmv.$(SUFFIX) \ + cblas_ztbsv.$(SUFFIX) cblas_ztpmv.$(SUFFIX) cblas_ztpsv.$(SUFFIX) \ + 
cblas_ztrmv.$(SUFFIX) cblas_ztrsv.$(SUFFIX) + +CZBLAS3OBJS = \ + cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \ + cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ + cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ + cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \ + cblas_zgeadd.$(SUFFIX) + + +ifeq ($(SUPPORT_GEMM3M), 1) + +# CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX) +CCBLAS3OBJS += cblas_cgemm3m.$(SUFFIX) + +# ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX) +CZBLAS3OBJS += cblas_zgemm3m.$(SUFFIX) + endif + + +ifneq ($(NO_CBLAS), 1) + +override CFLAGS += -I. + +SBLAS1OBJS += $(CSBLAS1OBJS) +SBLAS2OBJS += $(CSBLAS2OBJS) +SBLAS3OBJS += $(CSBLAS3OBJS) +SBBLAS1OBJS += $(CSBBLAS1OBJS) +SBBLAS2OBJS += $(CSBBLAS2OBJS) +SBBLAS3OBJS += $(CSBBLAS3OBJS) +DBLAS1OBJS += $(CDBLAS1OBJS) +DBLAS2OBJS += $(CDBLAS2OBJS) +DBLAS3OBJS += $(CDBLAS3OBJS) +CBLAS1OBJS += $(CCBLAS1OBJS) +CBLAS2OBJS += $(CCBLAS2OBJS) +CBLAS3OBJS += $(CCBLAS3OBJS) +ZBLAS1OBJS += $(CZBLAS1OBJS) +ZBLAS2OBJS += $(CZBLAS2OBJS) +ZBLAS3OBJS += $(CZBLAS3OBJS) + +SBEXTOBJS += $(CSBEXTOBJS) + +CBAUXOBJS += $(CXERBLAOBJ) endif -tests : -ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) - touch $(LIBNAME) -ifndef NO_FBLAS - $(MAKE) -C test all +SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) +SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS2OBJS) $(SBBLAS3OBJS) +DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) +QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) +CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) +ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS) +XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) + +#SLAPACKOBJS = \ +# sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ +# spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ +# slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) spotri.$(SUFFIX) + +SLAPACKOBJS = \ + sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ + spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ + slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX) + + +#DLAPACKOBJS = \ +# dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ +# dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ +# dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dpotri.$(SUFFIX) + +DLAPACKOBJS = \ + dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ + dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ + dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX) + + +QLAPACKOBJS = \ + qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \ + qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \ + qlaswp.$(SUFFIX) qtrtrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \ + qtrtrs.$(SUFFIX) + +#CLAPACKOBJS = \ +# cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ +# cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ +# clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) cpotri.$(SUFFIX) + +CLAPACKOBJS = \ + cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ + cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ + clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) + +#ZLAPACKOBJS = \ +# zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ +# zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ +# zlauum.$(SUFFIX) 
ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) zpotri.$(SUFFIX) + + +ZLAPACKOBJS = \ + zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ + zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ + zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) + + +XLAPACKOBJS = \ + xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \ + xpotf2.$(SUFFIX) xpotrf.$(SUFFIX) xtrti2.$(SUFFIX) xtrtri.$(SUFFIX) \ + xlaswp.$(SUFFIX) xtrtrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \ + xtrtrs.$(SUFFIX) + +ifneq ($(NO_LAPACK), 1) +SBLASOBJS += $(SLAPACKOBJS) +DBLASOBJS += $(DLAPACKOBJS) +#QBLASOBJS += $(QLAPACKOBJS) +CBLASOBJS += $(CLAPACKOBJS) +ZBLASOBJS += $(ZLAPACKOBJS) +#XBLASOBJS += $(XLAPACKOBJS) + endif - $(MAKE) -C utest all -ifneq ($(NO_CBLAS), 1) - $(MAKE) -C ctest all -ifeq ($(CPP_THREAD_SAFETY_TEST), 1) - $(MAKE) -C cpp_thread_test all + +ifneq ($(BUILD_SINGLE),1) + SBLASOBJS= +ifeq ($(BUILD_DOUBLE),1) + SBLASOBJS = dsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) strsm.$(SUFFIX) \ + sgetrs.$(SUFFIX) sgetrf.$(SUFFIX) spotf2.$(SUFFIX) spotrf.$(SUFFIX) \ + ssyrk.$(SUFFIX) sgemv.$(SUFFIX) endif +ifeq ($(BUILD_COMPLEX),1) + SBLASOBJS = \ + sdot.$(SUFFIX) srot.$(SUFFIX) snrm2.$(SUFFIX) sswap.$(SUFFIX) \ + isamax.$(SUFFIX) saxpy.$(SUFFIX) sscal.$(SUFFIX) scopy.$(SUFFIX) \ + sgemv.$(SUFFIX) sgemm.$(SUFFIX) endif endif - -libs : -ifeq ($(CORE), UNKNOWN) - $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) +ifneq ($(BUILD_DOUBLE),1) + DBLASOBJS= +ifeq ($(BUILD_COMPLEX16),1) + DBLASOBJS = \ + ddot.$(SUFFIX) drot.$(SUFFIX) dnrm2.$(SUFFIX) dswap.$(SUFFIX) \ + idamax.$(SUFFIX) daxpy.$(SUFFIX) dscal.$(SUFFIX) dcopy.$(SUFFIX) \ + dgemv.$(SUFFIX) dgemm.$(SUFFIX) endif -ifeq ($(NOFORTRAN), 1) - $(info OpenBLAS: Detecting fortran compiler failed. Cannot compile LAPACK. Only compile BLAS.) endif -ifeq ($(NO_STATIC), 1) -ifeq ($(NO_SHARED), 1) - $(error OpenBLAS: neither static nor shared are enabled.) 
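+# When a precision is switched off, keep only the handful of its objects that the
+# remaining precisions still link against (e.g. a BUILD_COMPLEX16-only build still
+# needs a few single-complex LAPACK/CBLAS objects).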
+ifneq ($(BUILD_COMPLEX),1) + CBLASOBJS= +ifeq ($(BUILD_COMPLEX16),1) + CBLASOBJS = cgetrs.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) cgetrf.$(SUFFIX) \ + cpotrf.$(SUFFIX) ctrsm.$(SUFFIX) cblas_cdotc_sub.$(SUFFIX) endif endif - @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) - @for d in $(SUBDIRS) ; \ - do if test -d $$d; then \ - $(MAKE) -C $$d $(@F) || exit 1 ; \ - fi; \ - done -#Save the config files for installation - @cp Makefile.conf Makefile.conf_last - @cp config.h config_last.h -ifdef QUAD_PRECISION - @echo "#define QUAD_PRECISION">> config_last.h +ifneq ($(BUILD_COMPLEX16),1) + ZBLASOBJS= endif + +FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) + ifeq ($(EXPRECISION), 1) - @echo "#define EXPRECISION">> config_last.h -endif -## -ifeq ($(DYNAMIC_ARCH), 1) - @$(MAKE) -C kernel commonlibs || exit 1 - @for d in $(DYNAMIC_CORE) ; \ - do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ - done - @echo DYNAMIC_ARCH=1 >> Makefile.conf_last -ifeq ($(DYNAMIC_OLDER), 1) - @echo DYNAMIC_OLDER=1 >> Makefile.conf_last -endif -endif -ifdef USE_THREAD - @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last -endif - @touch lib.grd - -prof : prof_blas prof_lapack - -prof_blas : - ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) - for d in $(SUBDIRS) ; \ - do if test -d $$d; then \ - $(MAKE) -C $$d prof || exit 1 ; \ - fi; \ - done -ifeq ($(DYNAMIC_ARCH), 1) - $(MAKE) -C kernel commonprof || exit 1 -endif - -blas : - ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) - for d in $(BLASDIRS) ; \ - do if test -d $$d; then \ - $(MAKE) -C $$d libs || exit 1 ; \ - fi; \ - done - -hpl : - ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) - for d in $(BLASDIRS) ../laswp exports ; \ - do if test -d $$d; then \ - $(MAKE) -C $$d $(@F) || exit 1 ; \ - fi; \ - done -ifeq ($(DYNAMIC_ARCH), 1) - $(MAKE) -C kernel commonlibs || exit 1 - for d in $(DYNAMIC_CORE) ; \ - do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ - done -endif - -hpl_p : - ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) - for d in $(SUBDIRS) ../laswp exports ; \ - do if test -d $$d; then \ - $(MAKE) -C $$d $(@F) || exit 1 ; \ - fi; \ - done - -ifeq ($(NO_LAPACK), 1) -netlib : +FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) +endif -else -netlib : lapack_prebuild -ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) - @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib - @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib +ifeq ($(QUAD_PRECISION), 1) +FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) endif -ifneq ($(NO_LAPACKE), 1) - @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib + +FUNCALLFILES = $(FUNCOBJS:.$(SUFFIX)=) + + +include $(TOPDIR)/Makefile.tail + +all :: libs + +ifdef FUNCTION_PROFILE +$(BLASOBJS) $(BLASOBJS_P) : functable.h +$(BLASOBJS) $(BLASOBJS_P) : override CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F) + +functable.h : Makefile + ./create $(FUNCALLFILES) > functable.h + endif + +clean :: + @rm -f functable.h + +level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +level2 : $(SBBLAS2OBJS) $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +aux : $(CBAUXOBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +$(CSBBLASOBJS) $(CSBBLASOBJS_P) 
$(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ +$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) $(CBAUXOBJS_P) : override CFLAGS += -DCBLAS + +srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +drot.$(SUFFIX) drot.$(PSUFFIX) : rot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qrot.$(SUFFIX) qrot.$(PSUFFIX) : rot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +csrot.$(SUFFIX) csrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +zdrot.$(SUFFIX) zdrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +xqrot.$(SUFFIX) xqrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +srotm.$(SUFFIX) srotm.$(PSUFFIX): rotm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +drotm.$(SUFFIX) drotm.$(PSUFFIX): rotm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qrotm.$(SUFFIX) qrotm.$(PSUFFIX): rotm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +srotmg.$(SUFFIX) srotmg.$(PSUFFIX): rotmg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +drotmg.$(SUFFIX) drotmg.$(PSUFFIX): rotmg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qrotmg.$(SUFFIX) qrotmg.$(PSUFFIX): rotmg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +srotg.$(SUFFIX) srotg.$(PSUFFIX): rotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +drotg.$(SUFFIX) drotg.$(PSUFFIX): rotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qrotg.$(SUFFIX) qrotg.$(PSUFFIX): rotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +crotg.$(SUFFIX) crotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zrotg.$(SUFFIX) zrotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xrotg.$(SUFFIX) xrotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sasum.$(SUFFIX) sasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dasum.$(SUFFIX) dasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qasum.$(SUFFIX) qasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +scasum.$(SUFFIX) scasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dnrm2.$(SUFFIX) dnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qnrm2.$(SUFFIX) qnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +scnrm2.$(SUFFIX) scnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dznrm2.$(SUFFIX) dznrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qxnrm2.$(SUFFIX) qxnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +samax.$(SUFFIX) samax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +damax.$(SUFFIX) damax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +qamax.$(SUFFIX) qamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +scamax.$(SUFFIX) scamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +dzamax.$(SUFFIX) dzamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +qxamax.$(SUFFIX) qxamax.$(PSUFFIX) : 
max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +samin.$(SUFFIX) samin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +damin.$(SUFFIX) damin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +qamin.$(SUFFIX) qamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +scamin.$(SUFFIX) scamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +dzamin.$(SUFFIX) dzamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +qxamin.$(SUFFIX) qxamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +smax.$(SUFFIX) smax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +dmax.$(SUFFIX) dmax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +qmax.$(SUFFIX) qmax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +smin.$(SUFFIX) smin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +dmin.$(SUFFIX) dmin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +qmin.$(SUFFIX) qmin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +isamax.$(SUFFIX) isamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +idamax.$(SUFFIX) idamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +iqamax.$(SUFFIX) iqamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +icamax.$(SUFFIX) icamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +izamax.$(SUFFIX) izamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +ixamax.$(SUFFIX) ixamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +isamin.$(SUFFIX) isamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +idamin.$(SUFFIX) idamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +iqamin.$(SUFFIX) iqamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +icamin.$(SUFFIX) icamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +izamin.$(SUFFIX) izamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +ixamin.$(SUFFIX) ixamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +ismax.$(SUFFIX) ismax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +idmax.$(SUFFIX) idmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +iqmax.$(SUFFIX) iqmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +ismin.$(SUFFIX) ismin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +idmin.$(SUFFIX) idmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +iqmin.$(SUFFIX) iqmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +sdsdot.$(SUFFIX) sdsdot.$(PSUFFIX) : sdsdot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +ifeq ($(BUILD_BFLOAT16),1) +sbdot.$(SUFFIX) sbdot.$(PSUFFIX) : bf16dot.c + $(CC) $(CFLAGS) -c $< -o $(@F) +sbstobf16.$(SUFFIX) sbstobf16.$(PSUFFIX) : tobf16.c + $(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) +sbdtobf16.$(SUFFIX) sbdtobf16.$(PSUFFIX) : tobf16.c + $(CC) $(CFLAGS) -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) +sbf16tos.$(SUFFIX) sbf16tos.$(PSUFFIX) : bf16to.c 
+ $(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) +dbf16tod.$(SUFFIX) dbf16tod.$(PSUFFIX) : bf16to.c + $(CC) $(CFLAGS) -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) endif -ifeq ($(NO_LAPACK), 1) -re_lapack : +sdot.$(SUFFIX) sdot.$(PSUFFIX) : dot.c + $(CC) $(CFLAGS) -c $< -o $(@F) -else -re_lapack : - @$(MAKE) -C relapack -endif - -prof_lapack : lapack_prebuild - @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof - -lapack_prebuild : -ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) - -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc - -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc -ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1) - -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc -else - -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -endif - -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "LAPACKLIB = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "TMGLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "LAPACKELIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -ifeq ($(F_COMPILER), GFORTRAN) - -@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc -ifdef SMP -ifeq ($(OSNAME), WINNT) - -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc -else ifeq ($(OSNAME), Haiku) - -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc -else - -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc +ddot.$(SUFFIX) ddot.$(PSUFFIX) : dot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qdot.$(SUFFIX) qdot.$(PSUFFIX) : dot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +cdotu.$(SUFFIX) cdotu.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -UCONJ $< -o $(@F) + +cdotc.$(SUFFIX) cdotc.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +zdotu.$(SUFFIX) zdotu.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -UCONJ $< -o $(@F) + +zdotc.$(SUFFIX) zdotc.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +xdotu.$(SUFFIX) xdotu.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -UCONJ $< -o $(@F) + +xdotc.$(SUFFIX) xdotc.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +saxpy.$(SUFFIX) saxpy.$(PSUFFIX) : axpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +daxpy.$(SUFFIX) daxpy.$(PSUFFIX) : axpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qaxpy.$(SUFFIX) qaxpy.$(PSUFFIX) : axpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +caxpy.$(SUFFIX) caxpy.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +zaxpy.$(SUFFIX) zaxpy.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +xaxpy.$(SUFFIX) xaxpy.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +caxpyc.$(SUFFIX) caxpyc.$(PSUFFIX) : 
zaxpy.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +zaxpyc.$(SUFFIX) zaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +xaxpyc.$(SUFFIX) xaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qscal.$(SUFFIX) qscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +cscal.$(SUFFIX) cscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +zscal.$(SUFFIX) zscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +xscal.$(SUFFIX) xscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +csscal.$(SUFFIX) csscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c -DSSCAL $< -o $(@F) + +zdscal.$(SUFFIX) zdscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c -DSSCAL $< -o $(@F) + +xqscal.$(SUFFIX) xqscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c -DSSCAL $< -o $(@F) + +scopy.$(SUFFIX) scopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dcopy.$(SUFFIX) dcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qcopy.$(SUFFIX) qcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +ccopy.$(SUFFIX) ccopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +zcopy.$(SUFFIX) zcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +xcopy.$(SUFFIX) xcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +sswap.$(SUFFIX) sswap.$(PSUFFIX) : swap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dswap.$(SUFFIX) dswap.$(PSUFFIX) : swap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qswap.$(SUFFIX) qswap.$(PSUFFIX) : swap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +cswap.$(SUFFIX) cswap.$(PSUFFIX) : zswap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +zswap.$(SUFFIX) zswap.$(PSUFFIX) : zswap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +xswap.$(SUFFIX) xswap.$(PSUFFIX) : zswap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +sger.$(SUFFIX) sger.$(PSUFFIX) : ger.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dger.$(SUFFIX) dger.$(PSUFFIX) : ger.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qger.$(SUFFIX) qger.$(PSUFFIX) : ger.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgeru.$(SUFFIX) cgeru.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -UCONJ $< -o $(@F) + +cgerc.$(SUFFIX) cgerc.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) + +zgeru.$(SUFFIX) zgeru.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -UCONJ $< -o $(@F) + +zgerc.$(SUFFIX) zgerc.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) + +xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -UCONJ $< -o $(@F) + +xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) + +ifeq ($(BUILD_BFLOAT16),1) +sbgemv.$(SUFFIX) sbgemv.$(PSUFFIX) : sbgemv.c + $(CC) $(CFLAGS) -c $< -o $(@F) endif + +ifndef USE_NETLIB_GEMV +sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +dgemv.$(SUFFIX) dgemv.$(PSUFFIX): gemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< else - -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +sgemv.$(SUFFIX) sgemv.$(PSUFFIX): netlib/sgemv.f + $(FC) -c $(FFLAGS) -o $(@F) $< + +dgemv.$(SUFFIX) dgemv.$(PSUFFIX): netlib/dgemv.f + $(FC) -c $(FFLAGS) -o $(@F) $< endif + +qgemv.$(SUFFIX) qgemv.$(PSUFFIX): gemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +ifndef USE_NETLIB_GEMV +cgemv.$(SUFFIX) cgemv.$(PSUFFIX): zgemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +zgemv.$(SUFFIX) zgemv.$(PSUFFIX): zgemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< else - -@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc -endif 
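A pattern worth spelling out in the interface rules above: one complex source file is compiled twice, with -UCONJ and -DCONJ selecting the unconjugated and conjugated entry points (cdotu/cdotc from zdot.c, cgeru/cgerc from zger.c, caxpy/caxpyc from zaxpy.c). The sketch below shows how such a source can branch on the macro; cdot_ref is a hypothetical name using C99 complex types, not OpenBLAS's internal FLOAT/COMPLEX machinery:

#include <complex.h>

/* Compiled with -UCONJ this is a cdotu-style product; with -DCONJ
   the first operand is conjugated, cdotc-style. One source file,
   two object files, selected purely by the compile flag. */
float complex cdot_ref(int n, const float complex *x,
                       const float complex *y)
{
    float complex sum = 0.0f;
    for (int i = 0; i < n; i++)
#ifdef CONJ
        sum += conjf(x[i]) * y[i];
#else
        sum += x[i] * y[i];
#endif
    return sum;
}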
-ifeq ($(BUILD_LAPACK_DEPRECATED), 1) - -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc -endif -ifeq ($(BUILD_SINGLE), 1) - -@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc -endif -ifeq ($(BUILD_DOUBLE), 1) - -@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc -endif -ifeq ($(BUILD_COMPLEX), 1) - -@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc -endif -ifeq ($(BUILD_COMPLEX16), 1) - -@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc -endif - -@echo "LAPACKE_WITH_TMG = 1" >> $(NETLIB_LAPACK_DIR)/make.inc - -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc -endif +cgemv.$(SUFFIX) cgemv.$(PSUFFIX): netlib/cgemv.f + $(FC) -c $(FFLAGS) -o $(@F) $< -large.tgz : -ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) - if [ ! -a $< ]; then - -wget http://www.netlib.org/lapack/timing/large.tgz; - fi +zgemv.$(SUFFIX) zgemv.$(PSUFFIX): netlib/zgemv.f + $(FC) -c $(FFLAGS) -o $(@F) $< endif -timing.tgz : -ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) - if [ ! -a $< ]; then - -wget http://www.netlib.org/lapack/timing/timing.tgz; - fi -endif +xgemv.$(SUFFIX) xgemv.$(PSUFFIX): zgemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< -lapack-timing : large.tgz timing.tgz -ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) - (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) - (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) - $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING -endif +strsv.$(SUFFIX) strsv.$(PSUFFIX) : trsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) +dtrsv.$(SUFFIX) dtrsv.$(PSUFFIX) : trsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) -lapack-test : - (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) - $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz - $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc -ifneq ($(CROSS), 1) - ( cd $(NETLIB_LAPACK_DIR)/INSTALL; $(MAKE) all; ./testlsame; ./testslamch; ./testdlamch; \ - ./testsecond; ./testdsecnd; ./testieee; ./testversion ) - (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING) -endif +qtrsv.$(SUFFIX) qtrsv.$(PSUFFIX) : trsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) -lapack-runtest: - ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ - ./testsecond; ./testdsecnd; ./testieee; ./testversion ) - (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) +ctrsv.$(SUFFIX) ctrsv.$(PSUFFIX) : ztrsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) +ztrsv.$(SUFFIX) ztrsv.$(PSUFFIX) : ztrsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) -blas-test: - (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out) - $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing - (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out) +xtrsv.$(SUFFIX) xtrsv.$(PSUFFIX) : ztrsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) +strmv.$(SUFFIX) strmv.$(PSUFFIX) : trmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) -dummy : +dtrmv.$(SUFFIX) dtrmv.$(PSUFFIX) : trmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) -install : - $(MAKE) -f Makefile.install install +qtrmv.$(SUFFIX) qtrmv.$(PSUFFIX) : trmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctrmv.$(SUFFIX) ctrmv.$(PSUFFIX) : ztrmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztrmv.$(SUFFIX) ztrmv.$(PSUFFIX) : ztrmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtrmv.$(SUFFIX) xtrmv.$(PSUFFIX) : ztrmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssymv.$(SUFFIX) ssymv.$(PSUFFIX) : symv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsymv.$(SUFFIX) dsymv.$(PSUFFIX) : symv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsymv.$(SUFFIX) qsymv.$(PSUFFIX) : symv.c + $(CC) -c 
$(CFLAGS) $< -o $(@F) + +csymv.$(SUFFIX) csymv.$(PSUFFIX) : zsymv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsymv.$(SUFFIX) zsymv.$(PSUFFIX) : zsymv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsymv.$(SUFFIX) xsymv.$(PSUFFIX) : zsymv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssyr.$(SUFFIX) ssyr.$(PSUFFIX) : syr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsyr.$(SUFFIX) dsyr.$(PSUFFIX) : syr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsyr.$(SUFFIX) qsyr.$(PSUFFIX) : syr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csyr.$(SUFFIX) csyr.$(PSUFFIX) : zsyr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsyr.$(SUFFIX) zsyr.$(PSUFFIX) : zsyr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsyr.$(SUFFIX) xsyr.$(PSUFFIX) : zsyr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssyr2.$(SUFFIX) ssyr2.$(PSUFFIX) : syr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsyr2.$(SUFFIX) dsyr2.$(PSUFFIX) : syr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsyr2.$(SUFFIX) qsyr2.$(PSUFFIX) : syr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csyr2.$(SUFFIX) csyr2.$(PSUFFIX) : zsyr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsyr2.$(SUFFIX) zsyr2.$(PSUFFIX) : zsyr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsyr2.$(SUFFIX) xsyr2.$(PSUFFIX) : zsyr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sgbmv.$(SUFFIX) sgbmv.$(PSUFFIX): gbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +dgbmv.$(SUFFIX) dgbmv.$(PSUFFIX): gbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +qgbmv.$(SUFFIX) qgbmv.$(PSUFFIX): gbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +cgbmv.$(SUFFIX) cgbmv.$(PSUFFIX): zgbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +zgbmv.$(SUFFIX) zgbmv.$(PSUFFIX): zgbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +xgbmv.$(SUFFIX) xgbmv.$(PSUFFIX): zgbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +ssbmv.$(SUFFIX) ssbmv.$(PSUFFIX) : sbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsbmv.$(SUFFIX) dsbmv.$(PSUFFIX) : sbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsbmv.$(SUFFIX) qsbmv.$(PSUFFIX) : sbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csbmv.$(SUFFIX) csbmv.$(PSUFFIX) : zsbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsbmv.$(SUFFIX) zsbmv.$(PSUFFIX) : zsbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsbmv.$(SUFFIX) xsbmv.$(PSUFFIX) : zsbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sspmv.$(SUFFIX) sspmv.$(PSUFFIX) : spmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dspmv.$(SUFFIX) dspmv.$(PSUFFIX) : spmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qspmv.$(SUFFIX) qspmv.$(PSUFFIX) : spmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cspmv.$(SUFFIX) cspmv.$(PSUFFIX) : zspmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zspmv.$(SUFFIX) zspmv.$(PSUFFIX) : zspmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xspmv.$(SUFFIX) xspmv.$(PSUFFIX) : zspmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sspr.$(SUFFIX) sspr.$(PSUFFIX) : spr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dspr.$(SUFFIX) dspr.$(PSUFFIX) : spr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qspr.$(SUFFIX) qspr.$(PSUFFIX) : spr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cspr.$(SUFFIX) cspr.$(PSUFFIX) : zspr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zspr.$(SUFFIX) zspr.$(PSUFFIX) : zspr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xspr.$(SUFFIX) xspr.$(PSUFFIX) : zspr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sspr2.$(SUFFIX) sspr2.$(PSUFFIX) : spr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dspr2.$(SUFFIX) dspr2.$(PSUFFIX) : spr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qspr2.$(SUFFIX) qspr2.$(PSUFFIX) : spr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cspr2.$(SUFFIX) cspr2.$(PSUFFIX) : zspr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zspr2.$(SUFFIX) zspr2.$(PSUFFIX) : zspr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xspr2.$(SUFFIX) xspr2.$(PSUFFIX) : zspr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +stbmv.$(SUFFIX) stbmv.$(PSUFFIX) : 
tbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtbmv.$(SUFFIX) dtbmv.$(PSUFFIX) : tbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtbmv.$(SUFFIX) qtbmv.$(PSUFFIX) : tbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctbmv.$(SUFFIX) ctbmv.$(PSUFFIX) : ztbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztbmv.$(SUFFIX) ztbmv.$(PSUFFIX) : ztbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtbmv.$(SUFFIX) xtbmv.$(PSUFFIX) : ztbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +stbsv.$(SUFFIX) stbsv.$(PSUFFIX) : tbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtbsv.$(SUFFIX) dtbsv.$(PSUFFIX) : tbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtbsv.$(SUFFIX) qtbsv.$(PSUFFIX) : tbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctbsv.$(SUFFIX) ctbsv.$(PSUFFIX) : ztbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztbsv.$(SUFFIX) ztbsv.$(PSUFFIX) : ztbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtbsv.$(SUFFIX) xtbsv.$(PSUFFIX) : ztbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +stpsv.$(SUFFIX) stpsv.$(PSUFFIX) : tpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtpsv.$(SUFFIX) dtpsv.$(PSUFFIX) : tpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtpsv.$(SUFFIX) qtpsv.$(PSUFFIX) : tpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctpsv.$(SUFFIX) ctpsv.$(PSUFFIX) : ztpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztpsv.$(SUFFIX) ztpsv.$(PSUFFIX) : ztpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtpsv.$(SUFFIX) xtpsv.$(PSUFFIX) : ztpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +stpmv.$(SUFFIX) stpmv.$(PSUFFIX) : tpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtpmv.$(SUFFIX) dtpmv.$(PSUFFIX) : tpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtpmv.$(SUFFIX) qtpmv.$(PSUFFIX) : tpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctpmv.$(SUFFIX) ctpmv.$(PSUFFIX) : ztpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztpmv.$(SUFFIX) ztpmv.$(PSUFFIX) : ztpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtpmv.$(SUFFIX) xtpmv.$(PSUFFIX) : ztpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chemv.$(SUFFIX) chemv.$(PSUFFIX) : zhemv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zhemv.$(SUFFIX) zhemv.$(PSUFFIX) : zhemv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xhemv.$(SUFFIX) xhemv.$(PSUFFIX) : zhemv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chbmv.$(SUFFIX) chbmv.$(PSUFFIX) : zhbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zhbmv.$(SUFFIX) zhbmv.$(PSUFFIX) : zhbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xhbmv.$(SUFFIX) xhbmv.$(PSUFFIX) : zhbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cher.$(SUFFIX) cher.$(PSUFFIX) : zher.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zher.$(SUFFIX) zher.$(PSUFFIX) : zher.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xher.$(SUFFIX) xher.$(PSUFFIX) : zher.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cher2.$(SUFFIX) cher2.$(PSUFFIX) : zher2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zher2.$(SUFFIX) zher2.$(PSUFFIX) : zher2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xher2.$(SUFFIX) xher2.$(PSUFFIX) : zher2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chpmv.$(SUFFIX) chpmv.$(PSUFFIX) : zhpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zhpmv.$(SUFFIX) zhpmv.$(PSUFFIX) : zhpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xhpmv.$(SUFFIX) xhpmv.$(PSUFFIX) : zhpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chpr.$(SUFFIX) chpr.$(PSUFFIX) : zhpr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zhpr.$(SUFFIX) zhpr.$(PSUFFIX) : zhpr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xhpr.$(SUFFIX) xhpr.$(PSUFFIX) : zhpr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chpr2.$(SUFFIX) chpr2.$(PSUFFIX) : zhpr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : 
gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) +endif + +sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgemm.$(SUFFIX) dgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgemm.$(SUFFIX) qgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgemm.$(SUFFIX) cgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgemm.$(SUFFIX) zgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsymm.$(SUFFIX) dsymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsymm.$(SUFFIX) qsymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csymm.$(SUFFIX) csymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsymm.$(SUFFIX) zsymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsymm.$(SUFFIX) xsymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +strmm.$(SUFFIX) strmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +dtrmm.$(SUFFIX) dtrmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +qtrmm.$(SUFFIX) qtrmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +ctrmm.$(SUFFIX) ctrmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +ztrmm.$(SUFFIX) ztrmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +xtrmm.$(SUFFIX) xtrmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +strsm.$(SUFFIX) strsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtrsm.$(SUFFIX) dtrsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtrsm.$(SUFFIX) qtrsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctrsm.$(SUFFIX) ctrsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztrsm.$(SUFFIX) ztrsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtrsm.$(SUFFIX) xtrsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssyrk.$(SUFFIX) ssyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsyrk.$(SUFFIX) dsyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsyrk.$(SUFFIX) qsyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csyrk.$(SUFFIX) csyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsyrk.$(SUFFIX) zsyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsyrk.$(SUFFIX) xsyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssyr2k.$(SUFFIX) ssyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsyr2k.$(SUFFIX) dsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsyr2k.$(SUFFIX) qsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csyr2k.$(SUFFIX) csyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsyr2k.$(SUFFIX) zsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsyr2k.$(SUFFIX) xsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chemm.$(SUFFIX) chemm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +zhemm.$(SUFFIX) zhemm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +xhemm.$(SUFFIX) xhemm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +cherk.$(SUFFIX) cherk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +zherk.$(SUFFIX) zherk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +xherk.$(SUFFIX) xherk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +cher2k.$(SUFFIX) cher2k.$(PSUFFIX) : syr2k.c + 
$(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +zher2k.$(SUFFIX) zher2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +xher2k.$(SUFFIX) xher2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +cgemm3m.$(SUFFIX) cgemm3m.$(PSUFFIX) : gemm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +zgemm3m.$(SUFFIX) zgemm3m.$(PSUFFIX) : gemm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +xgemm3m.$(SUFFIX) xgemm3m.$(PSUFFIX) : gemm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +csymm3m.$(SUFFIX) csymm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +zsymm3m.$(SUFFIX) zsymm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +xsymm3m.$(SUFFIX) xsymm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +chemm3m.$(SUFFIX) chemm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M -DHEMM $< -o $(@F) + +zhemm3m.$(SUFFIX) zhemm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M -DHEMM $< -o $(@F) + +xhemm3m.$(SUFFIX) xhemm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M -DHEMM $< -o $(@F) + +cblas_isamax.$(SUFFIX) cblas_isamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_idamax.$(SUFFIX) cblas_idamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_idmax.$(SUFFIX) cblas_idmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_icmax.$(SUFFIX) cblas_icmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_izmax.$(SUFFIX) cblas_izmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dasum.$(SUFFIX) cblas_dasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + 
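The block above is the densest example of flag-driven code reuse in this Makefile: a single imax.c yields the whole i?amax/i?amin/i?max/i?min family, with -DUSE_ABS/-UUSE_ABS switching between magnitude and signed comparison, -DUSE_MIN/-UUSE_MIN flipping the search direction, and -DCBLAS adding the C-interface entry point. A stripped-down sketch of that dispatch (iselect_ref is a hypothetical name; the real source also covers complex inputs and OpenBLAS's stride conventions):

#include <math.h>

#ifdef USE_ABS
#define KEY(v) fabsf(v)              /* compare magnitudes: amax/amin */
#else
#define KEY(v) (v)                   /* compare signed values: max/min */
#endif

#ifdef USE_MIN
#define IMPROVES(a, b) ((a) < (b))   /* search for the minimum */
#else
#define IMPROVES(a, b) ((a) > (b))   /* search for the maximum */
#endif

/* Returns a 1-based index, as the BLAS i?amax family does,
   or 0 when the input is empty. */
int iselect_ref(int n, const float *x, int incx)
{
    if (n <= 0 || incx <= 0) return 0;
    int best = 0;
    float bestkey = KEY(x[0]);
    for (int i = 1; i < n; i++) {
        float k = KEY(x[i * incx]);
        if (IMPROVES(k, bestkey)) { bestkey = k; best = i; }
    }
    return best + 1;
}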
+cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbdot.$(SUFFIX) cblas_sbdot.$(PSUFFIX) : bf16dot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_sbstobf16.$(SUFFIX) cblas_sbstobf16.$(PSUFFIX) : tobf16.c + $(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) +cblas_sbdtobf16.$(SUFFIX) cblas_sbdtobf16.$(PSUFFIX) : tobf16.c + $(CC) $(CFLAGS) -DCBLAS -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) +cblas_sbf16tos.$(SUFFIX) cblas_sbf16tos.$(PSUFFIX) : bf16to.c + $(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) +cblas_dbf16tod.$(SUFFIX) cblas_dbf16tod.$(PSUFFIX) : bf16to.c + $(CC) $(CFLAGS) -DCBLAS -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) +endif + +cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_cdotu.$(SUFFIX) cblas_cdotu.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -c -UCONJ $< -o $(@F) + +cblas_cdotc.$(SUFFIX) cblas_cdotc.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_zdotu.$(SUFFIX) cblas_zdotu.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -c -UCONJ $< -o $(@F) + +cblas_zdotc.$(SUFFIX) cblas_zdotc.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_cdotu_sub.$(SUFFIX) cblas_cdotu_sub.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -UCONJ $< -o $(@F) + +cblas_cdotc_sub.$(SUFFIX) cblas_cdotc_sub.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -DCONJ $< -o $(@F) + +cblas_zdotu_sub.$(SUFFIX) cblas_zdotu_sub.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -UCONJ $< -o $(@F) + +cblas_zdotc_sub.$(SUFFIX) cblas_zdotc_sub.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -DCONJ $< -o $(@F) + +cblas_snrm2.$(SUFFIX) cblas_snrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dnrm2.$(SUFFIX) cblas_dnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_scnrm2.$(SUFFIX) cblas_scnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dznrm2.$(SUFFIX) cblas_dznrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_saxpy.$(SUFFIX) cblas_saxpy.$(PSUFFIX) : axpy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_scopy.$(SUFFIX) cblas_scopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dcopy.$(SUFFIX) cblas_dcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_ccopy.$(SUFFIX) cblas_ccopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_zcopy.$(SUFFIX) cblas_zcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_sswap.$(SUFFIX) cblas_sswap.$(PSUFFIX) : swap.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dswap.$(SUFFIX) cblas_dswap.$(PSUFFIX) : swap.c + $(CC) $(CFLAGS) 
-DCBLAS -c $< -o $(@F)
+
+cblas_cswap.$(SUFFIX) cblas_cswap.$(PSUFFIX) : zswap.c
+	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+cblas_zswap.$(SUFFIX) cblas_zswap.$(PSUFFIX) : zswap.c
+	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+cblas_srot.$(SUFFIX) cblas_srot.$(PSUFFIX) : rot.c
+	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+cblas_drot.$(SUFFIX) cblas_drot.$(PSUFFIX) : rot.c
+	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c
+	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c
+	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c
+	$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
+
+cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c
+	$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
+
+cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c
+	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+cblas_drotm.$(SUFFIX) cblas_drotm.$(PSUFFIX): rotm.c
+	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+cblas_srotmg.$(SUFFIX) cblas_srotmg.$(PSUFFIX): rotmg.c
+	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+cblas_drotmg.$(SUFFIX) cblas_drotmg.$(PSUFFIX): rotmg.c
+	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+cblas_sscal.$(SUFFIX) cblas_sscal.$(PSUFFIX) : scal.c
+	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+cblas_dscal.$(SUFFIX) cblas_dscal.$(PSUFFIX) : scal.c
+	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+cblas_cscal.$(SUFFIX) cblas_cscal.$(PSUFFIX) : zscal.c
+	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+cblas_zscal.$(SUFFIX) cblas_zscal.$(PSUFFIX) : zscal.c
+	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F)
+
+cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F)
+
+cblas_csrot.$(SUFFIX) cblas_csrot.$(PSUFFIX) : zrot.c
+	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c
+	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
+
+ifeq ($(BUILD_BFLOAT16),1)
+cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c
+	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
+endif
+
+cblas_sgemv.$(SUFFIX) cblas_sgemv.$(PSUFFIX): gemv.c
+	$(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $<
+
+cblas_dgemv.$(SUFFIX) cblas_dgemv.$(PSUFFIX): gemv.c
+	$(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $<
+
+cblas_cgemv.$(SUFFIX) cblas_cgemv.$(PSUFFIX): zgemv.c
+	$(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $<
+
+cblas_zgemv.$(SUFFIX) cblas_zgemv.$(PSUFFIX): zgemv.c
+	$(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $<
+
+cblas_sger.$(SUFFIX) cblas_sger.$(PSUFFIX) : ger.c
+	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
+
+cblas_dger.$(SUFFIX) cblas_dger.$(PSUFFIX) : ger.c
+	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
+
+cblas_cgeru.$(SUFFIX) cblas_cgeru.$(PSUFFIX) : zger.c
+	$(CC) -DCBLAS -c $(CFLAGS) -UCONJ $< -o $(@F)
+
+cblas_cgerc.$(SUFFIX) cblas_cgerc.$(PSUFFIX) : zger.c
+	$(CC) -DCBLAS -c $(CFLAGS) -DCONJ $< -o $(@F)
+
+cblas_zgeru.$(SUFFIX) cblas_zgeru.$(PSUFFIX) : zger.c
+	$(CC) -DCBLAS -c $(CFLAGS) -UCONJ $< -o $(@F)
+
+cblas_zgerc.$(SUFFIX) cblas_zgerc.$(PSUFFIX) : zger.c
+	$(CC) -DCBLAS -c $(CFLAGS) -DCONJ $< -o $(@F)
+
+cblas_strsv.$(SUFFIX) cblas_strsv.$(PSUFFIX) : trsv.c
+	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
+
+cblas_dtrsv.$(SUFFIX) cblas_dtrsv.$(PSUFFIX) : trsv.c
+	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
+
+cblas_ctrsv.$(SUFFIX) cblas_ctrsv.$(PSUFFIX) : ztrsv.c
+	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
+
+cblas_ztrsv.$(SUFFIX) cblas_ztrsv.$(PSUFFIX) : ztrsv.c
+	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
+
+cblas_strmv.$(SUFFIX)
cblas_strmv.$(PSUFFIX) : trmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtrmv.$(SUFFIX) cblas_dtrmv.$(PSUFFIX) : trmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctrmv.$(SUFFIX) cblas_ctrmv.$(PSUFFIX) : ztrmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztrmv.$(SUFFIX) cblas_ztrmv.$(PSUFFIX) : ztrmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssyr.$(SUFFIX) cblas_ssyr.$(PSUFFIX) : syr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsyr.$(SUFFIX) cblas_dsyr.$(PSUFFIX) : syr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cher.$(SUFFIX) cblas_cher.$(PSUFFIX) : zher.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zher.$(SUFFIX) cblas_zher.$(PSUFFIX) : zher.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssyr2.$(SUFFIX) cblas_ssyr2.$(PSUFFIX) : syr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsyr2.$(SUFFIX) cblas_dsyr2.$(PSUFFIX) : syr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cher2.$(SUFFIX) cblas_cher2.$(PSUFFIX) : zher2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zher2.$(SUFFIX) cblas_zher2.$(PSUFFIX) : zher2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_sgbmv.$(SUFFIX) cblas_sgbmv.$(PSUFFIX): gbmv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_dgbmv.$(SUFFIX) cblas_dgbmv.$(PSUFFIX): gbmv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_cgbmv.$(SUFFIX) cblas_cgbmv.$(PSUFFIX): zgbmv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_zgbmv.$(SUFFIX) cblas_zgbmv.$(PSUFFIX): zgbmv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_ssbmv.$(SUFFIX) cblas_ssbmv.$(PSUFFIX) : sbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsbmv.$(SUFFIX) cblas_dsbmv.$(PSUFFIX) : sbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chbmv.$(SUFFIX) cblas_chbmv.$(PSUFFIX) : zhbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zhbmv.$(SUFFIX) cblas_zhbmv.$(PSUFFIX) : zhbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_sspmv.$(SUFFIX) cblas_sspmv.$(PSUFFIX) : spmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dspmv.$(SUFFIX) cblas_dspmv.$(PSUFFIX) : spmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_sspr.$(SUFFIX) cblas_sspr.$(PSUFFIX) : spr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dspr.$(SUFFIX) cblas_dspr.$(PSUFFIX) : spr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chpr.$(SUFFIX) cblas_chpr.$(PSUFFIX) : zhpr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zhpr.$(SUFFIX) cblas_zhpr.$(PSUFFIX) : zhpr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_sspr2.$(SUFFIX) cblas_sspr2.$(PSUFFIX) : spr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dspr2.$(SUFFIX) cblas_dspr2.$(PSUFFIX) : spr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chpr2.$(SUFFIX) cblas_chpr2.$(PSUFFIX) : zhpr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zhpr2.$(SUFFIX) cblas_zhpr2.$(PSUFFIX) : zhpr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_stbmv.$(SUFFIX) cblas_stbmv.$(PSUFFIX) : tbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtbmv.$(SUFFIX) cblas_dtbmv.$(PSUFFIX) : tbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctbmv.$(SUFFIX) cblas_ctbmv.$(PSUFFIX) : ztbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztbmv.$(SUFFIX) cblas_ztbmv.$(PSUFFIX) : ztbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_stbsv.$(SUFFIX) cblas_stbsv.$(PSUFFIX) : tbsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtbsv.$(SUFFIX) cblas_dtbsv.$(PSUFFIX) : tbsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctbsv.$(SUFFIX) cblas_ctbsv.$(PSUFFIX) : ztbsv.c + 
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztbsv.$(SUFFIX) cblas_ztbsv.$(PSUFFIX) : ztbsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_stpmv.$(SUFFIX) cblas_stpmv.$(PSUFFIX) : tpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtpmv.$(SUFFIX) cblas_dtpmv.$(PSUFFIX) : tpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctpmv.$(SUFFIX) cblas_ctpmv.$(PSUFFIX) : ztpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztpmv.$(SUFFIX) cblas_ztpmv.$(PSUFFIX) : ztpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chpmv.$(SUFFIX) cblas_chpmv.$(PSUFFIX) : zhpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zhpmv.$(SUFFIX) cblas_zhpmv.$(PSUFFIX) : zhpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_stpsv.$(SUFFIX) cblas_stpsv.$(PSUFFIX) : tpsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtpsv.$(SUFFIX) cblas_dtpsv.$(PSUFFIX) : tpsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctpsv.$(SUFFIX) cblas_ctpsv.$(PSUFFIX) : ztpsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztpsv.$(SUFFIX) cblas_ztpsv.$(PSUFFIX) : ztpsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssymv.$(SUFFIX) cblas_ssymv.$(PSUFFIX) : symv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsymv.$(SUFFIX) cblas_dsymv.$(PSUFFIX) : symv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chemv.$(SUFFIX) cblas_chemv.$(PSUFFIX) : zhemv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbgemm.$(SUFFIX) cblas_sbgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif + +cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cgemm.$(SUFFIX) cblas_cgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zgemm.$(SUFFIX) cblas_zgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssymm.$(SUFFIX) cblas_ssymm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsymm.$(SUFFIX) cblas_dsymm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_csymm.$(SUFFIX) cblas_csymm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zsymm.$(SUFFIX) cblas_zsymm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssyrk.$(SUFFIX) cblas_ssyrk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsyrk.$(SUFFIX) cblas_dsyrk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_csyrk.$(SUFFIX) cblas_csyrk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zsyrk.$(SUFFIX) cblas_zsyrk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssyr2k.$(SUFFIX) cblas_ssyr2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsyr2k.$(SUFFIX) cblas_dsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_csyr2k.$(SUFFIX) cblas_csyr2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zsyr2k.$(SUFFIX) cblas_zsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_strmm.$(SUFFIX) cblas_strmm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) + +cblas_dtrmm.$(SUFFIX) cblas_dtrmm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) + +cblas_ctrmm.$(SUFFIX) cblas_ctrmm.$(PSUFFIX) : trsm.c 
+ $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) + +cblas_ztrmm.$(SUFFIX) cblas_ztrmm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) + +cblas_strsm.$(SUFFIX) cblas_strsm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtrsm.$(SUFFIX) cblas_dtrsm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctrsm.$(SUFFIX) cblas_ctrsm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztrsm.$(SUFFIX) cblas_ztrsm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chemm.$(SUFFIX) cblas_chemm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +cblas_zhemm.$(SUFFIX) cblas_zhemm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +cblas_cherk.$(SUFFIX) cblas_cherk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +cblas_zherk.$(SUFFIX) cblas_zherk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +cblas_cher2k.$(SUFFIX) cblas_cher2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +cblas_zher2k.$(SUFFIX) cblas_zher2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +cblas_cgemm3m.$(SUFFIX) cblas_cgemm3m.$(PSUFFIX) : gemm.c + $(CC) -DCBLAS -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +cblas_zgemm3m.$(SUFFIX) cblas_zgemm3m.$(PSUFFIX) : gemm.c + $(CC) -DCBLAS -c $(CFLAGS) -DGEMM3M $< -o $(@F) + + +sgetf2.$(SUFFIX) sgetf2.$(PSUFFIX) : lapack/getf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgetf2.$(SUFFIX) dgetf2.$(PSUFFIX) : lapack/getf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgetf2.$(SUFFIX) qgetf2.$(PSUFFIX) : getf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgetf2.$(SUFFIX) cgetf2.$(PSUFFIX) : lapack/zgetf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgetf2.$(SUFFIX) zgetf2.$(PSUFFIX) : lapack/zgetf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgetf2.$(SUFFIX) xgetf2.$(PSUFFIX) : zgetf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sgetrf.$(SUFFIX) sgetrf.$(PSUFFIX) : lapack/getrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgetrf.$(SUFFIX) dgetrf.$(PSUFFIX) : lapack/getrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgetrf.$(SUFFIX) qgetrf.$(PSUFFIX) : getrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgetrf.$(SUFFIX) cgetrf.$(PSUFFIX) : lapack/zgetrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgetrf.$(SUFFIX) zgetrf.$(PSUFFIX) : lapack/zgetrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgetrf.$(SUFFIX) xgetrf.$(PSUFFIX) : zgetrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +slauu2.$(SUFFIX) slauu2.$(PSUFFIX) : lapack/lauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dlauu2.$(SUFFIX) dlauu2.$(PSUFFIX) : lapack/lauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qlauu2.$(SUFFIX) qlauu2.$(PSUFFIX) : lauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +clauu2.$(SUFFIX) clauu2.$(PSUFFIX) : lapack/zlauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zlauu2.$(SUFFIX) zlauu2.$(PSUFFIX) : lapack/zlauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xlauu2.$(SUFFIX) xlauu2.$(PSUFFIX) : zlauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +slauum.$(SUFFIX) slauum.$(PSUFFIX) : lapack/lauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dlauum.$(SUFFIX) dlauum.$(PSUFFIX) : lapack/lauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qlauum.$(SUFFIX) qlauum.$(PSUFFIX) : lauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +clauum.$(SUFFIX) clauum.$(PSUFFIX) : lapack/zlauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zlauum.$(SUFFIX) zlauum.$(PSUFFIX) : lapack/zlauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xlauum.$(SUFFIX) xlauum.$(PSUFFIX) : zlauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +spotf2.$(SUFFIX) spotf2.$(PSUFFIX) : lapack/potf2.c + $(CC) -c $(CFLAGS) $< -o 
$(@F) + +dpotf2.$(SUFFIX) dpotf2.$(PSUFFIX) : lapack/potf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qpotf2.$(SUFFIX) qpotf2.$(PSUFFIX) : potf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cpotf2.$(SUFFIX) cpotf2.$(PSUFFIX) : lapack/zpotf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zpotf2.$(SUFFIX) zpotf2.$(PSUFFIX) : lapack/zpotf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xpotf2.$(SUFFIX) xpotf2.$(PSUFFIX) : zpotf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +spotrf.$(SUFFIX) spotrf.$(PSUFFIX) : lapack/potrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dpotrf.$(SUFFIX) dpotrf.$(PSUFFIX) : lapack/potrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qpotrf.$(SUFFIX) qpotrf.$(PSUFFIX) : potrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cpotrf.$(SUFFIX) cpotrf.$(PSUFFIX) : lapack/zpotrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zpotrf.$(SUFFIX) zpotrf.$(PSUFFIX) : lapack/zpotrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xpotrf.$(SUFFIX) xpotrf.$(PSUFFIX) : zpotrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +strti2.$(SUFFIX) strti2.$(PSUFFIX) : lapack/trti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtrti2.$(SUFFIX) dtrti2.$(PSUFFIX) : lapack/trti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtrti2.$(SUFFIX) qtrti2.$(PSUFFIX) : trti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctrti2.$(SUFFIX) ctrti2.$(PSUFFIX) : lapack/ztrti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztrti2.$(SUFFIX) ztrti2.$(PSUFFIX) : lapack/ztrti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtrti2.$(SUFFIX) xtrti2.$(PSUFFIX) : ztrti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +strtri.$(SUFFIX) strtri.$(PSUFFIX) : lapack/trtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : lapack/trtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtrtri.$(SUFFIX) qtrtri.$(PSUFFIX) : trtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : lapack/ztrtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : lapack/ztrtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtrtri.$(SUFFIX) xtrtri.$(PSUFFIX) : ztrtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +slaswp.$(SUFFIX) slaswp.$(PSUFFIX) : lapack/laswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dlaswp.$(SUFFIX) dlaswp.$(PSUFFIX) : lapack/laswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qlaswp.$(SUFFIX) qlaswp.$(PSUFFIX) : laswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +claswp.$(SUFFIX) claswp.$(PSUFFIX) : lapack/zlaswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zlaswp.$(SUFFIX) zlaswp.$(PSUFFIX) : lapack/zlaswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xlaswp.$(SUFFIX) xlaswp.$(PSUFFIX) : zlaswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sgetrs.$(SUFFIX) sgetrs.$(PSUFFIX) : lapack/getrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgetrs.$(SUFFIX) dgetrs.$(PSUFFIX) : lapack/getrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : lapack/getrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : lapack/zgetrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : lapack/zgetrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +strtrs.$(SUFFIX) strtrs.$(PSUFFIX) : lapack/trtrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtrtrs.$(SUFFIX) dtrtrs.$(PSUFFIX) : lapack/trtrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtrtrs.$(SUFFIX) qtrtrs.$(PSUFFIX) : lapack/trtrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctrtrs.$(SUFFIX) ctrtrs.$(PSUFFIX) : lapack/ztrtrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztrtrs.$(SUFFIX) ztrtrs.$(PSUFFIX) : lapack/ztrtrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtrtrs.$(SUFFIX) xtrtrs.$(PSUFFIX) : lapack/ztrtrs.c + $(CC) -c $(CFLAGS) 
$< -o $(@F) + +sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : lapack/gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgesv.$(SUFFIX) dgesv.$(PSUFFIX) : lapack/gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgesv.$(SUFFIX) qgesv.$(PSUFFIX) : gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgesv.$(SUFFIX) cgesv.$(PSUFFIX) : lapack/gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgesv.$(SUFFIX) zgesv.$(PSUFFIX) : lapack/gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgesv.$(SUFFIX) xgesv.$(PSUFFIX) : gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +spotri.$(SUFFIX) spotri.$(PSUFFIX) : lapack/potri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dpotri.$(SUFFIX) dpotri.$(PSUFFIX) : lapack/potri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qpotri.$(SUFFIX) qpotri.$(PSUFFIX) : potri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cpotri.$(SUFFIX) cpotri.$(PSUFFIX) : lapack/zpotri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zpotri.$(SUFFIX) zpotri.$(PSUFFIX) : lapack/zpotri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xpotri.$(SUFFIX) xpotri.$(PSUFFIX) : zpotri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +slarf.$(SUFFIX) slarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dlarf.$(SUFFIX) dlarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qlarf.$(SUFFIX) qlarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +clarf.$(SUFFIX) clarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zlarf.$(SUFFIX) zlarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xlarf.$(SUFFIX) xlarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + + +############# BLAS EXTENSIONS ##################################### + +daxpby.$(SUFFIX) daxpby.$(PSUFFIX) : axpby.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +cblas_daxpby.$(SUFFIX) cblas_daxpby.$(PSUFFIX) : axpby.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +saxpby.$(SUFFIX) saxpby.$(PSUFFIX) : axpby.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +cblas_saxpby.$(SUFFIX) cblas_saxpby.$(PSUFFIX) : axpby.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +zaxpby.$(SUFFIX) zaxpby.$(PSUFFIX) : zaxpby.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +cblas_zaxpby.$(SUFFIX) cblas_zaxpby.$(PSUFFIX) : zaxpby.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +caxpby.$(SUFFIX) caxpby.$(PSUFFIX) : zaxpby.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +cblas_caxpby.$(SUFFIX) cblas_caxpby.$(PSUFFIX) : zaxpby.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +domatcopy.$(SUFFIX) domatcopy.$(PSUFFIX) : omatcopy.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cblas_domatcopy.$(SUFFIX) cblas_domatcopy.$(PSUFFIX) : omatcopy.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +somatcopy.$(SUFFIX) somatcopy.$(PSUFFIX) : omatcopy.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cblas_somatcopy.$(SUFFIX) cblas_somatcopy.$(PSUFFIX) : omatcopy.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +comatcopy.$(SUFFIX) comatcopy.$(PSUFFIX) : zomatcopy.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cblas_comatcopy.$(SUFFIX) cblas_comatcopy.$(PSUFFIX) : zomatcopy.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +zomatcopy.$(SUFFIX) zomatcopy.$(PSUFFIX) : zomatcopy.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cblas_zomatcopy.$(SUFFIX) cblas_zomatcopy.$(PSUFFIX) : zomatcopy.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +dimatcopy.$(SUFFIX) dimatcopy.$(PSUFFIX) : imatcopy.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cblas_dimatcopy.$(SUFFIX) cblas_dimatcopy.$(PSUFFIX) : imatcopy.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +simatcopy.$(SUFFIX) simatcopy.$(PSUFFIX) : imatcopy.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cblas_simatcopy.$(SUFFIX) cblas_simatcopy.$(PSUFFIX) : imatcopy.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cimatcopy.$(SUFFIX) cimatcopy.$(PSUFFIX) : 
zimatcopy.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cblas_cimatcopy.$(SUFFIX) cblas_cimatcopy.$(PSUFFIX) : zimatcopy.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +zimatcopy.$(SUFFIX) zimatcopy.$(PSUFFIX) : zimatcopy.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cblas_zimatcopy.$(SUFFIX) cblas_zimatcopy.$(PSUFFIX) : zimatcopy.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +sgeadd.$(SUFFIX) sgeadd.$(PSUFFIX) : geadd.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgeadd.$(SUFFIX) dgeadd.$(PSUFFIX) : geadd.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgeadd.$(SUFFIX) cgeadd.$(PSUFFIX) : zgeadd.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgeadd.$(SUFFIX) zgeadd.$(PSUFFIX) : zgeadd.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cblas_sgeadd.$(SUFFIX) cblas_sgeadd.$(PSUFFIX) : geadd.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_dgeadd.$(SUFFIX) cblas_dgeadd.$(PSUFFIX) : geadd.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_cgeadd.$(SUFFIX) cblas_cgeadd.$(PSUFFIX) : zgeadd.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_xerbla.$(SUFFIX) cblas_xerbla.$(PSUFFIX) : xerbla.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) -clean :: - @for d in $(SUBDIRS_ALL) ; \ - do if test -d $$d; then \ - $(MAKE) -C $$d $(@F) || exit 1 ; \ - fi; \ - done -#ifdef DYNAMIC_ARCH - @$(MAKE) -C kernel clean -#endif - @$(MAKE) -C reference clean - @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0 -ifeq ($(OSNAME), Darwin) - @rm -rf getarch.dSYM getarch_2nd.dSYM -endif - @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib - @rm -f cblas.tmp cblas.tmp2 - @touch $(NETLIB_LAPACK_DIR)/make.inc - @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean - @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h - @$(MAKE) -C relapack clean - @rm -f *.grd Makefile.conf_last config_last.h - @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) - @echo Done. From bc5b35367fa8e8ba7b949641f95d308540c3920b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 13 Jan 2021 00:28:43 +0100 Subject: [PATCH 061/681] restore Makefile after accidental overwrite --- Makefile | 2613 +++++++----------------------------------------------- 1 file changed, 339 insertions(+), 2274 deletions(-) diff --git a/Makefile b/Makefile index fab403c82..de0735c4a 100644 --- a/Makefile +++ b/Makefile @@ -1,2337 +1,402 @@ -TOPDIR = .. -include $(TOPDIR)/Makefile.system +TOPDIR = . 
+include ./Makefile.system -SUPPORT_GEMM3M = 0 +BLASDIRS = interface driver/level2 driver/level3 driver/others -ifeq ($(ARCH), x86) -SUPPORT_GEMM3M = 1 +ifneq ($(DYNAMIC_ARCH), 1) +BLASDIRS += kernel endif -ifeq ($(ARCH), x86_64) -SUPPORT_GEMM3M = 1 +ifdef SANITY_CHECK +BLASDIRS += reference endif -ifeq ($(ARCH), ia64) -SUPPORT_GEMM3M = 1 +SUBDIRS = $(BLASDIRS) +ifneq ($(NO_LAPACK), 1) +SUBDIRS += lapack endif -ifeq ($(ARCH), MIPS) -SUPPORT_GEMM3M = 1 +RELA = +ifeq ($(BUILD_RELAPACK), 1) +RELA = re_lapack endif -ifneq ($(NO_FBLAS), 1) - -SBLAS1OBJS = \ - saxpy.$(SUFFIX) sswap.$(SUFFIX) \ - scopy.$(SUFFIX) sscal.$(SUFFIX) \ - sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \ - sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \ - smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \ - smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \ - srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \ - saxpby.$(SUFFIX) - -SBLAS2OBJS = \ - sgemv.$(SUFFIX) sger.$(SUFFIX) \ - strsv.$(SUFFIX) strmv.$(SUFFIX) ssymv.$(SUFFIX) \ - ssyr.$(SUFFIX) ssyr2.$(SUFFIX) sgbmv.$(SUFFIX) \ - ssbmv.$(SUFFIX) sspmv.$(SUFFIX) \ - sspr.$(SUFFIX) sspr2.$(SUFFIX) \ - stbsv.$(SUFFIX) stbmv.$(SUFFIX) \ - stpsv.$(SUFFIX) stpmv.$(SUFFIX) - -SBLAS3OBJS = \ - sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \ - strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \ - somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ - sgeadd.$(SUFFIX) - -ifeq ($(BUILD_BFLOAT16),1) -SBBLAS1OBJS = sbdot.$(SUFFIX) -SBBLAS2OBJS = sbgemv.$(SUFFIX) -SBBLAS3OBJS = sbgemm.$(SUFFIX) -SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) +ifeq ($(NO_FORTRAN), 1) +define NOFORTRAN +1 +endef +define NO_LAPACK +1 +endef +export NOFORTRAN +export NO_LAPACK endif -DBLAS1OBJS = \ - daxpy.$(SUFFIX) dswap.$(SUFFIX) \ - dcopy.$(SUFFIX) dscal.$(SUFFIX) \ - ddot.$(SUFFIX) \ - dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \ - dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \ - dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \ - drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \ - daxpby.$(SUFFIX) - -DBLAS2OBJS = \ - dgemv.$(SUFFIX) dger.$(SUFFIX) \ - dtrsv.$(SUFFIX) dtrmv.$(SUFFIX) dsymv.$(SUFFIX) \ - dsyr.$(SUFFIX) dsyr2.$(SUFFIX) dgbmv.$(SUFFIX) \ - dsbmv.$(SUFFIX) dspmv.$(SUFFIX) \ - dspr.$(SUFFIX) dspr2.$(SUFFIX) \ - dtbsv.$(SUFFIX) dtbmv.$(SUFFIX) \ - dtpsv.$(SUFFIX) dtpmv.$(SUFFIX) - -DBLAS3OBJS = \ - dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \ - dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \ - domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\ - dgeadd.$(SUFFIX) - -CBLAS1OBJS = \ - caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ - ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \ - cdotc.$(SUFFIX) cdotu.$(SUFFIX) \ - scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \ - scamax.$(SUFFIX) icamax.$(SUFFIX) \ - scamin.$(SUFFIX) icamin.$(SUFFIX) \ - csrot.$(SUFFIX) crotg.$(SUFFIX) \ - caxpby.$(SUFFIX) +LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) -CBLAS2OBJS = \ - cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ - ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \ - csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ - csbmv.$(SUFFIX) \ - cspr2.$(SUFFIX) \ - ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ - ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ - chemv.$(SUFFIX) chbmv.$(SUFFIX) \ - cher.$(SUFFIX) cher2.$(SUFFIX) \ - chpmv.$(SUFFIX) chpr.$(SUFFIX) chpr2.$(SUFFIX) +SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test -CBLAS3OBJS = \ - 
cgemm.$(SUFFIX) csymm.$(SUFFIX) ctrmm.$(SUFFIX) \ - ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \ - chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \ - comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\ - cgeadd.$(SUFFIX) - -ZBLAS1OBJS = \ - zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ - zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \ - zdotc.$(SUFFIX) zdotu.$(SUFFIX) \ - dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \ - dzamax.$(SUFFIX) izamax.$(SUFFIX) \ - dzamin.$(SUFFIX) izamin.$(SUFFIX) \ - zdrot.$(SUFFIX) zrotg.$(SUFFIX) \ - zaxpby.$(SUFFIX) - -ZBLAS2OBJS = \ - zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ - ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \ - zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ - zsbmv.$(SUFFIX) \ - zspr2.$(SUFFIX) \ - ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ - ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ - zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \ - zher.$(SUFFIX) zher2.$(SUFFIX) \ - zhpmv.$(SUFFIX) zhpr.$(SUFFIX) zhpr2.$(SUFFIX) - -ZBLAS3OBJS = \ - zgemm.$(SUFFIX) zsymm.$(SUFFIX) ztrmm.$(SUFFIX) \ - ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \ - zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \ - zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\ - zgeadd.$(SUFFIX) - -ifeq ($(SUPPORT_GEMM3M), 1) - -# CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX) -CBLAS3OBJS += cgemm3m.$(SUFFIX) - -# ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX) -ZBLAS3OBJS += zgemm3m.$(SUFFIX) +.PHONY : all libs netlib $(RELA) test ctest shared install +.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test +all :: libs netlib $(RELA) tests shared + @echo + @echo " OpenBLAS build complete. ($(LIB_COMPONENTS))" + @echo + @echo " OS ... $(OSNAME) " + @echo " Architecture ... $(ARCH) " +ifndef BINARY64 + @echo " BINARY ... 32bit " +else + @echo " BINARY ... 64bit " +endif + +ifdef INTERFACE64 +ifneq ($(INTERFACE64), 0) + @echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) " +endif +endif + @$(CC) --version > /dev/null 2>&1;\ + if [ $$? -eq 0 ]; then \ + cverinfo=`$(CC) --version | sed -n '1p'`; \ + if [ -z "$${cverinfo}" ]; then \ + cverinfo=`$(CC) --version | sed -n '2p'`; \ + fi; \ + echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\ + else \ + echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\ + fi +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) + @$(FC) --version > /dev/null 2>&1;\ + if [ $$? -eq 0 ]; then \ + fverinfo=`$(FC) --version | sed -n '1p'`; \ + if [ -z "$${fverinfo}" ]; then \ + fverinfo=`$(FC) --version | sed -n '2p'`; \ + fi; \ + echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\ + else \ + echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ + fi +endif +ifneq ($(OSNAME), AIX) + @echo -n " Library Name ... $(LIBNAME)" +else + @echo " Library Name ... 
$(LIBNAME)" endif -ifeq ($(EXPRECISION), 1) - -QBLAS1OBJS = \ - qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ - qcopy.$(SUFFIX) qscal.$(SUFFIX) \ - qdot.$(SUFFIX) \ - qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ - qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ - qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ - qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ - -QBLAS2OBJS = \ - qgemv.$(SUFFIX) qger.$(SUFFIX) \ - qtrsv.$(SUFFIX) qtrmv.$(SUFFIX) qsymv.$(SUFFIX) \ - qsyr.$(SUFFIX) qsyr2.$(SUFFIX) qgbmv.$(SUFFIX) \ - qsbmv.$(SUFFIX) qspmv.$(SUFFIX) \ - qspr.$(SUFFIX) qspr2.$(SUFFIX) \ - qtbsv.$(SUFFIX) qtbmv.$(SUFFIX) \ - qtpsv.$(SUFFIX) qtpmv.$(SUFFIX) - -QBLAS3OBJS = \ - qgemm.$(SUFFIX) qsymm.$(SUFFIX) qtrmm.$(SUFFIX) \ - qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX) - -XBLAS1OBJS = \ - xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ - xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ - xdotc.$(SUFFIX) xdotu.$(SUFFIX) \ - qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ - qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ - qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ - xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ - -XBLAS2OBJS = \ - xgemv.$(SUFFIX) xgeru.$(SUFFIX) xgerc.$(SUFFIX) \ - xtrsv.$(SUFFIX) xtrmv.$(SUFFIX) xsymv.$(SUFFIX) \ - xsyr.$(SUFFIX) xsyr2.$(SUFFIX) xgbmv.$(SUFFIX) \ - xsbmv.$(SUFFIX) xspmv.$(SUFFIX) \ - xspr.$(SUFFIX) xspr2.$(SUFFIX) \ - xtbsv.$(SUFFIX) xtbmv.$(SUFFIX) \ - xtpsv.$(SUFFIX) xtpmv.$(SUFFIX) \ - xhemv.$(SUFFIX) xhbmv.$(SUFFIX) \ - xher.$(SUFFIX) xher2.$(SUFFIX) \ - xhpmv.$(SUFFIX) xhpr.$(SUFFIX) xhpr2.$(SUFFIX) - -XBLAS3OBJS = \ - xgemm.$(SUFFIX) xsymm.$(SUFFIX) xtrmm.$(SUFFIX) \ - xtrsm.$(SUFFIX) xsyrk.$(SUFFIX) xsyr2k.$(SUFFIX) \ - xhemm.$(SUFFIX) xherk.$(SUFFIX) xher2k.$(SUFFIX) - -ifeq ($(SUPPORT_GEMM3M), 1) - -XBLAS3OBJS += xgemm3m.$(SUFFIX) xsymm3m.$(SUFFIX) xhemm3m.$(SUFFIX) - +ifndef SMP + @echo " (Single-threading) " +else + @echo " (Multi-threading; Max num-threads is $(NUM_THREADS))" endif +ifeq ($(DYNAMIC_ARCH), 1) + @echo " Supporting multiple $(ARCH) cpu models with minimum requirement for the common code being $(CORE)" endif -ifdef QUAD_PRECISION - -QBLAS1OBJS = \ - qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ - qcopy.$(SUFFIX) qscal.$(SUFFIX) \ - qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ - qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ - qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ - qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ - -QBLAS2OBJS = \ - qgemv.$(SUFFIX) qger.$(SUFFIX) \ - qtrsv.$(SUFFIX) qtrmv.$(SUFFIX) qsymv.$(SUFFIX) \ - qsyr.$(SUFFIX) qsyr2.$(SUFFIX) qgbmv.$(SUFFIX) \ - qsbmv.$(SUFFIX) qspmv.$(SUFFIX) \ - qspr.$(SUFFIX) qspr2.$(SUFFIX) \ - qtbsv.$(SUFFIX) qtbmv.$(SUFFIX) \ - qtpsv.$(SUFFIX) qtpmv.$(SUFFIX) - -QBLAS3OBJS = \ - qgemm.$(SUFFIX) qsymm.$(SUFFIX) qtrmm.$(SUFFIX) \ - qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX) - -XBLAS1OBJS = \ - xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ - xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ - qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ - qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ - qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ - xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ - -XBLAS2OBJS = \ - xgemv.$(SUFFIX) xgeru.$(SUFFIX) xgerc.$(SUFFIX) \ - xtrsv.$(SUFFIX) xtrmv.$(SUFFIX) xsymv.$(SUFFIX) \ - xsyr.$(SUFFIX) xsyr2.$(SUFFIX) xgbmv.$(SUFFIX) \ - xsbmv.$(SUFFIX) xspmv.$(SUFFIX) \ - xspr.$(SUFFIX) xspr2.$(SUFFIX) \ - xtbsv.$(SUFFIX) xtbmv.$(SUFFIX) \ - xtpsv.$(SUFFIX) xtpmv.$(SUFFIX) \ - xhemv.$(SUFFIX) xhbmv.$(SUFFIX) \ 
- xher.$(SUFFIX) xher2.$(SUFFIX) \ - xhpmv.$(SUFFIX) xhpr.$(SUFFIX) xhpr2.$(SUFFIX) - -XBLAS3OBJS = \ - xgemm.$(SUFFIX) xsymm.$(SUFFIX) xtrmm.$(SUFFIX) \ - xtrsm.$(SUFFIX) xsyrk.$(SUFFIX) xsyr2k.$(SUFFIX) \ - xhemm.$(SUFFIX) xherk.$(SUFFIX) xher2k.$(SUFFIX) - -ifeq ($(SUPPORT_GEMM3M), 1) +ifeq ($(USE_OPENMP), 1) + @echo + @echo " Use OpenMP in the multithreading. Because of ignoring OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS flags, " + @echo " you should use OMP_NUM_THREADS environment variable to control the number of threads." + @echo +endif -XBLAS3OBJS += xgemm3m.$(SUFFIX) xsymm3m.$(SUFFIX) xhemm3m.$(SUFFIX) +ifeq ($(OSNAME), Darwin) + @echo "WARNING: If you plan to use the dynamic library $(LIBDYNNAME), you must run:" + @echo + @echo "\"make PREFIX=/your_installation_path/ install\"." + @echo + @echo "(or set PREFIX in Makefile.rule and run make install." + @echo "If you want to move the .dylib to a new location later, make sure you change" + @echo "the internal name of the dylib with:" + @echo + @echo "install_name_tool -id /new/absolute/path/to/$(LIBDYNNAME) $(LIBDYNNAME)" +endif + @echo + @echo "To install the library, you can run \"make PREFIX=/path/to/your/installation install\"." + @echo +shared : +ifneq ($(NO_SHARED), 1) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) + @$(MAKE) -C exports so + @ln -fs $(LIBSONAME) $(LIBPREFIX).so + @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif +ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) + @$(MAKE) -C exports so + @ln -fs $(LIBSONAME) $(LIBPREFIX).so endif - +ifeq ($(OSNAME), Darwin) + @$(MAKE) -C exports dyn + @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif - -HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \ - dgemv.$(SUFFIX) dtrsv.$(SUFFIX) dger.$(SUFFIX) \ - idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) - -CSBLAS1OBJS = \ - cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ - cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ - cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ - cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ - cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) - -CSBLAS2OBJS = \ - cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ - cblas_strsv.$(SUFFIX) cblas_ssyr.$(SUFFIX) cblas_ssyr2.$(SUFFIX) cblas_sgbmv.$(SUFFIX) \ - cblas_ssbmv.$(SUFFIX) cblas_sspmv.$(SUFFIX) cblas_sspr.$(SUFFIX) cblas_sspr2.$(SUFFIX) \ - cblas_stbmv.$(SUFFIX) cblas_stbsv.$(SUFFIX) cblas_stpmv.$(SUFFIX) cblas_stpsv.$(SUFFIX) - -CSBLAS3OBJS = \ - cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ - cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ - cblas_sgeadd.$(SUFFIX) - -ifeq ($(BUILD_BFLOAT16),1) -CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) -CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX) -CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) -CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) +ifeq ($(OSNAME), WINNT) + @$(MAKE) -C exports dll endif - -CDBLAS1OBJS = \ - cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ - cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ - cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) 
cblas_drotmg.$(SUFFIX) \ - cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ - cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) - -CDBLAS2OBJS = \ - cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ - cblas_dtrsv.$(SUFFIX) cblas_dsyr.$(SUFFIX) cblas_dsyr2.$(SUFFIX) cblas_dgbmv.$(SUFFIX) \ - cblas_dsbmv.$(SUFFIX) cblas_dspmv.$(SUFFIX) cblas_dspr.$(SUFFIX) cblas_dspr2.$(SUFFIX) \ - cblas_dtbmv.$(SUFFIX) cblas_dtbsv.$(SUFFIX) cblas_dtpmv.$(SUFFIX) cblas_dtpsv.$(SUFFIX) - -CDBLAS3OBJS += \ - cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ - cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \ - cblas_dgeadd.$(SUFFIX) - -CCBLAS1OBJS = \ - cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ - cblas_ccopy.$(SUFFIX) \ - cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ - cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ - cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ - cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ - cblas_caxpby.$(SUFFIX) \ - cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) - -CCBLAS2OBJS = \ - cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ - cblas_cgbmv.$(SUFFIX) cblas_chbmv.$(SUFFIX) cblas_chemv.$(SUFFIX) \ - cblas_cher.$(SUFFIX) cblas_cher2.$(SUFFIX) cblas_chpmv.$(SUFFIX) \ - cblas_chpr.$(SUFFIX) cblas_chpr2.$(SUFFIX) cblas_ctbmv.$(SUFFIX) \ - cblas_ctbsv.$(SUFFIX) cblas_ctpmv.$(SUFFIX) cblas_ctpsv.$(SUFFIX) \ - cblas_ctrmv.$(SUFFIX) cblas_ctrsv.$(SUFFIX) - -CCBLAS3OBJS = \ - cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \ - cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ - cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ - cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ - cblas_cgeadd.$(SUFFIX) - -CXERBLAOBJ = \ - cblas_xerbla.$(SUFFIX) - - - -CZBLAS1OBJS = \ - cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ - cblas_zcopy.$(SUFFIX) \ - cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ - cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ - cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ - cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ - cblas_zaxpby.$(SUFFIX) \ - cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) - - -CZBLAS2OBJS = \ - cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \ - cblas_zgbmv.$(SUFFIX) cblas_zhbmv.$(SUFFIX) cblas_zhemv.$(SUFFIX) \ - cblas_zher.$(SUFFIX) cblas_zher2.$(SUFFIX) cblas_zhpmv.$(SUFFIX) \ - cblas_zhpr.$(SUFFIX) cblas_zhpr2.$(SUFFIX) cblas_ztbmv.$(SUFFIX) \ - cblas_ztbsv.$(SUFFIX) cblas_ztpmv.$(SUFFIX) cblas_ztpsv.$(SUFFIX) \ - cblas_ztrmv.$(SUFFIX) cblas_ztrsv.$(SUFFIX) - -CZBLAS3OBJS = \ - cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \ - cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ - cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ - cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \ - cblas_zgeadd.$(SUFFIX) - - -ifeq ($(SUPPORT_GEMM3M), 1) - -# CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX) -CCBLAS3OBJS += cblas_cgemm3m.$(SUFFIX) - -# ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX) -CZBLAS3OBJS += cblas_zgemm3m.$(SUFFIX) - +ifeq ($(OSNAME), CYGWIN_NT) 
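# Cygwin, like the native WINNT branch above, gets its shared library as a
# Windows DLL built via the exports subdirectory.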
+ @$(MAKE) -C exports dll endif - - -ifneq ($(NO_CBLAS), 1) - -override CFLAGS += -I. - -SBLAS1OBJS += $(CSBLAS1OBJS) -SBLAS2OBJS += $(CSBLAS2OBJS) -SBLAS3OBJS += $(CSBLAS3OBJS) -SBBLAS1OBJS += $(CSBBLAS1OBJS) -SBBLAS2OBJS += $(CSBBLAS2OBJS) -SBBLAS3OBJS += $(CSBBLAS3OBJS) -DBLAS1OBJS += $(CDBLAS1OBJS) -DBLAS2OBJS += $(CDBLAS2OBJS) -DBLAS3OBJS += $(CDBLAS3OBJS) -CBLAS1OBJS += $(CCBLAS1OBJS) -CBLAS2OBJS += $(CCBLAS2OBJS) -CBLAS3OBJS += $(CCBLAS3OBJS) -ZBLAS1OBJS += $(CZBLAS1OBJS) -ZBLAS2OBJS += $(CZBLAS2OBJS) -ZBLAS3OBJS += $(CZBLAS3OBJS) - -SBEXTOBJS += $(CSBEXTOBJS) - -CBAUXOBJS += $(CXERBLAOBJ) endif -SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) -SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS2OBJS) $(SBBLAS3OBJS) -DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) -QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) -CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) -ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS) -XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) - -#SLAPACKOBJS = \ -# sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ -# spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ -# slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) spotri.$(SUFFIX) - -SLAPACKOBJS = \ - sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ - spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ - slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX) - - -#DLAPACKOBJS = \ -# dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ -# dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ -# dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dpotri.$(SUFFIX) - -DLAPACKOBJS = \ - dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ - dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ - dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX) - - -QLAPACKOBJS = \ - qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \ - qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \ - qlaswp.$(SUFFIX) qtrtrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \ - qtrtrs.$(SUFFIX) - -#CLAPACKOBJS = \ -# cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ -# cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ -# clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) cpotri.$(SUFFIX) - -CLAPACKOBJS = \ - cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ - cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ - clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) - -#ZLAPACKOBJS = \ -# zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ -# zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ -# zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) zpotri.$(SUFFIX) - - -ZLAPACKOBJS = \ - zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ - zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ - zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) - - -XLAPACKOBJS = \ - xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \ - xpotf2.$(SUFFIX) xpotrf.$(SUFFIX) xtrti2.$(SUFFIX) xtrtri.$(SUFFIX) \ - xlaswp.$(SUFFIX) xtrtrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \ - xtrtrs.$(SUFFIX) - -ifneq ($(NO_LAPACK), 1) -SBLASOBJS += $(SLAPACKOBJS) -DBLASOBJS += $(DLAPACKOBJS) -#QBLASOBJS += $(QLAPACKOBJS) -CBLASOBJS += $(CLAPACKOBJS) -ZBLASOBJS += 
$(ZLAPACKOBJS) -#XBLASOBJS += $(XLAPACKOBJS) - +tests : +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) + touch $(LIBNAME) +ifndef NO_FBLAS + $(MAKE) -C test all endif - -ifneq ($(BUILD_SINGLE),1) - SBLASOBJS= -ifeq ($(BUILD_DOUBLE),1) - SBLASOBJS = dsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) strsm.$(SUFFIX) \ - sgetrs.$(SUFFIX) sgetrf.$(SUFFIX) spotf2.$(SUFFIX) spotrf.$(SUFFIX) \ - ssyrk.$(SUFFIX) sgemv.$(SUFFIX) + $(MAKE) -C utest all +ifneq ($(NO_CBLAS), 1) + $(MAKE) -C ctest all +ifeq ($(CPP_THREAD_SAFETY_TEST), 1) + $(MAKE) -C cpp_thread_test all endif -ifeq ($(BUILD_COMPLEX),1) - SBLASOBJS = \ - sdot.$(SUFFIX) srot.$(SUFFIX) snrm2.$(SUFFIX) sswap.$(SUFFIX) \ - isamax.$(SUFFIX) saxpy.$(SUFFIX) sscal.$(SUFFIX) scopy.$(SUFFIX) \ - sgemv.$(SUFFIX) sgemm.$(SUFFIX) endif endif -ifneq ($(BUILD_DOUBLE),1) - DBLASOBJS= -ifeq ($(BUILD_COMPLEX16),1) - DBLASOBJS = \ - ddot.$(SUFFIX) drot.$(SUFFIX) dnrm2.$(SUFFIX) dswap.$(SUFFIX) \ - idamax.$(SUFFIX) daxpy.$(SUFFIX) dscal.$(SUFFIX) dcopy.$(SUFFIX) \ - dgemv.$(SUFFIX) dgemm.$(SUFFIX) + +libs : +ifeq ($(CORE), UNKNOWN) + $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) endif +ifeq ($(NOFORTRAN), 1) + $(info OpenBLAS: Detecting fortran compiler failed. Cannot compile LAPACK. Only compile BLAS.) endif -ifneq ($(BUILD_COMPLEX),1) - CBLASOBJS= -ifeq ($(BUILD_COMPLEX16),1) - CBLASOBJS = cgetrs.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) cgetrf.$(SUFFIX) \ - cpotrf.$(SUFFIX) ctrsm.$(SUFFIX) cblas_cdotc_sub.$(SUFFIX) +ifeq ($(NO_STATIC), 1) +ifeq ($(NO_SHARED), 1) + $(error OpenBLAS: neither static nor shared are enabled.) endif endif -ifneq ($(BUILD_COMPLEX16),1) - ZBLASOBJS= + @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) + @for d in $(SUBDIRS) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + fi; \ + done +#Save the config files for installation + @cp Makefile.conf Makefile.conf_last + @cp config.h config_last.h +ifdef QUAD_PRECISION + @echo "#define QUAD_PRECISION">> config_last.h endif - -FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) - ifeq ($(EXPRECISION), 1) -FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) -endif + @echo "#define EXPRECISION">> config_last.h +endif +## +ifeq ($(DYNAMIC_ARCH), 1) + @$(MAKE) -C kernel commonlibs || exit 1 + @for d in $(DYNAMIC_CORE) ; \ + do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ + done + @echo DYNAMIC_ARCH=1 >> Makefile.conf_last +ifeq ($(DYNAMIC_OLDER), 1) + @echo DYNAMIC_OLDER=1 >> Makefile.conf_last +endif +endif +ifdef USE_THREAD + @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last +endif + @touch lib.grd + +prof : prof_blas prof_lapack + +prof_blas : + ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) + for d in $(SUBDIRS) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d prof || exit 1 ; \ + fi; \ + done +ifeq ($(DYNAMIC_ARCH), 1) + $(MAKE) -C kernel commonprof || exit 1 +endif + +blas : + ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) + for d in $(BLASDIRS) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d libs || exit 1 ; \ + fi; \ + done + +hpl : + ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) + for d in $(BLASDIRS) ../laswp exports ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + fi; \ + done +ifeq ($(DYNAMIC_ARCH), 1) + $(MAKE) -C kernel commonlibs || exit 1 + for d in $(DYNAMIC_CORE) ; \ + do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ + done +endif + +hpl_p : + ln -fs $(LIBNAME_P) 
$(LIBPREFIX)_p.$(LIBSUFFIX) + for d in $(SUBDIRS) ../laswp exports ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + fi; \ + done + +ifeq ($(NO_LAPACK), 1) +netlib : -ifeq ($(QUAD_PRECISION), 1) -FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) +else +netlib : lapack_prebuild +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) + @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib + @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif - -FUNCALLFILES = $(FUNCOBJS:.$(SUFFIX)=) - - -include $(TOPDIR)/Makefile.tail - -all :: libs - -ifdef FUNCTION_PROFILE -$(BLASOBJS) $(BLASOBJS_P) : functable.h -$(BLASOBJS) $(BLASOBJS_P) : override CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F) - -functable.h : Makefile - ./create $(FUNCALLFILES) > functable.h - +ifneq ($(NO_LAPACKE), 1) + @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib endif - -clean :: - @rm -f functable.h - -level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) - $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ - -level2 : $(SBBLAS2OBJS) $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) - $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ - -level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) - $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ - -aux : $(CBAUXOBJS) - $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ - -$(CSBBLASOBJS) $(CSBBLASOBJS_P) $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ -$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) $(CBAUXOBJS_P) : override CFLAGS += -DCBLAS - -srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -drot.$(SUFFIX) drot.$(PSUFFIX) : rot.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -qrot.$(SUFFIX) qrot.$(PSUFFIX) : rot.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -csrot.$(SUFFIX) csrot.$(PSUFFIX) : zrot.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -zdrot.$(SUFFIX) zdrot.$(PSUFFIX) : zrot.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -xqrot.$(SUFFIX) xqrot.$(PSUFFIX) : zrot.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -srotm.$(SUFFIX) srotm.$(PSUFFIX): rotm.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -drotm.$(SUFFIX) drotm.$(PSUFFIX): rotm.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qrotm.$(SUFFIX) qrotm.$(PSUFFIX): rotm.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -srotmg.$(SUFFIX) srotmg.$(PSUFFIX): rotmg.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -drotmg.$(SUFFIX) drotmg.$(PSUFFIX): rotmg.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qrotmg.$(SUFFIX) qrotmg.$(PSUFFIX): rotmg.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -srotg.$(SUFFIX) srotg.$(PSUFFIX): rotg.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -drotg.$(SUFFIX) drotg.$(PSUFFIX): rotg.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qrotg.$(SUFFIX) qrotg.$(PSUFFIX): rotg.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -crotg.$(SUFFIX) crotg.$(PSUFFIX): zrotg.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zrotg.$(SUFFIX) zrotg.$(PSUFFIX): zrotg.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xrotg.$(SUFFIX) xrotg.$(PSUFFIX): zrotg.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -sasum.$(SUFFIX) sasum.$(PSUFFIX) : asum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -dasum.$(SUFFIX) dasum.$(PSUFFIX) : asum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -qasum.$(SUFFIX) qasum.$(PSUFFIX) : asum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -scasum.$(SUFFIX) scasum.$(PSUFFIX) : asum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - 
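# In the interface rules removed above and below, make's automatic variables
# do the fanning-out: $< expands to the first prerequisite (the generic C
# source, e.g. asum.c) and $(@F) to the file part of the target, so one
# source compiles into an object per precision (sasum.$(SUFFIX),
# dasum.$(SUFFIX), scasum.$(SUFFIX), ...).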
-ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -dnrm2.$(SUFFIX) dnrm2.$(PSUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -qnrm2.$(SUFFIX) qnrm2.$(PSUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -scnrm2.$(SUFFIX) scnrm2.$(PSUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -dznrm2.$(SUFFIX) dznrm2.$(PSUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -qxnrm2.$(SUFFIX) qxnrm2.$(PSUFFIX) : nrm2.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -samax.$(SUFFIX) samax.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) - -damax.$(SUFFIX) damax.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) - -qamax.$(SUFFIX) qamax.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) - -scamax.$(SUFFIX) scamax.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) - -dzamax.$(SUFFIX) dzamax.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) - -qxamax.$(SUFFIX) qxamax.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) - -samin.$(SUFFIX) samin.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) - -damin.$(SUFFIX) damin.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) - -qamin.$(SUFFIX) qamin.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) - -scamin.$(SUFFIX) scamin.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) - -dzamin.$(SUFFIX) dzamin.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) - -qxamin.$(SUFFIX) qxamin.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) - -smax.$(SUFFIX) smax.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) - -dmax.$(SUFFIX) dmax.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) - -qmax.$(SUFFIX) qmax.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) - -smin.$(SUFFIX) smin.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) - -dmin.$(SUFFIX) dmin.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) - -qmin.$(SUFFIX) qmin.$(PSUFFIX) : max.c - $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) - -isamax.$(SUFFIX) isamax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) - -idamax.$(SUFFIX) idamax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) - -iqamax.$(SUFFIX) iqamax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) - -icamax.$(SUFFIX) icamax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) - -izamax.$(SUFFIX) izamax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) - -ixamax.$(SUFFIX) ixamax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) - -isamin.$(SUFFIX) isamin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) - -idamin.$(SUFFIX) idamin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) - -iqamin.$(SUFFIX) iqamin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -DUSE_ABS 
-DUSE_MIN $< -o $(@F) - -icamin.$(SUFFIX) icamin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) - -izamin.$(SUFFIX) izamin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) - -ixamin.$(SUFFIX) ixamin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) - -ismax.$(SUFFIX) ismax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) - -idmax.$(SUFFIX) idmax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) - -iqmax.$(SUFFIX) iqmax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) - -ismin.$(SUFFIX) ismin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) - -idmin.$(SUFFIX) idmin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) - -iqmin.$(SUFFIX) iqmin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) - -sdsdot.$(SUFFIX) sdsdot.$(PSUFFIX) : sdsdot.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -ifeq ($(BUILD_BFLOAT16),1) -sbdot.$(SUFFIX) sbdot.$(PSUFFIX) : bf16dot.c - $(CC) $(CFLAGS) -c $< -o $(@F) -sbstobf16.$(SUFFIX) sbstobf16.$(PSUFFIX) : tobf16.c - $(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) -sbdtobf16.$(SUFFIX) sbdtobf16.$(PSUFFIX) : tobf16.c - $(CC) $(CFLAGS) -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) -sbf16tos.$(SUFFIX) sbf16tos.$(PSUFFIX) : bf16to.c - $(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) -dbf16tod.$(SUFFIX) dbf16tod.$(PSUFFIX) : bf16to.c - $(CC) $(CFLAGS) -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) endif -sdot.$(SUFFIX) sdot.$(PSUFFIX) : dot.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -ddot.$(SUFFIX) ddot.$(PSUFFIX) : dot.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -qdot.$(SUFFIX) qdot.$(PSUFFIX) : dot.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -cdotu.$(SUFFIX) cdotu.$(PSUFFIX) : zdot.c - $(CC) $(CFLAGS) -c -UCONJ $< -o $(@F) - -cdotc.$(SUFFIX) cdotc.$(PSUFFIX) : zdot.c - $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) - -zdotu.$(SUFFIX) zdotu.$(PSUFFIX) : zdot.c - $(CC) $(CFLAGS) -c -UCONJ $< -o $(@F) - -zdotc.$(SUFFIX) zdotc.$(PSUFFIX) : zdot.c - $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) - -xdotu.$(SUFFIX) xdotu.$(PSUFFIX) : zdot.c - $(CC) $(CFLAGS) -c -UCONJ $< -o $(@F) - -xdotc.$(SUFFIX) xdotc.$(PSUFFIX) : zdot.c - $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) - -saxpy.$(SUFFIX) saxpy.$(PSUFFIX) : axpy.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -daxpy.$(SUFFIX) daxpy.$(PSUFFIX) : axpy.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -qaxpy.$(SUFFIX) qaxpy.$(PSUFFIX) : axpy.c - $(CC) $(CFLAGS) -c $< -o $(@F) +ifeq ($(NO_LAPACK), 1) +re_lapack : -caxpy.$(SUFFIX) caxpy.$(PSUFFIX) : zaxpy.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -zaxpy.$(SUFFIX) zaxpy.$(PSUFFIX) : zaxpy.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -xaxpy.$(SUFFIX) xaxpy.$(PSUFFIX) : zaxpy.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -caxpyc.$(SUFFIX) caxpyc.$(PSUFFIX) : zaxpy.c - $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) - -zaxpyc.$(SUFFIX) zaxpyc.$(PSUFFIX) : zaxpy.c - $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) - -xaxpyc.$(SUFFIX) xaxpyc.$(PSUFFIX) : zaxpy.c - $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) - -sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -qscal.$(SUFFIX) qscal.$(PSUFFIX) : scal.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -cscal.$(SUFFIX) cscal.$(PSUFFIX) : zscal.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -zscal.$(SUFFIX) zscal.$(PSUFFIX) : zscal.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -xscal.$(SUFFIX) 
xscal.$(PSUFFIX) : zscal.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -csscal.$(SUFFIX) csscal.$(PSUFFIX) : zscal.c - $(CC) $(CFLAGS) -c -DSSCAL $< -o $(@F) - -zdscal.$(SUFFIX) zdscal.$(PSUFFIX) : zscal.c - $(CC) $(CFLAGS) -c -DSSCAL $< -o $(@F) - -xqscal.$(SUFFIX) xqscal.$(PSUFFIX) : zscal.c - $(CC) $(CFLAGS) -c -DSSCAL $< -o $(@F) - -scopy.$(SUFFIX) scopy.$(PSUFFIX) : copy.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -dcopy.$(SUFFIX) dcopy.$(PSUFFIX) : copy.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -qcopy.$(SUFFIX) qcopy.$(PSUFFIX) : copy.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -ccopy.$(SUFFIX) ccopy.$(PSUFFIX) : copy.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -zcopy.$(SUFFIX) zcopy.$(PSUFFIX) : copy.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -xcopy.$(SUFFIX) xcopy.$(PSUFFIX) : copy.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -sswap.$(SUFFIX) sswap.$(PSUFFIX) : swap.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -dswap.$(SUFFIX) dswap.$(PSUFFIX) : swap.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -qswap.$(SUFFIX) qswap.$(PSUFFIX) : swap.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -cswap.$(SUFFIX) cswap.$(PSUFFIX) : zswap.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -zswap.$(SUFFIX) zswap.$(PSUFFIX) : zswap.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -xswap.$(SUFFIX) xswap.$(PSUFFIX) : zswap.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -sger.$(SUFFIX) sger.$(PSUFFIX) : ger.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dger.$(SUFFIX) dger.$(PSUFFIX) : ger.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qger.$(SUFFIX) qger.$(PSUFFIX) : ger.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cgeru.$(SUFFIX) cgeru.$(PSUFFIX) : zger.c - $(CC) -c $(CFLAGS) -UCONJ $< -o $(@F) - -cgerc.$(SUFFIX) cgerc.$(PSUFFIX) : zger.c - $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) - -zgeru.$(SUFFIX) zgeru.$(PSUFFIX) : zger.c - $(CC) -c $(CFLAGS) -UCONJ $< -o $(@F) - -zgerc.$(SUFFIX) zgerc.$(PSUFFIX) : zger.c - $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) - -xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c - $(CC) -c $(CFLAGS) -UCONJ $< -o $(@F) - -xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c - $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) - -ifeq ($(BUILD_BFLOAT16),1) -sbgemv.$(SUFFIX) sbgemv.$(PSUFFIX) : sbgemv.c - $(CC) $(CFLAGS) -c $< -o $(@F) +else +re_lapack : + @$(MAKE) -C relapack +endif + +prof_lapack : lapack_prebuild + @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof + +lapack_prebuild : +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) + -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc + -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc +ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1) + -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc +else + -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc +endif + -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "LAPACKLIB = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "TMGLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "LAPACKELIB = ../../../$(LIBNAME)" >> 
$(NETLIB_LAPACK_DIR)/make.inc + -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc +ifeq ($(F_COMPILER), GFORTRAN) + -@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc +ifdef SMP +ifeq ($(OSNAME), WINNT) + -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +else ifeq ($(OSNAME), Haiku) + -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +else + -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc endif - -ifndef USE_NETLIB_GEMV -sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c - $(CC) -c $(CFLAGS) -o $(@F) $< - -dgemv.$(SUFFIX) dgemv.$(PSUFFIX): gemv.c - $(CC) -c $(CFLAGS) -o $(@F) $< else -sgemv.$(SUFFIX) sgemv.$(PSUFFIX): netlib/sgemv.f - $(FC) -c $(FFLAGS) -o $(@F) $< - -dgemv.$(SUFFIX) dgemv.$(PSUFFIX): netlib/dgemv.f - $(FC) -c $(FFLAGS) -o $(@F) $< + -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc endif - -qgemv.$(SUFFIX) qgemv.$(PSUFFIX): gemv.c - $(CC) -c $(CFLAGS) -o $(@F) $< - -ifndef USE_NETLIB_GEMV -cgemv.$(SUFFIX) cgemv.$(PSUFFIX): zgemv.c - $(CC) -c $(CFLAGS) -o $(@F) $< - -zgemv.$(SUFFIX) zgemv.$(PSUFFIX): zgemv.c - $(CC) -c $(CFLAGS) -o $(@F) $< else -cgemv.$(SUFFIX) cgemv.$(PSUFFIX): netlib/cgemv.f - $(FC) -c $(FFLAGS) -o $(@F) $< - -zgemv.$(SUFFIX) zgemv.$(PSUFFIX): netlib/zgemv.f - $(FC) -c $(FFLAGS) -o $(@F) $< + -@echo "TIMER = NONE" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_LAPACK_DEPRECATED), 1) + -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_SINGLE), 1) + -@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_DOUBLE), 1) + -@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_COMPLEX), 1) + -@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_COMPLEX16), 1) + -@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif + -@echo "LAPACKE_WITH_TMG = 1" >> $(NETLIB_LAPACK_DIR)/make.inc + -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc endif -xgemv.$(SUFFIX) xgemv.$(PSUFFIX): zgemv.c - $(CC) -c $(CFLAGS) -o $(@F) $< - -strsv.$(SUFFIX) strsv.$(PSUFFIX) : trsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dtrsv.$(SUFFIX) dtrsv.$(PSUFFIX) : trsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qtrsv.$(SUFFIX) qtrsv.$(PSUFFIX) : trsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ctrsv.$(SUFFIX) ctrsv.$(PSUFFIX) : ztrsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ztrsv.$(SUFFIX) ztrsv.$(PSUFFIX) : ztrsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xtrsv.$(SUFFIX) xtrsv.$(PSUFFIX) : ztrsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -strmv.$(SUFFIX) strmv.$(PSUFFIX) : trmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dtrmv.$(SUFFIX) dtrmv.$(PSUFFIX) : trmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) +large.tgz : +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) + if [ ! -a $< ]; then + -wget http://www.netlib.org/lapack/timing/large.tgz; + fi +endif -qtrmv.$(SUFFIX) qtrmv.$(PSUFFIX) : trmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) +timing.tgz : +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) + if [ ! 
-a $< ]; then + -wget http://www.netlib.org/lapack/timing/timing.tgz; + fi +endif -ctrmv.$(SUFFIX) ctrmv.$(PSUFFIX) : ztrmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) +lapack-timing : large.tgz timing.tgz +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) + (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) + (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) + $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING +endif -ztrmv.$(SUFFIX) ztrmv.$(PSUFFIX) : ztrmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) -xtrmv.$(SUFFIX) xtrmv.$(PSUFFIX) : ztrmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) +lapack-test : + (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) + $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz + $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc +ifneq ($(CROSS), 1) + ( cd $(NETLIB_LAPACK_DIR)/INSTALL; $(MAKE) all; ./testlsame; ./testslamch; ./testdlamch; \ + ./testsecond; ./testdsecnd; ./testieee; ./testversion ) + (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING) +endif -ssymv.$(SUFFIX) ssymv.$(PSUFFIX) : symv.c - $(CC) -c $(CFLAGS) $< -o $(@F) +lapack-runtest: + ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ + ./testsecond; ./testdsecnd; ./testieee; ./testversion ) + (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) -dsymv.$(SUFFIX) dsymv.$(PSUFFIX) : symv.c - $(CC) -c $(CFLAGS) $< -o $(@F) -qsymv.$(SUFFIX) qsymv.$(PSUFFIX) : symv.c - $(CC) -c $(CFLAGS) $< -o $(@F) +blas-test: + (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out) + $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing + (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out) -csymv.$(SUFFIX) csymv.$(PSUFFIX) : zsymv.c - $(CC) -c $(CFLAGS) $< -o $(@F) -zsymv.$(SUFFIX) zsymv.$(PSUFFIX) : zsymv.c - $(CC) -c $(CFLAGS) $< -o $(@F) +dummy : -xsymv.$(SUFFIX) xsymv.$(PSUFFIX) : zsymv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ssyr.$(SUFFIX) ssyr.$(PSUFFIX) : syr.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dsyr.$(SUFFIX) dsyr.$(PSUFFIX) : syr.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qsyr.$(SUFFIX) qsyr.$(PSUFFIX) : syr.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -csyr.$(SUFFIX) csyr.$(PSUFFIX) : zsyr.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zsyr.$(SUFFIX) zsyr.$(PSUFFIX) : zsyr.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xsyr.$(SUFFIX) xsyr.$(PSUFFIX) : zsyr.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ssyr2.$(SUFFIX) ssyr2.$(PSUFFIX) : syr2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dsyr2.$(SUFFIX) dsyr2.$(PSUFFIX) : syr2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qsyr2.$(SUFFIX) qsyr2.$(PSUFFIX) : syr2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -csyr2.$(SUFFIX) csyr2.$(PSUFFIX) : zsyr2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zsyr2.$(SUFFIX) zsyr2.$(PSUFFIX) : zsyr2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xsyr2.$(SUFFIX) xsyr2.$(PSUFFIX) : zsyr2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -sgbmv.$(SUFFIX) sgbmv.$(PSUFFIX): gbmv.c - $(CC) -c $(CFLAGS) -o $(@F) $< - -dgbmv.$(SUFFIX) dgbmv.$(PSUFFIX): gbmv.c - $(CC) -c $(CFLAGS) -o $(@F) $< - -qgbmv.$(SUFFIX) qgbmv.$(PSUFFIX): gbmv.c - $(CC) -c $(CFLAGS) -o $(@F) $< - -cgbmv.$(SUFFIX) cgbmv.$(PSUFFIX): zgbmv.c - $(CC) -c $(CFLAGS) -o $(@F) $< - -zgbmv.$(SUFFIX) zgbmv.$(PSUFFIX): zgbmv.c - $(CC) -c $(CFLAGS) -o $(@F) $< - -xgbmv.$(SUFFIX) xgbmv.$(PSUFFIX): zgbmv.c - $(CC) -c $(CFLAGS) -o $(@F) $< - -ssbmv.$(SUFFIX) ssbmv.$(PSUFFIX) : sbmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dsbmv.$(SUFFIX) dsbmv.$(PSUFFIX) : sbmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qsbmv.$(SUFFIX) qsbmv.$(PSUFFIX) 
: sbmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -csbmv.$(SUFFIX) csbmv.$(PSUFFIX) : zsbmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zsbmv.$(SUFFIX) zsbmv.$(PSUFFIX) : zsbmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xsbmv.$(SUFFIX) xsbmv.$(PSUFFIX) : zsbmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -sspmv.$(SUFFIX) sspmv.$(PSUFFIX) : spmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dspmv.$(SUFFIX) dspmv.$(PSUFFIX) : spmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qspmv.$(SUFFIX) qspmv.$(PSUFFIX) : spmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cspmv.$(SUFFIX) cspmv.$(PSUFFIX) : zspmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zspmv.$(SUFFIX) zspmv.$(PSUFFIX) : zspmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xspmv.$(SUFFIX) xspmv.$(PSUFFIX) : zspmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -sspr.$(SUFFIX) sspr.$(PSUFFIX) : spr.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dspr.$(SUFFIX) dspr.$(PSUFFIX) : spr.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qspr.$(SUFFIX) qspr.$(PSUFFIX) : spr.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cspr.$(SUFFIX) cspr.$(PSUFFIX) : zspr.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zspr.$(SUFFIX) zspr.$(PSUFFIX) : zspr.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xspr.$(SUFFIX) xspr.$(PSUFFIX) : zspr.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -sspr2.$(SUFFIX) sspr2.$(PSUFFIX) : spr2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dspr2.$(SUFFIX) dspr2.$(PSUFFIX) : spr2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qspr2.$(SUFFIX) qspr2.$(PSUFFIX) : spr2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cspr2.$(SUFFIX) cspr2.$(PSUFFIX) : zspr2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zspr2.$(SUFFIX) zspr2.$(PSUFFIX) : zspr2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xspr2.$(SUFFIX) xspr2.$(PSUFFIX) : zspr2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -stbmv.$(SUFFIX) stbmv.$(PSUFFIX) : tbmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dtbmv.$(SUFFIX) dtbmv.$(PSUFFIX) : tbmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qtbmv.$(SUFFIX) qtbmv.$(PSUFFIX) : tbmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ctbmv.$(SUFFIX) ctbmv.$(PSUFFIX) : ztbmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ztbmv.$(SUFFIX) ztbmv.$(PSUFFIX) : ztbmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xtbmv.$(SUFFIX) xtbmv.$(PSUFFIX) : ztbmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -stbsv.$(SUFFIX) stbsv.$(PSUFFIX) : tbsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dtbsv.$(SUFFIX) dtbsv.$(PSUFFIX) : tbsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qtbsv.$(SUFFIX) qtbsv.$(PSUFFIX) : tbsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ctbsv.$(SUFFIX) ctbsv.$(PSUFFIX) : ztbsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ztbsv.$(SUFFIX) ztbsv.$(PSUFFIX) : ztbsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xtbsv.$(SUFFIX) xtbsv.$(PSUFFIX) : ztbsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -stpsv.$(SUFFIX) stpsv.$(PSUFFIX) : tpsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dtpsv.$(SUFFIX) dtpsv.$(PSUFFIX) : tpsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qtpsv.$(SUFFIX) qtpsv.$(PSUFFIX) : tpsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ctpsv.$(SUFFIX) ctpsv.$(PSUFFIX) : ztpsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ztpsv.$(SUFFIX) ztpsv.$(PSUFFIX) : ztpsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xtpsv.$(SUFFIX) xtpsv.$(PSUFFIX) : ztpsv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -stpmv.$(SUFFIX) stpmv.$(PSUFFIX) : tpmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dtpmv.$(SUFFIX) dtpmv.$(PSUFFIX) : tpmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qtpmv.$(SUFFIX) qtpmv.$(PSUFFIX) : tpmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ctpmv.$(SUFFIX) ctpmv.$(PSUFFIX) : ztpmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ztpmv.$(SUFFIX) ztpmv.$(PSUFFIX) : ztpmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xtpmv.$(SUFFIX) xtpmv.$(PSUFFIX) : ztpmv.c - $(CC) -c $(CFLAGS) $< -o 
$(@F) - -chemv.$(SUFFIX) chemv.$(PSUFFIX) : zhemv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zhemv.$(SUFFIX) zhemv.$(PSUFFIX) : zhemv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xhemv.$(SUFFIX) xhemv.$(PSUFFIX) : zhemv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -chbmv.$(SUFFIX) chbmv.$(PSUFFIX) : zhbmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zhbmv.$(SUFFIX) zhbmv.$(PSUFFIX) : zhbmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xhbmv.$(SUFFIX) xhbmv.$(PSUFFIX) : zhbmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cher.$(SUFFIX) cher.$(PSUFFIX) : zher.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zher.$(SUFFIX) zher.$(PSUFFIX) : zher.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xher.$(SUFFIX) xher.$(PSUFFIX) : zher.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cher2.$(SUFFIX) cher2.$(PSUFFIX) : zher2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zher2.$(SUFFIX) zher2.$(PSUFFIX) : zher2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xher2.$(SUFFIX) xher2.$(PSUFFIX) : zher2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -chpmv.$(SUFFIX) chpmv.$(PSUFFIX) : zhpmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zhpmv.$(SUFFIX) zhpmv.$(PSUFFIX) : zhpmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xhpmv.$(SUFFIX) xhpmv.$(PSUFFIX) : zhpmv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -chpr.$(SUFFIX) chpr.$(PSUFFIX) : zhpr.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zhpr.$(SUFFIX) zhpr.$(PSUFFIX) : zhpr.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xhpr.$(SUFFIX) xhpr.$(PSUFFIX) : zhpr.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -chpr2.$(SUFFIX) chpr2.$(PSUFFIX) : zhpr2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ifeq ($(BUILD_BFLOAT16),1) -sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) -endif - -sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) - -dgemm.$(SUFFIX) dgemm.$(PSUFFIX) : gemm.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) - -qgemm.$(SUFFIX) qgemm.$(PSUFFIX) : gemm.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) - -cgemm.$(SUFFIX) cgemm.$(PSUFFIX) : gemm.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) - -zgemm.$(SUFFIX) zgemm.$(PSUFFIX) : gemm.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) - -xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) - -ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dsymm.$(SUFFIX) dsymm.$(PSUFFIX) : symm.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qsymm.$(SUFFIX) qsymm.$(PSUFFIX) : symm.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -csymm.$(SUFFIX) csymm.$(PSUFFIX) : symm.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zsymm.$(SUFFIX) zsymm.$(PSUFFIX) : symm.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xsymm.$(SUFFIX) xsymm.$(PSUFFIX) : symm.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -strmm.$(SUFFIX) strmm.$(PSUFFIX) : trsm.c - $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) - -dtrmm.$(SUFFIX) dtrmm.$(PSUFFIX) : trsm.c - $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) - -qtrmm.$(SUFFIX) qtrmm.$(PSUFFIX) : trsm.c - $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) - -ctrmm.$(SUFFIX) ctrmm.$(PSUFFIX) : trsm.c - $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) - -ztrmm.$(SUFFIX) ztrmm.$(PSUFFIX) : trsm.c - $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) - -xtrmm.$(SUFFIX) xtrmm.$(PSUFFIX) : trsm.c - $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) - -strsm.$(SUFFIX) strsm.$(PSUFFIX) : trsm.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dtrsm.$(SUFFIX) dtrsm.$(PSUFFIX) : trsm.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qtrsm.$(SUFFIX) qtrsm.$(PSUFFIX) : trsm.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ctrsm.$(SUFFIX) ctrsm.$(PSUFFIX) 
: trsm.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ztrsm.$(SUFFIX) ztrsm.$(PSUFFIX) : trsm.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xtrsm.$(SUFFIX) xtrsm.$(PSUFFIX) : trsm.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ssyrk.$(SUFFIX) ssyrk.$(PSUFFIX) : syrk.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dsyrk.$(SUFFIX) dsyrk.$(PSUFFIX) : syrk.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qsyrk.$(SUFFIX) qsyrk.$(PSUFFIX) : syrk.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -csyrk.$(SUFFIX) csyrk.$(PSUFFIX) : syrk.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zsyrk.$(SUFFIX) zsyrk.$(PSUFFIX) : syrk.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xsyrk.$(SUFFIX) xsyrk.$(PSUFFIX) : syrk.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ssyr2k.$(SUFFIX) ssyr2k.$(PSUFFIX) : syr2k.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dsyr2k.$(SUFFIX) dsyr2k.$(PSUFFIX) : syr2k.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qsyr2k.$(SUFFIX) qsyr2k.$(PSUFFIX) : syr2k.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -csyr2k.$(SUFFIX) csyr2k.$(PSUFFIX) : syr2k.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zsyr2k.$(SUFFIX) zsyr2k.$(PSUFFIX) : syr2k.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xsyr2k.$(SUFFIX) xsyr2k.$(PSUFFIX) : syr2k.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -chemm.$(SUFFIX) chemm.$(PSUFFIX) : symm.c - $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) - -zhemm.$(SUFFIX) zhemm.$(PSUFFIX) : symm.c - $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) - -xhemm.$(SUFFIX) xhemm.$(PSUFFIX) : symm.c - $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) - -cherk.$(SUFFIX) cherk.$(PSUFFIX) : syrk.c - $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) - -zherk.$(SUFFIX) zherk.$(PSUFFIX) : syrk.c - $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) - -xherk.$(SUFFIX) xherk.$(PSUFFIX) : syrk.c - $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) - -cher2k.$(SUFFIX) cher2k.$(PSUFFIX) : syr2k.c - $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) - -zher2k.$(SUFFIX) zher2k.$(PSUFFIX) : syr2k.c - $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) - -xher2k.$(SUFFIX) xher2k.$(PSUFFIX) : syr2k.c - $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) - -cgemm3m.$(SUFFIX) cgemm3m.$(PSUFFIX) : gemm.c - $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) - -zgemm3m.$(SUFFIX) zgemm3m.$(PSUFFIX) : gemm.c - $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) - -xgemm3m.$(SUFFIX) xgemm3m.$(PSUFFIX) : gemm.c - $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) - -csymm3m.$(SUFFIX) csymm3m.$(PSUFFIX) : symm.c - $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) - -zsymm3m.$(SUFFIX) zsymm3m.$(PSUFFIX) : symm.c - $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) - -xsymm3m.$(SUFFIX) xsymm3m.$(PSUFFIX) : symm.c - $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) - -chemm3m.$(SUFFIX) chemm3m.$(PSUFFIX) : symm.c - $(CC) -c $(CFLAGS) -DGEMM3M -DHEMM $< -o $(@F) - -zhemm3m.$(SUFFIX) zhemm3m.$(PSUFFIX) : symm.c - $(CC) -c $(CFLAGS) -DGEMM3M -DHEMM $< -o $(@F) - -xhemm3m.$(SUFFIX) xhemm3m.$(PSUFFIX) : symm.c - $(CC) -c $(CFLAGS) -DGEMM3M -DHEMM $< -o $(@F) - -cblas_isamax.$(SUFFIX) cblas_isamax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) - -cblas_idamax.$(SUFFIX) cblas_idamax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) - -cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) - -cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) - -cblas_isamin.$(SUFFIX) cblas_isamin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) - -cblas_idamin.$(SUFFIX) cblas_idamin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) - -cblas_icamin.$(SUFFIX) cblas_icamin.$(PSUFFIX) : imax.c - $(CC) 
$(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) - -cblas_izamin.$(SUFFIX) cblas_izamin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) - -cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) - -cblas_idmax.$(SUFFIX) cblas_idmax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) - -cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) - -cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) - -cblas_icmax.$(SUFFIX) cblas_icmax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) - -cblas_izmax.$(SUFFIX) cblas_izmax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) - -cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) - -cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) - -cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_dasum.$(SUFFIX) cblas_dasum.$(PSUFFIX) : asum.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -ifeq ($(BUILD_BFLOAT16),1) -cblas_sbdot.$(SUFFIX) cblas_sbdot.$(PSUFFIX) : bf16dot.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) -cblas_sbstobf16.$(SUFFIX) cblas_sbstobf16.$(PSUFFIX) : tobf16.c - $(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) -cblas_sbdtobf16.$(SUFFIX) cblas_sbdtobf16.$(PSUFFIX) : tobf16.c - $(CC) $(CFLAGS) -DCBLAS -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) -cblas_sbf16tos.$(SUFFIX) cblas_sbf16tos.$(PSUFFIX) : bf16to.c - $(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) -cblas_dbf16tod.$(SUFFIX) cblas_dbf16tod.$(PSUFFIX) : bf16to.c - $(CC) $(CFLAGS) -DCBLAS -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) -endif - -cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_cdotu.$(SUFFIX) cblas_cdotu.$(PSUFFIX) : zdot.c - $(CC) $(CFLAGS) -DCBLAS -c -UCONJ $< -o $(@F) - -cblas_cdotc.$(SUFFIX) cblas_cdotc.$(PSUFFIX) : zdot.c - $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) - -cblas_zdotu.$(SUFFIX) cblas_zdotu.$(PSUFFIX) : zdot.c - $(CC) $(CFLAGS) -DCBLAS -c -UCONJ $< -o $(@F) - -cblas_zdotc.$(SUFFIX) cblas_zdotc.$(PSUFFIX) : zdot.c - $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) - -cblas_cdotu_sub.$(SUFFIX) cblas_cdotu_sub.$(PSUFFIX) : zdot.c - $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -UCONJ $< -o $(@F) - -cblas_cdotc_sub.$(SUFFIX) 
cblas_cdotc_sub.$(PSUFFIX) : zdot.c - $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -DCONJ $< -o $(@F) - -cblas_zdotu_sub.$(SUFFIX) cblas_zdotu_sub.$(PSUFFIX) : zdot.c - $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -UCONJ $< -o $(@F) - -cblas_zdotc_sub.$(SUFFIX) cblas_zdotc_sub.$(PSUFFIX) : zdot.c - $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -DCONJ $< -o $(@F) - -cblas_snrm2.$(SUFFIX) cblas_snrm2.$(PSUFFIX) : nrm2.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_dnrm2.$(SUFFIX) cblas_dnrm2.$(PSUFFIX) : nrm2.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_scnrm2.$(SUFFIX) cblas_scnrm2.$(PSUFFIX) : nrm2.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_dznrm2.$(SUFFIX) cblas_dznrm2.$(PSUFFIX) : nrm2.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_saxpy.$(SUFFIX) cblas_saxpy.$(PSUFFIX) : axpy.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_scopy.$(SUFFIX) cblas_scopy.$(PSUFFIX) : copy.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_dcopy.$(SUFFIX) cblas_dcopy.$(PSUFFIX) : copy.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_ccopy.$(SUFFIX) cblas_ccopy.$(PSUFFIX) : copy.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_zcopy.$(SUFFIX) cblas_zcopy.$(PSUFFIX) : copy.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_sswap.$(SUFFIX) cblas_sswap.$(PSUFFIX) : swap.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_dswap.$(SUFFIX) cblas_dswap.$(PSUFFIX) : swap.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_cswap.$(SUFFIX) cblas_cswap.$(PSUFFIX) : zswap.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_zswap.$(SUFFIX) cblas_zswap.$(PSUFFIX) : zswap.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_srot.$(SUFFIX) cblas_srot.$(PSUFFIX) : rot.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_drot.$(SUFFIX) cblas_drot.$(PSUFFIX) : rot.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_crotg.$(SUFFIX) crotg.$(PSUFFIX): zrotg.c - $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) - -cblas_zrotg.$(SUFFIX) zrotg.$(PSUFFIX): zrotg.c - $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) - -cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_drotm.$(SUFFIX) cblas_drotm.$(PSUFFIX): rotm.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_srotmg.$(SUFFIX) cblas_srotmg.$(PSUFFIX): rotmg.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_drotmg.$(SUFFIX) cblas_drotmg.$(PSUFFIX): rotmg.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_sscal.$(SUFFIX) cblas_sscal.$(PSUFFIX) : scal.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_dscal.$(SUFFIX) cblas_dscal.$(PSUFFIX) : scal.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_cscal.$(SUFFIX) cblas_cscal.$(PSUFFIX) : zscal.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_zscal.$(SUFFIX) cblas_zscal.$(PSUFFIX) : zscal.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c - $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) - -cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c - $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) - -cblas_csrot.$(SUFFIX) 
cblas_csrot.$(PSUFFIX) : zrot.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -ifeq ($(BUILD_BFLOAT16),1) -cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) -endif - -cblas_sgemv.$(SUFFIX) cblas_sgemv.$(PSUFFIX): gemv.c - $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< - -cblas_dgemv.$(SUFFIX) cblas_dgemv.$(PSUFFIX): gemv.c - $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< - -cblas_cgemv.$(SUFFIX) cblas_cgemv.$(PSUFFIX): zgemv.c - $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< - -cblas_zgemv.$(SUFFIX) cblas_zgemv.$(PSUFFIX): zgemv.c - $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< - -cblas_sger.$(SUFFIX) cblas_sger.$(PSUFFIX) : ger.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dger.$(SUFFIX) cblas_dger.$(PSUFFIX) : ger.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_cgeru.$(SUFFIX) cblas_cgeru.$(PSUFFIX) : zger.c - $(CC) -DCBLAS -c $(CFLAGS) -UCONJ $< -o $(@F) - -cblas_cgerc.$(SUFFIX) cblas_cgerc.$(PSUFFIX) : zger.c - $(CC) -DCBLAS -c $(CFLAGS) -DCONJ $< -o $(@F) - -cblas_zgeru.$(SUFFIX) cblas_zgeru.$(PSUFFIX) : zger.c - $(CC) -DCBLAS -c $(CFLAGS) -UCONJ $< -o $(@F) - -cblas_zgerc.$(SUFFIX) cblas_zgerc.$(PSUFFIX) : zger.c - $(CC) -DCBLAS -c $(CFLAGS) -DCONJ $< -o $(@F) - -cblas_strsv.$(SUFFIX) cblas_strsv.$(PSUFFIX) : trsv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dtrsv.$(SUFFIX) cblas_dtrsv.$(PSUFFIX) : trsv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ctrsv.$(SUFFIX) cblas_ctrsv.$(PSUFFIX) : ztrsv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ztrsv.$(SUFFIX) cblas_ztrsv.$(PSUFFIX) : ztrsv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_strmv.$(SUFFIX) cblas_strmv.$(PSUFFIX) : trmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dtrmv.$(SUFFIX) cblas_dtrmv.$(PSUFFIX) : trmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ctrmv.$(SUFFIX) cblas_ctrmv.$(PSUFFIX) : ztrmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ztrmv.$(SUFFIX) cblas_ztrmv.$(PSUFFIX) : ztrmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ssyr.$(SUFFIX) cblas_ssyr.$(PSUFFIX) : syr.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dsyr.$(SUFFIX) cblas_dsyr.$(PSUFFIX) : syr.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_cher.$(SUFFIX) cblas_cher.$(PSUFFIX) : zher.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_zher.$(SUFFIX) cblas_zher.$(PSUFFIX) : zher.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ssyr2.$(SUFFIX) cblas_ssyr2.$(PSUFFIX) : syr2.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dsyr2.$(SUFFIX) cblas_dsyr2.$(PSUFFIX) : syr2.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_cher2.$(SUFFIX) cblas_cher2.$(PSUFFIX) : zher2.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_zher2.$(SUFFIX) cblas_zher2.$(PSUFFIX) : zher2.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_sgbmv.$(SUFFIX) cblas_sgbmv.$(PSUFFIX): gbmv.c - $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< - -cblas_dgbmv.$(SUFFIX) cblas_dgbmv.$(PSUFFIX): gbmv.c - $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< - -cblas_cgbmv.$(SUFFIX) cblas_cgbmv.$(PSUFFIX): zgbmv.c - $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< - -cblas_zgbmv.$(SUFFIX) cblas_zgbmv.$(PSUFFIX): zgbmv.c - $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< - -cblas_ssbmv.$(SUFFIX) cblas_ssbmv.$(PSUFFIX) : sbmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dsbmv.$(SUFFIX) cblas_dsbmv.$(PSUFFIX) : sbmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_chbmv.$(SUFFIX) cblas_chbmv.$(PSUFFIX) : zhbmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o 
$(@F) - -cblas_zhbmv.$(SUFFIX) cblas_zhbmv.$(PSUFFIX) : zhbmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_sspmv.$(SUFFIX) cblas_sspmv.$(PSUFFIX) : spmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dspmv.$(SUFFIX) cblas_dspmv.$(PSUFFIX) : spmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_sspr.$(SUFFIX) cblas_sspr.$(PSUFFIX) : spr.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dspr.$(SUFFIX) cblas_dspr.$(PSUFFIX) : spr.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_chpr.$(SUFFIX) cblas_chpr.$(PSUFFIX) : zhpr.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_zhpr.$(SUFFIX) cblas_zhpr.$(PSUFFIX) : zhpr.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_sspr2.$(SUFFIX) cblas_sspr2.$(PSUFFIX) : spr2.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dspr2.$(SUFFIX) cblas_dspr2.$(PSUFFIX) : spr2.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_chpr2.$(SUFFIX) cblas_chpr2.$(PSUFFIX) : zhpr2.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_zhpr2.$(SUFFIX) cblas_zhpr2.$(PSUFFIX) : zhpr2.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_stbmv.$(SUFFIX) cblas_stbmv.$(PSUFFIX) : tbmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dtbmv.$(SUFFIX) cblas_dtbmv.$(PSUFFIX) : tbmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ctbmv.$(SUFFIX) cblas_ctbmv.$(PSUFFIX) : ztbmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ztbmv.$(SUFFIX) cblas_ztbmv.$(PSUFFIX) : ztbmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_stbsv.$(SUFFIX) cblas_stbsv.$(PSUFFIX) : tbsv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dtbsv.$(SUFFIX) cblas_dtbsv.$(PSUFFIX) : tbsv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ctbsv.$(SUFFIX) cblas_ctbsv.$(PSUFFIX) : ztbsv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ztbsv.$(SUFFIX) cblas_ztbsv.$(PSUFFIX) : ztbsv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_stpmv.$(SUFFIX) cblas_stpmv.$(PSUFFIX) : tpmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dtpmv.$(SUFFIX) cblas_dtpmv.$(PSUFFIX) : tpmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ctpmv.$(SUFFIX) cblas_ctpmv.$(PSUFFIX) : ztpmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ztpmv.$(SUFFIX) cblas_ztpmv.$(PSUFFIX) : ztpmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_chpmv.$(SUFFIX) cblas_chpmv.$(PSUFFIX) : zhpmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_zhpmv.$(SUFFIX) cblas_zhpmv.$(PSUFFIX) : zhpmv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_stpsv.$(SUFFIX) cblas_stpsv.$(PSUFFIX) : tpsv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dtpsv.$(SUFFIX) cblas_dtpsv.$(PSUFFIX) : tpsv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ctpsv.$(SUFFIX) cblas_ctpsv.$(PSUFFIX) : ztpsv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ztpsv.$(SUFFIX) cblas_ztpsv.$(PSUFFIX) : ztpsv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ssymv.$(SUFFIX) cblas_ssymv.$(PSUFFIX) : symv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dsymv.$(SUFFIX) cblas_dsymv.$(PSUFFIX) : symv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_chemv.$(SUFFIX) cblas_chemv.$(PSUFFIX) : zhemv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -ifeq ($(BUILD_BFLOAT16),1) -cblas_sbgemm.$(SUFFIX) cblas_sbgemm.$(PSUFFIX) : gemm.c ../param.h - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) -endif - -cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : 
gemm.c ../param.h - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_cgemm.$(SUFFIX) cblas_cgemm.$(PSUFFIX) : gemm.c ../param.h - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_zgemm.$(SUFFIX) cblas_zgemm.$(PSUFFIX) : gemm.c ../param.h - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ssymm.$(SUFFIX) cblas_ssymm.$(PSUFFIX) : symm.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dsymm.$(SUFFIX) cblas_dsymm.$(PSUFFIX) : symm.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_csymm.$(SUFFIX) cblas_csymm.$(PSUFFIX) : symm.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_zsymm.$(SUFFIX) cblas_zsymm.$(PSUFFIX) : symm.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ssyrk.$(SUFFIX) cblas_ssyrk.$(PSUFFIX) : syrk.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dsyrk.$(SUFFIX) cblas_dsyrk.$(PSUFFIX) : syrk.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_csyrk.$(SUFFIX) cblas_csyrk.$(PSUFFIX) : syrk.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_zsyrk.$(SUFFIX) cblas_zsyrk.$(PSUFFIX) : syrk.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ssyr2k.$(SUFFIX) cblas_ssyr2k.$(PSUFFIX) : syr2k.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dsyr2k.$(SUFFIX) cblas_dsyr2k.$(PSUFFIX) : syr2k.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_csyr2k.$(SUFFIX) cblas_csyr2k.$(PSUFFIX) : syr2k.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_zsyr2k.$(SUFFIX) cblas_zsyr2k.$(PSUFFIX) : syr2k.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_strmm.$(SUFFIX) cblas_strmm.$(PSUFFIX) : trsm.c - $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) - -cblas_dtrmm.$(SUFFIX) cblas_dtrmm.$(PSUFFIX) : trsm.c - $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) - -cblas_ctrmm.$(SUFFIX) cblas_ctrmm.$(PSUFFIX) : trsm.c - $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) - -cblas_ztrmm.$(SUFFIX) cblas_ztrmm.$(PSUFFIX) : trsm.c - $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) - -cblas_strsm.$(SUFFIX) cblas_strsm.$(PSUFFIX) : trsm.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_dtrsm.$(SUFFIX) cblas_dtrsm.$(PSUFFIX) : trsm.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ctrsm.$(SUFFIX) cblas_ctrsm.$(PSUFFIX) : trsm.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_ztrsm.$(SUFFIX) cblas_ztrsm.$(PSUFFIX) : trsm.c - $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) - -cblas_chemm.$(SUFFIX) cblas_chemm.$(PSUFFIX) : symm.c - $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) - -cblas_zhemm.$(SUFFIX) cblas_zhemm.$(PSUFFIX) : symm.c - $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) - -cblas_cherk.$(SUFFIX) cblas_cherk.$(PSUFFIX) : syrk.c - $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) - -cblas_zherk.$(SUFFIX) cblas_zherk.$(PSUFFIX) : syrk.c - $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) - -cblas_cher2k.$(SUFFIX) cblas_cher2k.$(PSUFFIX) : syr2k.c - $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) - -cblas_zher2k.$(SUFFIX) cblas_zher2k.$(PSUFFIX) : syr2k.c - $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) - -cblas_cgemm3m.$(SUFFIX) cblas_cgemm3m.$(PSUFFIX) : gemm.c - $(CC) -DCBLAS -c $(CFLAGS) -DGEMM3M $< -o $(@F) - -cblas_zgemm3m.$(SUFFIX) cblas_zgemm3m.$(PSUFFIX) : gemm.c - $(CC) -DCBLAS -c $(CFLAGS) -DGEMM3M $< -o $(@F) - - -sgetf2.$(SUFFIX) sgetf2.$(PSUFFIX) : lapack/getf2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dgetf2.$(SUFFIX) dgetf2.$(PSUFFIX) : lapack/getf2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qgetf2.$(SUFFIX) qgetf2.$(PSUFFIX) : getf2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cgetf2.$(SUFFIX) cgetf2.$(PSUFFIX) : lapack/zgetf2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zgetf2.$(SUFFIX) zgetf2.$(PSUFFIX) : lapack/zgetf2.c - $(CC) -c 
$(CFLAGS) $< -o $(@F) - -xgetf2.$(SUFFIX) xgetf2.$(PSUFFIX) : zgetf2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -sgetrf.$(SUFFIX) sgetrf.$(PSUFFIX) : lapack/getrf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dgetrf.$(SUFFIX) dgetrf.$(PSUFFIX) : lapack/getrf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qgetrf.$(SUFFIX) qgetrf.$(PSUFFIX) : getrf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cgetrf.$(SUFFIX) cgetrf.$(PSUFFIX) : lapack/zgetrf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zgetrf.$(SUFFIX) zgetrf.$(PSUFFIX) : lapack/zgetrf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xgetrf.$(SUFFIX) xgetrf.$(PSUFFIX) : zgetrf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -slauu2.$(SUFFIX) slauu2.$(PSUFFIX) : lapack/lauu2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dlauu2.$(SUFFIX) dlauu2.$(PSUFFIX) : lapack/lauu2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qlauu2.$(SUFFIX) qlauu2.$(PSUFFIX) : lauu2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -clauu2.$(SUFFIX) clauu2.$(PSUFFIX) : lapack/zlauu2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zlauu2.$(SUFFIX) zlauu2.$(PSUFFIX) : lapack/zlauu2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xlauu2.$(SUFFIX) xlauu2.$(PSUFFIX) : zlauu2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -slauum.$(SUFFIX) slauum.$(PSUFFIX) : lapack/lauum.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dlauum.$(SUFFIX) dlauum.$(PSUFFIX) : lapack/lauum.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qlauum.$(SUFFIX) qlauum.$(PSUFFIX) : lauum.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -clauum.$(SUFFIX) clauum.$(PSUFFIX) : lapack/zlauum.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zlauum.$(SUFFIX) zlauum.$(PSUFFIX) : lapack/zlauum.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xlauum.$(SUFFIX) xlauum.$(PSUFFIX) : zlauum.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -spotf2.$(SUFFIX) spotf2.$(PSUFFIX) : lapack/potf2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dpotf2.$(SUFFIX) dpotf2.$(PSUFFIX) : lapack/potf2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qpotf2.$(SUFFIX) qpotf2.$(PSUFFIX) : potf2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cpotf2.$(SUFFIX) cpotf2.$(PSUFFIX) : lapack/zpotf2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zpotf2.$(SUFFIX) zpotf2.$(PSUFFIX) : lapack/zpotf2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xpotf2.$(SUFFIX) xpotf2.$(PSUFFIX) : zpotf2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -spotrf.$(SUFFIX) spotrf.$(PSUFFIX) : lapack/potrf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dpotrf.$(SUFFIX) dpotrf.$(PSUFFIX) : lapack/potrf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qpotrf.$(SUFFIX) qpotrf.$(PSUFFIX) : potrf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cpotrf.$(SUFFIX) cpotrf.$(PSUFFIX) : lapack/zpotrf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zpotrf.$(SUFFIX) zpotrf.$(PSUFFIX) : lapack/zpotrf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xpotrf.$(SUFFIX) xpotrf.$(PSUFFIX) : zpotrf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -strti2.$(SUFFIX) strti2.$(PSUFFIX) : lapack/trti2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dtrti2.$(SUFFIX) dtrti2.$(PSUFFIX) : lapack/trti2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qtrti2.$(SUFFIX) qtrti2.$(PSUFFIX) : trti2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ctrti2.$(SUFFIX) ctrti2.$(PSUFFIX) : lapack/ztrti2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ztrti2.$(SUFFIX) ztrti2.$(PSUFFIX) : lapack/ztrti2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xtrti2.$(SUFFIX) xtrti2.$(PSUFFIX) : ztrti2.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -strtri.$(SUFFIX) strtri.$(PSUFFIX) : lapack/trtri.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : lapack/trtri.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qtrtri.$(SUFFIX) qtrtri.$(PSUFFIX) : trtri.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : lapack/ztrtri.c - $(CC) -c $(CFLAGS) $< -o $(@F) - 
-ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : lapack/ztrtri.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xtrtri.$(SUFFIX) xtrtri.$(PSUFFIX) : ztrtri.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -slaswp.$(SUFFIX) slaswp.$(PSUFFIX) : lapack/laswp.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dlaswp.$(SUFFIX) dlaswp.$(PSUFFIX) : lapack/laswp.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qlaswp.$(SUFFIX) qlaswp.$(PSUFFIX) : laswp.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -claswp.$(SUFFIX) claswp.$(PSUFFIX) : lapack/zlaswp.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zlaswp.$(SUFFIX) zlaswp.$(PSUFFIX) : lapack/zlaswp.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xlaswp.$(SUFFIX) xlaswp.$(PSUFFIX) : zlaswp.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -sgetrs.$(SUFFIX) sgetrs.$(PSUFFIX) : lapack/getrs.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dgetrs.$(SUFFIX) dgetrs.$(PSUFFIX) : lapack/getrs.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : lapack/getrs.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : lapack/zgetrs.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : lapack/zgetrs.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -strtrs.$(SUFFIX) strtrs.$(PSUFFIX) : lapack/trtrs.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dtrtrs.$(SUFFIX) dtrtrs.$(PSUFFIX) : lapack/trtrs.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qtrtrs.$(SUFFIX) qtrtrs.$(PSUFFIX) : lapack/trtrs.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ctrtrs.$(SUFFIX) ctrtrs.$(PSUFFIX) : lapack/ztrtrs.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -ztrtrs.$(SUFFIX) ztrtrs.$(PSUFFIX) : lapack/ztrtrs.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xtrtrs.$(SUFFIX) xtrtrs.$(PSUFFIX) : lapack/ztrtrs.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : lapack/gesv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dgesv.$(SUFFIX) dgesv.$(PSUFFIX) : lapack/gesv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qgesv.$(SUFFIX) qgesv.$(PSUFFIX) : gesv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cgesv.$(SUFFIX) cgesv.$(PSUFFIX) : lapack/gesv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zgesv.$(SUFFIX) zgesv.$(PSUFFIX) : lapack/gesv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xgesv.$(SUFFIX) xgesv.$(PSUFFIX) : gesv.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -spotri.$(SUFFIX) spotri.$(PSUFFIX) : lapack/potri.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dpotri.$(SUFFIX) dpotri.$(PSUFFIX) : lapack/potri.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qpotri.$(SUFFIX) qpotri.$(PSUFFIX) : potri.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cpotri.$(SUFFIX) cpotri.$(PSUFFIX) : lapack/zpotri.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zpotri.$(SUFFIX) zpotri.$(PSUFFIX) : lapack/zpotri.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xpotri.$(SUFFIX) xpotri.$(PSUFFIX) : zpotri.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -slarf.$(SUFFIX) slarf.$(PSUFFIX) : larf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dlarf.$(SUFFIX) dlarf.$(PSUFFIX) : larf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -qlarf.$(SUFFIX) qlarf.$(PSUFFIX) : larf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -clarf.$(SUFFIX) clarf.$(PSUFFIX) : larf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zlarf.$(SUFFIX) zlarf.$(PSUFFIX) : larf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -xlarf.$(SUFFIX) xlarf.$(PSUFFIX) : larf.c - $(CC) -c $(CFLAGS) $< -o $(@F) - - -############# BLAS EXTENSIONS ##################################### - -daxpby.$(SUFFIX) daxpby.$(PSUFFIX) : axpby.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -cblas_daxpby.$(SUFFIX) cblas_daxpby.$(PSUFFIX) : axpby.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -saxpby.$(SUFFIX) saxpby.$(PSUFFIX) : axpby.c - $(CC) $(CFLAGS) -c 
$< -o $(@F) - -cblas_saxpby.$(SUFFIX) cblas_saxpby.$(PSUFFIX) : axpby.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -zaxpby.$(SUFFIX) zaxpby.$(PSUFFIX) : zaxpby.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -cblas_zaxpby.$(SUFFIX) cblas_zaxpby.$(PSUFFIX) : zaxpby.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -caxpby.$(SUFFIX) caxpby.$(PSUFFIX) : zaxpby.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -cblas_caxpby.$(SUFFIX) cblas_caxpby.$(PSUFFIX) : zaxpby.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -domatcopy.$(SUFFIX) domatcopy.$(PSUFFIX) : omatcopy.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cblas_domatcopy.$(SUFFIX) cblas_domatcopy.$(PSUFFIX) : omatcopy.c - $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) - -somatcopy.$(SUFFIX) somatcopy.$(PSUFFIX) : omatcopy.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cblas_somatcopy.$(SUFFIX) cblas_somatcopy.$(PSUFFIX) : omatcopy.c - $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) - -comatcopy.$(SUFFIX) comatcopy.$(PSUFFIX) : zomatcopy.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cblas_comatcopy.$(SUFFIX) cblas_comatcopy.$(PSUFFIX) : zomatcopy.c - $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) - -zomatcopy.$(SUFFIX) zomatcopy.$(PSUFFIX) : zomatcopy.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cblas_zomatcopy.$(SUFFIX) cblas_zomatcopy.$(PSUFFIX) : zomatcopy.c - $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) - -dimatcopy.$(SUFFIX) dimatcopy.$(PSUFFIX) : imatcopy.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cblas_dimatcopy.$(SUFFIX) cblas_dimatcopy.$(PSUFFIX) : imatcopy.c - $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) - -simatcopy.$(SUFFIX) simatcopy.$(PSUFFIX) : imatcopy.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cblas_simatcopy.$(SUFFIX) cblas_simatcopy.$(PSUFFIX) : imatcopy.c - $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) - -cimatcopy.$(SUFFIX) cimatcopy.$(PSUFFIX) : zimatcopy.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cblas_cimatcopy.$(SUFFIX) cblas_cimatcopy.$(PSUFFIX) : zimatcopy.c - $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) - -zimatcopy.$(SUFFIX) zimatcopy.$(PSUFFIX) : zimatcopy.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cblas_zimatcopy.$(SUFFIX) cblas_zimatcopy.$(PSUFFIX) : zimatcopy.c - $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) - -sgeadd.$(SUFFIX) sgeadd.$(PSUFFIX) : geadd.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -dgeadd.$(SUFFIX) dgeadd.$(PSUFFIX) : geadd.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cgeadd.$(SUFFIX) cgeadd.$(PSUFFIX) : zgeadd.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -zgeadd.$(SUFFIX) zgeadd.$(PSUFFIX) : zgeadd.c - $(CC) -c $(CFLAGS) $< -o $(@F) - -cblas_sgeadd.$(SUFFIX) cblas_sgeadd.$(PSUFFIX) : geadd.c - $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) - -cblas_dgeadd.$(SUFFIX) cblas_dgeadd.$(PSUFFIX) : geadd.c - $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) - -cblas_cgeadd.$(SUFFIX) cblas_cgeadd.$(PSUFFIX) : zgeadd.c - $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) - -cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c - $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) - -cblas_xerbla.$(SUFFIX) cblas_xerbla.$(PSUFFIX) : xerbla.c - $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) +install : + $(MAKE) -f Makefile.install install +clean :: + @for d in $(SUBDIRS_ALL) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + fi; \ + done +#ifdef DYNAMIC_ARCH + @$(MAKE) -C kernel clean +#endif + @$(MAKE) -C reference clean + @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0 +ifeq ($(OSNAME), Darwin) + @rm -rf getarch.dSYM getarch_2nd.dSYM +endif + @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* 
*.dylib
+	@rm -f cblas.tmp cblas.tmp2
+	@touch $(NETLIB_LAPACK_DIR)/make.inc
+	@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
+	@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
+	@$(MAKE) -C relapack clean
+	@rm -f *.grd Makefile.conf_last config_last.h
+	@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)
+	@echo Done.

From a8f249458de25e3dfcde1826a2a8c746270db398 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 13 Jan 2021 00:29:38 +0100
Subject: [PATCH 062/681] Build CBLAS interfaces for CROTG and ZROTG as well

---
 interface/Makefile | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/interface/Makefile b/interface/Makefile
index 1a440c9c3..fab403c82 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -316,7 +316,7 @@ CCBLAS1OBJS = \
 	cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
 	cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
 	cblas_caxpby.$(SUFFIX) \
-	cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX)
+	cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX)
 
 CCBLAS2OBJS = \
 	cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
@@ -346,7 +346,7 @@ CZBLAS1OBJS = \
 	cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
 	cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
 	cblas_zaxpby.$(SUFFIX) \
-	cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX)
+	cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX)
 
 
 CZBLAS2OBJS = \
@@ -1634,6 +1634,12 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c
 cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
 
+cblas_crotg.$(SUFFIX) crotg.$(PSUFFIX): zrotg.c
+	$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
+
+cblas_zrotg.$(SUFFIX) zrotg.$(PSUFFIX): zrotg.c
+	$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
+
 cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

From 25c986db5ac17cfacf5c12469545ab7ad64c5af9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 13 Jan 2021 00:30:27 +0100
Subject: [PATCH 063/681] Add prototypes for CBLAS_CROTG and CBLAS_ZROTG

---
 cblas.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cblas.h b/cblas.h
index 8aafdb186..f0220eb99 100644
--- a/cblas.h
+++ b/cblas.h
@@ -130,6 +130,9 @@ void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONS
 void cblas_srotg(float *a, float *b, float *c, float *s);
 void cblas_drotg(double *a, double *b, double *c, double *s);
 
+void cblas_crotg(void *a, void *b, float *c, void *s);
+void cblas_zrotg(void *a, void *b, double *c, void *s);
+
 void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P);
 void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P);

From da8d7f09f18efc3101dfac4e1ef1c9413a15f71b Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 13 Jan 2021 09:46:53 +0100
Subject: [PATCH 064/681] try to work around gcc update problems

---
 .github/workflows/nightly-Homebrew-build.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml
index 8d7cfea2d..b025f8634 100644
--- a/.github/workflows/nightly-Homebrew-build.yml
+++ b/.github/workflows/nightly-Homebrew-build.yml
@@ -44,6 +44,11 @@ jobs:
     if: github.event_name != 'pull_request'
     run: brew update || true
 
+  - name: unlink installed gcc to allow updating
+    run: |
+        brew unlink gcc@8
+        brew unlink gcc@9
+
   - name: Install prerequisites
     run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas

From 89ae305e11dacb4622f58b03e48b4bb361acf94c Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 13 Jan 2021 12:30:26 +0100
Subject: [PATCH 065/681] Workaround for cmake having its own C_COMPILER
 variable

---
 cmake/utils.cmake | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 8f25c1b27..29b5a067b 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -74,6 +74,9 @@ macro(ParseMakefileVars MAKEFILE_IN)
     string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
     if (NOT "${line_match}" STREQUAL "")
 #      message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
+      if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER)
+        set (CMAKE_MATCH_1 CMAKE_C_COMPILER)
+      endif ()
       if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}))
 #        message (STATUS "condition is true")
         set (IfElse 1)

From 6bbe6d5b9203c92394463b8a96fd4995db73d9f4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 13 Jan 2021 22:36:04 +0100
Subject: [PATCH 066/681] Make compile-time BUFFERSIZE setting actually reach
 the compiler/preprocessor

---
 Makefile.system | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index ca0879fe6..abc2c3dc5 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1279,6 +1279,10 @@ CCOMMON_OPT += -DUSE_PAPI
 EXTRALIB += -lpapi -lperfctr
 endif
 
+ifdef BUFFERSIZE
+CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE)
+endif
+
 ifdef DYNAMIC_THREADS
 CCOMMON_OPT += -DDYNAMIC_THREADS
 endif

From e3f40636839eddd79fe1260010464dd7fe03e772 Mon Sep 17 00:00:00 2001
From: Albert Ziegenhagel
Date: Thu, 14 Jan 2021 10:00:49 +0100
Subject: [PATCH 067/681] Fix building "generic" TRMM kernel with CMake

The CMake "TARGET_CORE" variable stores the "generic" target name in all
lowercase letters, but gets compared to an all uppercase string, which
results in the wrong TRMM kernel being selected. This commit converts
TARGET_CORE to all uppercase before comparing its value, to make sure
case mismatches are no longer an issue.
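Before the diff below, a quick illustration of the failure mode this message describes: CMake's MATCHES comparison is case-sensitive, so a lowercase "generic" never matches the uppercase patterns. The same idea in a minimal, self-contained C sketch (illustrative only, not part of the patch; to_upper is a hypothetical helper mirroring what string(TOUPPER ...) does on the CMake side):

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* Case-sensitive comparison fails for "generic" vs "GENERIC";
       normalizing to uppercase before comparing makes the match
       reliable regardless of how the target name was spelled. */
    static void to_upper(char *s)
    {
        for (; *s; s++)
            *s = (char)toupper((unsigned char)*s);
    }

    int main(void)
    {
        char core[] = "generic";
        printf("exact match before: %s\n",
               strcmp(core, "GENERIC") == 0 ? "yes" : "no"); /* no  */
        to_upper(core);
        printf("exact match after:  %s\n",
               strcmp(core, "GENERIC") == 0 ? "yes" : "no"); /* yes */
        return 0;
    }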
---
 kernel/CMakeLists.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt
index 6d8d759ad..f0793bdef 100644
--- a/kernel/CMakeLists.txt
+++ b/kernel/CMakeLists.txt
@@ -187,10 +187,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
   endif ()
 
   # Makefile.L3
   set(USE_TRMM false)
-  if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE))
+  string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE)
+  if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE))
     set(USE_TRMM true)
   endif ()
-  if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10))
+  if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10))
     set(USE_TRMM true)
   endif ()

From 202fc9e8ed509224761e9c310e3ca0b5a3346134 Mon Sep 17 00:00:00 2001
From: Alex Henrie
Date: Thu, 14 Jan 2021 19:40:31 -0700
Subject: [PATCH 068/681] Fix uninitialized argument value in dasum_k

---
 kernel/x86_64/dasum.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c
index ddec21383..534f257d2 100644
--- a/kernel/x86_64/dasum.c
+++ b/kernel/x86_64/dasum.c
@@ -93,7 +93,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 #if defined(SMP)
 	int nthreads;
 	FLOAT dummy_alpha;
-	FLOAT * dummy_b;
 #endif
 	FLOAT sumf = 0.0;
 
@@ -115,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 #else
 	mode = BLAS_DOUBLE | BLAS_REAL;
 #endif
-	blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, dummy_b, 0, result, 0, (void *)asum_thread_function, nthreads);
+	blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads);
 
 	ptr = (FLOAT *)result;
 	for (i = 0; i < nthreads; i++) {
 		sumf += (*ptr);

From 6f32991eae430b37137f4635d7627b5fecbd24c7 Mon Sep 17 00:00:00 2001
From: Alex Henrie
Date: Thu, 14 Jan 2021 19:40:31 -0700
Subject: [PATCH 069/681] Don't define the mode variable when not needed in
 gemm functions

---
 interface/gemm.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/interface/gemm.c b/interface/gemm.c
index 860e588fe..6fde69049 100644
--- a/interface/gemm.c
+++ b/interface/gemm.c
@@ -246,6 +246,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
 
 #ifdef SMP
   double MNK;
+#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY)
 #ifndef COMPLEX
 #ifdef XDOUBLE
   int mode  =  BLAS_XDOUBLE | BLAS_REAL;
@@ -264,6 +265,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
 #endif
 #endif
 #endif
+#endif
 
 #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3)
   int nodes;
@@ -417,8 +419,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
   sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
 
 #ifdef SMP
+#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY)
   mode |= (transa << BLAS_TRANSA_SHIFT);
   mode |= (transb << BLAS_TRANSB_SHIFT);
+#endif
 
   MNK = (double) args.m * (double) args.n * (double) args.k;
   if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )

From f1bf2603e6435202b600fb8e7dab3e4d124acb61 Mon Sep 17 00:00:00 2001
From: Alex Henrie
Date: Thu, 14 Jan 2021 19:40:32 -0700
Subject: [PATCH 070/681] Remove dead assignment to dflag in rotmg functions

---
 interface/rotmg.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/interface/rotmg.c b/interface/rotmg.c
index ce3b146c1..3a5ca8f95 100644
--- a/interface/rotmg.c
+++ b/interface/rotmg.c
@@ -107,7 +107,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
 			dq1 = dp1 * *dx1;
 
 			if(ABS(dq1) > ABS(dq2))
 			{
-				dflag = ZERO;
 				dh11 = ONE;
 				dh22 = ONE;
 				dh21 = - dy1 / *dx1;

From eff7c9166ecea213b99384ea8923ea08d7445398 Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan
Date: Fri, 15 Jan 2021 13:40:34 -0600
Subject: [PATCH 071/681] Optimize cdot function for POWER10

This patch makes use of new POWER10 vector pair instructions
for loads and stores.
---
 kernel/power/KERNEL.POWER10        |   4 -
 kernel/power/cdot.c                |   8 ++
 kernel/power/cdot_microk_power10.c | 177 +++++++++++++++++++++++++++++
 3 files changed, 185 insertions(+), 4 deletions(-)
 create mode 100644 kernel/power/cdot_microk_power10.c

diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index d61f5194a..1cf7b0b7c 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -154,11 +154,7 @@ ZCOPYKERNEL = zcopy_power10.c
 SDOTKERNEL  =  sdot_power10.c
 DDOTKERNEL  =  ddot_power10.c
 DSDOTKERNEL =  sdot_power10.c
-ifneq ($(GCCVERSIONGTEQ9),1)
-CDOTKERNEL  =  cdot_power9.S
-else
 CDOTKERNEL  =  cdot.c
-endif
 ZDOTKERNEL  =  zdot.c
 #
 SNRM2KERNEL  = ../arm/nrm2.c

diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c
index ef5e4710f..c53fe0c02 100644
--- a/kernel/power/cdot.c
+++ b/kernel/power/cdot.c
@@ -28,6 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 
 #include "common.h"
 
+#if defined(POWER10)
+#include "cdot_microk_power10.c"
+#else
 #ifndef HAVE_KERNEL_8
 
 #include 
 
@@ -99,6 +102,7 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
 }
 
 #endif
+#endif
 
 OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
@@ -116,7 +120,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
 
 	if ((inc_x == 1) && (inc_y == 1)) {
 
+#if defined(POWER10)
+		BLASLONG n1 = n & -16;
+#else
 		BLASLONG n1 = n & -8;
+#endif
 		BLASLONG j=0;
 
 		if (n1){

diff --git a/kernel/power/cdot_microk_power10.c b/kernel/power/cdot_microk_power10.c
new file mode 100644
index 000000000..399f2b180
--- /dev/null
+++ b/kernel/power/cdot_microk_power10.c
@@ -0,0 +1,177 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void cdot_kernel_8 (long n, float *x, float *y, float *dot) +{ + __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; + __asm__ + ( + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "xxperm 56, 48, %x7 \n\t" + "xxperm 57, 49, %x7 \n\t" + "xxperm 58, 50, %x7 \n\t" + "xxperm 59, 51, %x7 \n\t" + + "xxperm 60, 52, %x7 \n\t" + "xxperm 61, 53, %x7 \n\t" + "xxperm 62, 54, %x7 \n\t" + "xxperm 63, 55, %x7 \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i + "lxvp 48, 0(%3) \n\t" + + "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i + "lxvp 50, 32(%3) \n\t" + + "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r + "lxvp 40, 0(%2) \n\t" + + "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r + "lxvp 42, 32(%2) \n\t" + + "xxperm 56, 48, %x7 \n\t" + "xxperm 57, 49, %x7 \n\t" + "xxperm 58, 50, %x7 \n\t" + "xxperm 59, 51, %x7 \n\t" + + "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i + "lxvp 52, 64(%3) \n\t" + + "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i + "lxvp 54, 96(%3) \n\t" + + "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r + "lxvp 44, 64(%2) \n\t" + "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r + "lxvp 46, 96(%2) \n\t" + + "xxperm 60, 52, %x7 \n\t" + "xxperm 61, 53, %x7 \n\t" + "xxperm 62, 54, %x7 \n\t" + "xxperm 63, 55, %x7 \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i + "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i + + "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r + "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r + + "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i + "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i + + "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r + "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 33, 33, 35 \n\t" + "xvaddsp 37, 37, 39 \n\t" + + "xvaddsp 35, 32, 36 \n\t" + "xvaddsp 34, 33, 37 \n\t" + "xxswapd 32, 35 \n\t" + "xxswapd 33, 34 \n\t" + "xvaddsp 35, 35, 32 \n\t" + "xvaddsp 34, 34, 33 \n\t" + "xxpermdi 34, 34, 35, 2 \n\t" + "stxv 34, 0(%6) \n\t" + + "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6" + : + "=m" (*dot), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x), + "m" (*y), + "b" (dot), // 6 + "wa" (mask) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} From b60de4447a5dd279af79276ae42dd7fa9ae8578f Mon Sep 17 00:00:00 2001 From: xoviat Date: Tue, 19 Jan 2021 08:57:44 -0600 Subject: [PATCH 072/681] add cortex-m platform --- .gitignore | 1 + README.md | 3 ++- cmake/system.cmake | 5 +++++ common.h | 13 +++++++++++-- driver/others/memory.c | 6 ++++-- 5 files changed, 23 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index bca79f043..8674c4536 100644 --- a/.gitignore +++ b/.gitignore @@ -91,3 +91,4 @@ benchmark/*.goto benchmark/smallscaling CMakeCache.txt CMakeFiles/* +.vscode diff --git a/README.md b/README.md index fed3936ee..174f951f4 100644 --- a/README.md +++ b/README.md @@ -212,7 +212,8 @@ Please note that it is not possible to combine support for different architectur - **Android**: Supported by the community. Please read . - **AIX**: Supported on PPC up to POWER8 - **Haiku**: Supported by the community. We don't actively test the library on this OS. -- **SunOS**: Supported by the community. We don't actively test the library on this OS: +- **SunOS**: Supported by the community. We don't actively test the library on this OS. +- **Cortex-M**: Supported by the community. Please read . 
## Usage diff --git a/cmake/system.cmake b/cmake/system.cmake index 66e95c6d3..869cc62da 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -233,6 +233,11 @@ if (BINARY64) endif () endif () +if(EMBEDDED) + set(CCOMMON_OPT "${CCOMMON_OPT} -DOS_EMBEDDED") + set(CCOMMON_OPT "${CCOMMON_OPT} -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16") +endif() + if (NEED_PIC) if (${CMAKE_C_COMPILER} STREQUAL "IBM") set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large") diff --git a/common.h b/common.h index 2825407cb..862e0b4db 100644 --- a/common.h +++ b/common.h @@ -122,7 +122,7 @@ extern "C" { #define ATOM GOTO_ATOM #undef GOTO_ATOM #endif -#else +#elif !defined(OS_EMBEDDED) #include #ifndef NO_SYSV_IPC #include @@ -134,6 +134,9 @@ extern "C" { #if defined(SMP) || defined(USE_LOCKING) #include #endif +#else +#include +#include #endif #if defined(OS_SUNOS) @@ -488,10 +491,12 @@ static inline unsigned long long rpcc(void){ struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; -#else +#elif !defined(OS_EMBEDDED) struct timeval tv; gettimeofday(&tv,NULL); return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; +#else + return 0; #endif } #define RPCC_DEFINED @@ -521,6 +526,10 @@ static void __inline blas_lock(volatile BLASULONG *address){ #include "common_linux.h" #endif +#ifdef OS_EMBEDDED +#define DTB_DEFAULT_ENTRIES 64 +#endif + #define MMAP_ACCESS (PROT_READ | PROT_WRITE) #ifdef __NetBSD__ diff --git a/driver/others/memory.c b/driver/others/memory.c index f0521ab2d..2fb1f1f73 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1668,16 +1668,18 @@ void gotoblas_dummy_for_PGI(void) { #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 #endif -#else +#elif !defined(OS_EMBEDDED) #define ALLOC_MMAP #define ALLOC_MALLOC +#else +#define ALLOC_MALLOC #endif #include #include #include -#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) +#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED) #include #ifndef NO_SYSV_IPC #include From 63fa3c3f8f869c585d8c5aef6f580a967b64405c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 20 Jan 2021 15:41:04 +0100 Subject: [PATCH 073/681] Require gcc 11 for builtin_cpu_is(power10) fixes #3074 --- driver/others/dynamic_power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index f9feeb6e8..18f16f835 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -202,7 +202,7 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_POWER10; #endif /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ -#if (!defined __GNUC__) || ( __GNUC__ >= 6) +#if (!defined __GNUC__) || ( __GNUC__ >= 11) if (__builtin_cpu_is("power10")) return &gotoblas_POWER9; #endif From 0b9e4d127881d1efcd10ac64bf5bc2b3af9666f7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 20 Jan 2021 18:30:05 +0100 Subject: [PATCH 074/681] Add gcc10/arm64 DYNAMIC_ARCH build --- .drone.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/.drone.yml b/.drone.yml index b1c211d14..1bdeb8cdf 100644 --- a/.drone.yml +++ b/.drone.yml @@ -190,3 +190,25 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS - make -C cpp_thread_test dgemm_tester +--- +kind: pipeline +name: arm64_gcc10 + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:20.04 + 
environment:
+      CC: gcc-10
+      COMMON_FLAGS: 'TARGET=ARMV8 DYNAMIC_ARCH=1'
+    commands:
+      - echo "MAKE_FLAGS:= $COMMON_FLAGS"
+      - apt-get update -y
+      - apt-get install -y make $CC gfortran-10 perl python g++
+      - $CC --version
+      - make QUIET_MAKE=1 $COMMON_FLAGS
+      - make -C test $COMMON_FLAGS

From 6178974cd9dfe5bde1c36c05ad87834a5c1a9ce9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 20 Jan 2021 20:21:27 +0100
Subject: [PATCH 075/681] Update .drone.yml

---
 .drone.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.drone.yml b/.drone.yml
index 1bdeb8cdf..38ded2015 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -203,6 +203,7 @@ steps:
     image: ubuntu:20.04
     environment:
       CC: gcc-10
+      FC: gfortran-10
       COMMON_FLAGS: 'TARGET=ARMV8 DYNAMIC_ARCH=1'
     commands:
       - echo "MAKE_FLAGS:= $COMMON_FLAGS"
       - apt-get update -y
       - apt-get install -y make $CC gfortran-10 perl python g++
       - $CC --version
       - make QUIET_MAKE=1 $COMMON_FLAGS
+      - make -C utest $COMMON_FLAGS
       - make -C test $COMMON_FLAGS

From b94dab5250469d4d30d1a21bf0e0b78eea3cf286 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 20 Jan 2021 21:34:36 +0100
Subject: [PATCH 076/681] patch to support power10 in builtin_cpu_is was
 backported to gcc 10.2, so allow that as well

---
 driver/others/dynamic_power.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 18f16f835..b8e5840a3 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -202,7 +202,7 @@ static gotoblas_t *get_coretype(void) {
 		return &gotoblas_POWER10;
 #endif
 	/* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
-#if (!defined __GNUC__) || ( __GNUC__ >= 11)
+#if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
 	if (__builtin_cpu_is("power10")) return &gotoblas_POWER9;
 #endif

From 439b93f6d285fa29dba71a61df7bb8cf32fe0971 Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan
Date: Thu, 21 Jan 2021 13:24:45 -0600
Subject: [PATCH 077/681] Optimize s/drot function for POWER10

This patch makes use of new POWER10 vector pair instructions
for loads and stores.
---
 kernel/power/drot.c                |  22 ++++-
 kernel/power/drot_microk_power10.c | 148 ++++++++++++++++++++++
 kernel/power/srot.c                |  22 ++++-
 kernel/power/srot_microk_power10.c | 151 +++++++++++++++++++++++
 4 files changed, 341 insertions(+), 2 deletions(-)
 create mode 100644 kernel/power/drot_microk_power10.c
 create mode 100644 kernel/power/srot_microk_power10.c

diff --git a/kernel/power/drot.c b/kernel/power/drot.c
index 951c2f9c9..94d9d95a3 100644
--- a/kernel/power/drot.c
+++ b/kernel/power/drot.c
@@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "drot_microk_power8.c" +#elif defined(POWER10) +#include "drot_microk_power10.c" #endif #endif @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 > 0 ) + { + drot_kernel_16(n1,&x[i], &y[i], c, s); + i+=n1; + } +#else BLASLONG n1 = n & -16; if ( n1 > 0 ) { drot_kernel_16(n1, x1, y1, c, s); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/drot_microk_power10.c b/kernel/power/drot_microk_power10.c new file mode 100644 index 000000000..e34e745c7 --- /dev/null +++ b/kernel/power/drot_microk_power10.c @@ -0,0 +1,148 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void drot_kernel_16 (long n, double *x, double *y, double c, double s) +{ + __asm__ + ( + XXSPLTD_S(36,%x5,0) // load c to both dwords + XXSPLTD_S(37,%x6,0) // load s to both dwords + "lxvp 32, 0(%3) \n\t" // load x + "lxvp 34, 32(%3) \n\t" + "lxvp 48, 0(%4) \n\t" // load y + "lxvp 50, 32(%4) \n\t" + + "addic. 
%2, %2, -8 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmuldp 40, 32, 36 \n\t" // c * x + "xvmuldp 41, 33, 36 \n\t" + "xvmuldp 42, 34, 36 \n\t" + "xvmuldp 43, 35, 36 \n\t" + + "xvmuldp 44, 32, 37 \n\t" // s * x + "xvmuldp 45, 33, 37 \n\t" + "xvmuldp 46, 34, 37 \n\t" + "xvmuldp 47, 35, 37 \n\t" + + "lxvp 32, 64(%3) \n\t" // load x + "lxvp 34, 96(%3) \n\t" + "xvmuldp 52, 48, 36 \n\t" // c * y + "xvmuldp 53, 49, 36 \n\t" + "xvmuldp 54, 50, 36 \n\t" + "xvmuldp 55, 51, 36 \n\t" + + "xvmuldp 38, 48, 37 \n\t" // s * y + "xvmuldp 39, 49, 37 \n\t" + "xvmuldp 56, 50, 37 \n\t" + "xvmuldp 57, 51, 37 \n\t" + + "lxvp 48, 64(%4) \n\t" // load y + "lxvp 50, 96(%4) \n\t" + + "xvadddp 40, 40, 38 \n\t" // c * x + s * y + "xvadddp 41, 41, 39 \n\t" // c * x + s * y + "xvadddp 42, 42, 56 \n\t" // c * x + s * y + "xvadddp 43, 43, 57 \n\t" // c * x + s * y + + "stxvp 40, 0(%3) \n\t" // store x + "stxvp 42, 32(%3) \n\t" + + "xvsubdp 52, 52, 44 \n\t" // c * y - s * x + "xvsubdp 53, 53, 45 \n\t" // c * y - s * x + "xvsubdp 54, 54, 46 \n\t" // c * y - s * x + "xvsubdp 55, 55, 47 \n\t" // c * y - s * x + + "stxvp 52, 0(%4) \n\t" // store y + "stxvp 54, 32(%4) \n\t" + + "addi %3, %3, 64 \n\t" + "addi %4, %4, 64 \n\t" + + "addic. %2, %2, -8 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmuldp 40, 32, 36 \n\t" // c * x + "xvmuldp 41, 33, 36 \n\t" + "xvmuldp 42, 34, 36 \n\t" + "xvmuldp 43, 35, 36 \n\t" + + "xvmuldp 52, 48, 36 \n\t" // c * y + "xvmuldp 53, 49, 36 \n\t" + "xvmuldp 54, 50, 36 \n\t" + "xvmuldp 55, 51, 36 \n\t" + + "xvmuldp 44, 32, 37 \n\t" // s * x + "xvmuldp 45, 33, 37 \n\t" + "xvmuldp 46, 34, 37 \n\t" + "xvmuldp 47, 35, 37 \n\t" + + "xvmuldp 38, 48, 37 \n\t" // s * y + "xvmuldp 39, 49, 37 \n\t" + "xvmuldp 56, 50, 37 \n\t" + "xvmuldp 57, 51, 37 \n\t" + + "xvadddp 40, 40, 38 \n\t" // c * x + s * y + "xvadddp 41, 41, 39 \n\t" // c * x + s * y + "xvadddp 42, 42, 56 \n\t" // c * x + s * y + "xvadddp 43, 43, 57 \n\t" // c * x + s * y + + "stxvp 40, 0(%3) \n\t" // store x + "stxvp 42, 32(%3) \n\t" + "xvsubdp 52, 52, 44 \n\t" // c * y - s * x + "xvsubdp 53, 53, 45 \n\t" // c * y - s * x + "xvsubdp 54, 54, 46 \n\t" // c * y - s * x + "xvsubdp 55, 55, 47 \n\t" // c * y - s * x + + "stxvp 52, 0(%4) \n\t" // store y + "stxvp 54, 32(%4) \n\t" + + "#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + "d" (c), // 5 + "d" (s) // 6 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57" + ); +} diff --git a/kernel/power/srot.c b/kernel/power/srot.c index a53342f61..3e4f93e2a 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "srot_microk_power8.c" +#elif defined(POWER10) +#include "srot_microk_power10.c" #endif #endif @@ -115,6 +117,23 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 > 0 ) + { + srot_kernel_16(n1, &x1[i], &y1[i], c, s); + i+=n1; + } +#else BLASLONG n1 = n & -16; if ( n1 > 0 ) { @@ -122,6 +141,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT i=n1; } +#endif while(i < n) { temp = c*x[i] + s*y[i] ; diff --git a/kernel/power/srot_microk_power10.c b/kernel/power/srot_microk_power10.c new file mode 100644 index 000000000..c54c30742 --- /dev/null +++ b/kernel/power/srot_microk_power10.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void srot_kernel_16 (long n, float *x, float *y, float c, float s) +{ + __asm__ + ( + "xscvdpspn 36, %x5 \n\t" // load c to all words + "xxspltw 36, 36, 0 \n\t" + + "xscvdpspn 37, %x6 \n\t" // load s to all words + "xxspltw 37, 37, 0 \n\t" + "lxvp 32, 0(%3) \n\t" // load x + "lxvp 34, 32(%3) \n\t" + "lxvp 48, 0(%4) \n\t" // load y + "lxvp 50, 32(%4) \n\t" + + "addic. 
%2, %2, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + + "lxvp 32, 64(%3) \n\t" // load x + "lxvp 34, 96(%3) \n\t" + "xvmulsp 52, 48, 36 \n\t" // c * y + "xvmulsp 53, 49, 36 \n\t" + "xvmulsp 54, 50, 36 \n\t" + "xvmulsp 55, 51, 36 \n\t" + + "xvmulsp 38, 48, 37 \n\t" // s * y + "xvmulsp 39, 49, 37 \n\t" + "xvmulsp 56, 50, 37 \n\t" + "xvmulsp 57, 51, 37 \n\t" + + "lxvp 48, 64(%4) \n\t" // load y + "lxvp 50, 96(%4) \n\t" + + "xvaddsp 40, 40, 38 \n\t" // c * x + s * y + "xvaddsp 41, 41, 39 \n\t" // c * x + s * y + "xvaddsp 42, 42, 56 \n\t" // c * x + s * y + "xvaddsp 43, 43, 57 \n\t" // c * x + s * y + + "stxvp 40, 0(%3) \n\t" // store x + "stxvp 42, 32(%3) \n\t" + + "xvsubsp 52, 52, 44 \n\t" // c * y - s * x + "xvsubsp 53, 53, 45 \n\t" // c * y - s * x + "xvsubsp 54, 54, 46 \n\t" // c * y - s * x + "xvsubsp 55, 55, 47 \n\t" // c * y - s * x + + "stxvp 52, 0(%4) \n\t" // store y + "stxvp 54, 32(%4) \n\t" + + "addi %3, %3, 64 \n\t" + "addi %4, %4, 64 \n\t" + + "addic. %2, %2, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + + "xvmulsp 52, 48, 36 \n\t" // c * y + "xvmulsp 53, 49, 36 \n\t" + "xvmulsp 54, 50, 36 \n\t" + "xvmulsp 55, 51, 36 \n\t" + + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + + "xvmulsp 38, 48, 37 \n\t" // s * y + "xvmulsp 39, 49, 37 \n\t" + "xvmulsp 56, 50, 37 \n\t" + "xvmulsp 57, 51, 37 \n\t" + + "xvaddsp 40, 40, 38 \n\t" // c * x + s * y + "xvaddsp 41, 41, 39 \n\t" // c * x + s * y + "xvaddsp 42, 42, 56 \n\t" // c * x + s * y + "xvaddsp 43, 43, 57 \n\t" // c * x + s * y + + "stxvp 40, 0(%3) \n\t" // store x + "stxvp 42, 32(%3) \n\t" + "xvsubsp 52, 52, 44 \n\t" // c * y - s * x + "xvsubsp 53, 53, 45 \n\t" // c * y - s * x + "xvsubsp 54, 54, 46 \n\t" // c * y - s * x + "xvsubsp 55, 55, 47 \n\t" // c * y - s * x + + "stxvp 52, 0(%4) \n\t" // store y + "stxvp 54, 32(%4) \n\t" + + "#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + "f" (c), // 5 + "f" (s) // 6 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57" + ); +} From 2e8d6e869030843fec421831ed6388d84915c7c7 Mon Sep 17 00:00:00 2001 From: xoviat Date: Sat, 23 Jan 2021 22:12:17 -0600 Subject: [PATCH 078/681] add functions for embedded --- driver/others/memory.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/driver/others/memory.c b/driver/others/memory.c index 2fb1f1f73..b430fd5c1 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1673,6 +1673,11 @@ void gotoblas_dummy_for_PGI(void) { #define ALLOC_MALLOC #else #define ALLOC_MALLOC + +inline int puts(const char *str) { return 0; } +inline int printf(const char *format, ...) 
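+/* reviewer annotation, not in the original patch: a no-op stub like puts()
+   above -- freestanding/embedded targets have no console to print to */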
{ return 0; } +inline char *getenv(const char *name) { return ""; } +inline int atoi(const char *str) { return 0; } #endif #include From 3ede843d509a95b0d63f58484ab8977cb2ddc39f Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Sun, 24 Jan 2021 07:48:28 -0600 Subject: [PATCH 079/681] Optimize s/dscal function for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. --- kernel/power/dscal.c | 36 +++++++- kernel/power/dscal_microk_power10.c | 134 +++++++++++++++++++++++++++ kernel/power/sscal.c | 36 +++++++- kernel/power/sscal_microk_power10.c | 135 ++++++++++++++++++++++++++++ 4 files changed, 339 insertions(+), 2 deletions(-) create mode 100644 kernel/power/dscal_microk_power10.c create mode 100644 kernel/power/sscal_microk_power10.c diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 39293252b..96c4e51bc 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "dscal_microk_power8.c" +#elif defined(POWER10) +#include "dscal_microk_power10.c" #endif #endif @@ -100,12 +102,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; + for (j = 0; j < align; j++) { + x[j] = 0.0; + } + } + BLASLONG n1 = (n-j) & -16; + if ( n1 > 0 ) + { + dscal_kernel_8_zero(n1, &x[j]); + j+=n1; + } +#else BLASLONG n1 = n & -16; if ( n1 > 0 ) { dscal_kernel_8_zero(n1, x); j=n1; } +#endif while(j < n) { @@ -118,12 +136,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; + for (j = 0; j < align; j++) { + x[j] = da * x[j]; + } + } + BLASLONG n1 = (n-j) & -16; + if ( n1 > 0 ) + { + dscal_kernel_8(n1, &x[j], da); + j+=n1; + } +#else BLASLONG n1 = n & -16; if ( n1 > 0 ) { dscal_kernel_8(n1, x, da); j=n1; } +#endif while(j < n) { diff --git a/kernel/power/dscal_microk_power10.c b/kernel/power/dscal_microk_power10.c new file mode 100644 index 000000000..d0d506f24 --- /dev/null +++ b/kernel/power/dscal_microk_power10.c @@ -0,0 +1,134 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void dscal_kernel_8 (long n, double *x, double alpha) +{ + __asm__ + ( + "dcbt 0, %2 \n\t" + + XXSPLTD_S(48,%x3,0) + + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + + "addic. %1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmuldp 40, 32, 48 \n\t" + "xvmuldp 41, 33, 48 \n\t" + "xvmuldp 42, 34, 48 \n\t" + "xvmuldp 43, 35, 48 \n\t" + "lxvp 32, 128(%2) \n\t" + "lxvp 34, 160(%2) \n\t" + "xvmuldp 44, 36, 48 \n\t" + "xvmuldp 45, 37, 48 \n\t" + "xvmuldp 46, 38, 48 \n\t" + "xvmuldp 47, 39, 48 \n\t" + "lxvp 36, 192(%2) \n\t" + "lxvp 38, 224(%2) \n\t" + + "stxvp 40, 0(%2) \n\t" + "stxvp 42, 32(%2) \n\t" + "stxvp 44, 64(%2) \n\t" + "stxvp 46, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmuldp 40, 32, 48 \n\t" + "xvmuldp 41, 33, 48 \n\t" + "xvmuldp 42, 34, 48 \n\t" + "xvmuldp 43, 35, 48 \n\t" + + "xvmuldp 44, 36, 48 \n\t" + "xvmuldp 45, 37, 48 \n\t" + "xvmuldp 46, 38, 48 \n\t" + "xvmuldp 47, 39, 48 \n\t" + + "stxvp 40, 0(%2) \n\t" + "stxvp 42, 32(%2) \n\t" + "stxvp 44, 64(%2) \n\t" + "stxvp 46, 96(%2) \n\t" + + "#n=%1 alpha=%3 x=%0=%2" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + "d" (alpha) // 3 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48" + ); +} + + +static void dscal_kernel_8_zero (long n, double *x) +{ + + __asm__ + ( + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxvp 32, 0(%2) \n\t" + "stxvp 32, 32(%2) \n\t" + "stxvp 32, 64(%2) \n\t" + "stxvp 32, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "#n=%1 x=%0=%2 " + : + "=m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + : + "cr0","vs32","vs33" + ); +} diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index de37e10a5..65572a8c1 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
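[reviewer note, not part of the patch] For readers who do not speak POWER10
assembly, the two dscal kernels above reduce to the following plain-C
reference (the *_ref names are illustrative only; both call sites guarantee
that n is a positive multiple of 16):

    static void dscal_kernel_8_ref (long n, double *x, double alpha)
    {
      /* what the lxvp/xvmuldp/stxvp loop computes, 16 doubles per pass */
      for (long i = 0; i < n; i++) x[i] *= alpha;
    }

    static void dscal_kernel_8_zero_ref (long n, double *x)
    {
      /* what the stxvp-of-zeroed-registers loop computes */
      for (long i = 0; i < n; i++) x[i] = 0.0;
    }

The sscal.c change that follows applies the same pattern, 32 floats per
block.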
#include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "sscal_microk_power8.c" +#elif defined(POWER10) +#include "sscal_microk_power10.c" #endif #endif @@ -102,12 +104,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; + for (j = 0; j < align; j++) { + x[j] = 0.0; + } + } + BLASLONG n1 = (n-j) & -32; + if ( n1 > 0 ) + { + sscal_kernel_16_zero(n1, &x[j]); + j+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { sscal_kernel_16_zero(n1, x); j=n1; } +#endif while(j < n) { @@ -120,12 +138,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; + for (j = 0; j < align; j++) { + x[j] = da * x[j]; + } + } + BLASLONG n1 = (n-j) & -32; + if ( n1 > 0 ) + { + sscal_kernel_16(n1, &x[j], da); + j+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { sscal_kernel_16(n1, x, da); j=n1; } +#endif while(j < n) { diff --git a/kernel/power/sscal_microk_power10.c b/kernel/power/sscal_microk_power10.c new file mode 100644 index 000000000..a523a1675 --- /dev/null +++ b/kernel/power/sscal_microk_power10.c @@ -0,0 +1,135 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void sscal_kernel_16 (long n, float *x, float alpha) +{ + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xscvdpspn 48, %x3 \n\t" + "xxspltw 48, 48, 0 \n\t" + + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + + "addic. 
%1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmulsp 40, 32, 48 \n\t" + "xvmulsp 41, 33, 48 \n\t" + "xvmulsp 42, 34, 48 \n\t" + "xvmulsp 43, 35, 48 \n\t" + "lxvp 32, 128(%2) \n\t" + "lxvp 34, 160(%2) \n\t" + "xvmulsp 44, 36, 48 \n\t" + "xvmulsp 45, 37, 48 \n\t" + "xvmulsp 46, 38, 48 \n\t" + "xvmulsp 47, 39, 48 \n\t" + "lxvp 36, 192(%2) \n\t" + "lxvp 38, 224(%2) \n\t" + + "stxvp 40, 0(%2) \n\t" + "stxvp 42, 32(%2) \n\t" + "stxvp 44, 64(%2) \n\t" + "stxvp 46, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmulsp 40, 32, 48 \n\t" + "xvmulsp 41, 33, 48 \n\t" + "xvmulsp 42, 34, 48 \n\t" + "xvmulsp 43, 35, 48 \n\t" + + "xvmulsp 44, 36, 48 \n\t" + "xvmulsp 45, 37, 48 \n\t" + "xvmulsp 46, 38, 48 \n\t" + "xvmulsp 47, 39, 48 \n\t" + + "stxvp 40, 0(%2) \n\t" + "stxvp 42, 32(%2) \n\t" + "stxvp 44, 64(%2) \n\t" + "stxvp 46, 96(%2) \n\t" + + "#n=%1 alpha=%3 x=%0=%2" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + "f" (alpha) // 3 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48" + ); +} + + +static void sscal_kernel_16_zero (long n, float *x) +{ + + __asm__ + ( + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxvp 32, 0(%2) \n\t" + "stxvp 32, 32(%2) \n\t" + "stxvp 32, 64(%2) \n\t" + "stxvp 32, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "bgt one%= \n" + + "#n=%1 x=%0=%2 " + : + "=m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + : + "cr0","vs32","vs33" + ); +} From 9b2d69aa80b72f9958860a5e8bcadb89f0e81045 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 24 Jan 2021 23:18:01 +0100 Subject: [PATCH 080/681] Add DYNAMIC_LIST option for ARM64 --- Makefile.system | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.system b/Makefile.system index abc2c3dc5..848c38797 100644 --- a/Makefile.system +++ b/Makefile.system @@ -625,6 +625,11 @@ DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 DYNAMIC_CORE += EMAG8180 DYNAMIC_CORE += THUNDERX3T110 +ifdef DYNAMIC_LIST +override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST) +XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8 +XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) +endif endif ifeq ($(ARCH), mips64) From deb2e66bcce70c64b1e1d82612b24191563dedb5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 24 Jan 2021 23:18:52 +0100 Subject: [PATCH 081/681] Add DYNAMIC_LIST support for ARM64 --- driver/others/dynamic_arm64.c | 80 ++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 37c0694b6..a86a95890 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -41,8 +41,75 @@ #include #include #endif +#ifdef OS_DARWIN +#include +int32_t value; +size_t length=sizeof(value); +#endif extern gotoblas_t gotoblas_ARMV8; +#ifdef DYNAMIC_LIST +#ifdef DYN_CORTEXA53 +extern gotoblas_t gotoblas_CORTEXA53; +#else +#define gotoblas_CORTEXA53 gotoblas_ARMV8 +#endif +#ifdef DYN_CORTEXA57 +extern gotoblas_t gotoblas_CORTEXA57; +#else +#define gotoblas_CORTEXA57 gotoblas_ARMV8 +#endif +#ifdef DYN_CORTEXA72 +extern gotoblas_t gotoblas_CORTEXA72; +#else +#define gotoblas_CORTEXA72 gotoblas_ARMV8 +#endif +#ifdef DYN_CORTEXA73 +extern gotoblas_t gotoblas_CORTEXA73; +#else +#define gotoblas_CORTEXA73 gotoblas_ARMV8 +#endif +#ifdef DYN_FALKOR +extern gotoblas_t gotoblas_FALKOR; 
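+/* reviewer annotation: the same two-way pattern repeats for every core in
+   this file -- a core selected via DYN_<CORE> gets its real gotoblas table,
+   anything omitted from DYNAMIC_LIST aliases to the generic ARMV8 one */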
+#else +#define gotoblas_FALKOR gotoblas_ARMV8 +#endif +#ifdef DYN_TSV110 +extern gotoblas_t gotoblas_TSV110; +#else +#define gotoblas_TSV110 gotoblas_ARMV8 +#endif +#ifdef DYN_THUNDERX +extern gotoblas_t gotoblas_THUNDERX; +#else +#define gotoblas_THUNDERX gotoblas_ARMV8 +#endif +#ifdef DYN_THUNDERX2T99 +extern gotoblas_t gotoblas_THUNDERX2T99; +#else +#define gotoblas_THUNDERX2T99 gotoblas_ARMV8 +#endif +#ifdef DYN_THUNDERX3T110 +extern gotoblas_t gotoblas_THUNDERX3T110; +#else +#define gotoblas_THUNDERX3T110 gotoblas_ARMV8 +#endif +#ifdef DYN_EMAG8180 +extern gotoblas_t gotoblas_EMAG8180; +#else +#define gotoblas_EMAG8180 gotoblas_ARMV8 +#endif +#ifdef DYN_NEOVERSEN1 +extern gotoblas_t gotoblas_NEOVERSEN1; +#else +#define gotoblas_NEOVERSEN1 gotoblas_ARMV8 +#endif +#ifdef DYN_VORTEX +extern gotoblas_t gotoblas_VORTEX; +#else +#define gotoblas_VORTEX gotoblas_ARMV8 +#endif +#else extern gotoblas_t gotoblas_CORTEXA53; extern gotoblas_t gotoblas_CORTEXA57; extern gotoblas_t gotoblas_CORTEXA72; @@ -54,10 +121,12 @@ extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_EMAG8180; extern gotoblas_t gotoblas_NEOVERSEN1; extern gotoblas_t gotoblas_THUNDERX3T110; +extern gotoblas_t gotoblas_VORTEX; +#endif extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 12 +#define NUM_CORETYPES 13 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -68,7 +137,7 @@ extern void openblas_warning(int verbose, const char * msg); #endif #define get_cpu_ftr(id, var) ({ \ - __asm__ __volatile__("mrs %0, "#id : "=r" (var)); \ + __asm__ ("mrs %0, "#id : "=r" (var)); \ }) static char *corename[] = { @@ -84,6 +153,7 @@ static char *corename[] = { "emag8180", "neoversen1", "thunderx3t110", + "vortex", "unknown" }; @@ -100,6 +170,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; + if (gotoblas == &gotoblas_VORTEX) return corename[12]; return corename[NUM_CORETYPES]; } @@ -131,6 +202,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 9: return (&gotoblas_EMAG8180); case 10: return (&gotoblas_NEOVERSEN1); case 11: return (&gotoblas_THUNDERX3T110); + case 12: return (&gotoblas_VORTEX); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -142,6 +214,10 @@ static gotoblas_t *get_coretype(void) { char coremsg[128]; #if (!defined OS_LINUX && !defined OS_ANDROID) +#ifdef DARWIN + sysctlbyname("hw.cpufamily",&value,&length,NULL,0); + if (value ==131287967) return CPU_VORTEX; +#endif return NULL; #else From 113840da12828418dedeb1392d55e45ae6a2a674 Mon Sep 17 00:00:00 2001 From: Alex Henrie Date: Sun, 24 Jan 2021 22:20:44 -0700 Subject: [PATCH 082/681] Fix null pointer check in blas_memory_alloc --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index f0521ab2d..91d21a88e 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1241,7 +1241,7 @@ UNLOCK_COMMAND(&alloc_lock); func = &memoryalloc[0]; - while ((func != NULL) && (map_address == (void *) -1)) { + while ((*func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); From cb61d3b46bb65787bff8452cd384e047c2f5687d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 25 Jan 2021 13:13:20 +0100 Subject: [PATCH 083/681] Add DYNAMIC_LIST support for ARM64 --- 
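[reviewer note] Together with the Makefile.system change from patch 080 and
the fallback defines from patch 081, this lets a trimmed multi-target build
be requested with, for example, make DYNAMIC_ARCH=1 DYNAMIC_LIST="CORTEXA57
THUNDERX2T99" (core names illustrative); anything not listed dispatches to
the generic ARMV8 kernels at runtime.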
cmake/arch.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 5457bfb07..4451f9eaa 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -45,6 +45,9 @@ endif () if (DYNAMIC_ARCH) if (ARM64) set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + if (DYNAMIC_LIST) + set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) + endif () endif () if (POWER) From 0cb9e9fc8d5b56eb0db42136dd8268671438ad27 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 25 Jan 2021 19:02:21 +0100 Subject: [PATCH 084/681] Remove the VORTEX support bits again for now --- driver/others/dynamic_arm64.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index a86a95890..6c68ba98a 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -41,11 +41,6 @@ #include #include #endif -#ifdef OS_DARWIN -#include -int32_t value; -size_t length=sizeof(value); -#endif extern gotoblas_t gotoblas_ARMV8; #ifdef DYNAMIC_LIST @@ -104,11 +99,6 @@ extern gotoblas_t gotoblas_NEOVERSEN1; #else #define gotoblas_NEOVERSEN1 gotoblas_ARMV8 #endif -#ifdef DYN_VORTEX -extern gotoblas_t gotoblas_VORTEX; -#else -#define gotoblas_VORTEX gotoblas_ARMV8 -#endif #else extern gotoblas_t gotoblas_CORTEXA53; extern gotoblas_t gotoblas_CORTEXA57; @@ -121,12 +111,11 @@ extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_EMAG8180; extern gotoblas_t gotoblas_NEOVERSEN1; extern gotoblas_t gotoblas_THUNDERX3T110; -extern gotoblas_t gotoblas_VORTEX; #endif extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 13 +#define NUM_CORETYPES 12 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -153,7 +142,6 @@ static char *corename[] = { "emag8180", "neoversen1", "thunderx3t110", - "vortex", "unknown" }; @@ -170,7 +158,6 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; - if (gotoblas == &gotoblas_VORTEX) return corename[12]; return corename[NUM_CORETYPES]; } @@ -202,7 +189,6 @@ static gotoblas_t *force_coretype(char *coretype) { case 9: return (&gotoblas_EMAG8180); case 10: return (&gotoblas_NEOVERSEN1); case 11: return (&gotoblas_THUNDERX3T110); - case 12: return (&gotoblas_VORTEX); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -214,10 +200,6 @@ static gotoblas_t *get_coretype(void) { char coremsg[128]; #if (!defined OS_LINUX && !defined OS_ANDROID) -#ifdef DARWIN - sysctlbyname("hw.cpufamily",&value,&length,NULL,0); - if (value ==131287967) return CPU_VORTEX; -#endif return NULL; #else From 856bc365338f7559639f341d76ca8746d1628ee5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 27 Jan 2021 13:41:45 +0100 Subject: [PATCH 085/681] Add exceptional shift to fix rare convergence problems --- lapack-netlib/SRC/chgeqz.f | 10 ++++++++-- lapack-netlib/SRC/zhgeqz.f | 10 ++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/chgeqz.f b/lapack-netlib/SRC/chgeqz.f index 73d35621c..1616840ec 100644 --- a/lapack-netlib/SRC/chgeqz.f +++ b/lapack-netlib/SRC/chgeqz.f @@ -743,8 +743,14 @@ * * Exceptional shift. Chosen for no particularly good reason. 
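[reviewer note, not part of the patch] The change below (mirrored in
zhgeqz.f) replaces the single hard-coded exceptional shift: on every 20th
iteration, when BSCALE*|T(ILAST,ILAST)| is safely above SAFMIN, the shift is
taken from the trailing diagonal ratio H(ILAST,ILAST)/T(ILAST,ILAST) rather
than the subdiagonal form, breaking the rare cycles in which the old choice
kept repeating and QZ failed to converge.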
* - ESHIFT = ESHIFT + (ASCALE*H(ILAST,ILAST-1))/ - $ (BSCALE*T(ILAST-1,ILAST-1)) + IF( ( IITER / 20 )*20.EQ.IITER .AND. + $ BSCALE*ABS1(T( ILAST, ILAST )).GT.SAFMIN ) THEN + ESHIFT = ESHIFT + ( ASCALE*H( ILAST, + $ ILAST ) )/( BSCALE*T( ILAST, ILAST ) ) + ELSE + ESHIFT = ESHIFT + ( ASCALE*H( ILAST, + $ ILAST-1 ) )/( BSCALE*T( ILAST-1, ILAST-1 ) ) + END IF SHIFT = ESHIFT END IF * diff --git a/lapack-netlib/SRC/zhgeqz.f b/lapack-netlib/SRC/zhgeqz.f index b51cba4f7..b21199e9e 100644 --- a/lapack-netlib/SRC/zhgeqz.f +++ b/lapack-netlib/SRC/zhgeqz.f @@ -744,8 +744,14 @@ * * Exceptional shift. Chosen for no particularly good reason. * - ESHIFT = ESHIFT + (ASCALE*H(ILAST,ILAST-1))/ - $ (BSCALE*T(ILAST-1,ILAST-1)) + IF( ( IITER / 20 )*20.EQ.IITER .AND. + $ BSCALE*ABS1(T( ILAST, ILAST )).GT.SAFMIN ) THEN + ESHIFT = ESHIFT + ( ASCALE*H( ILAST, + $ ILAST ) )/( BSCALE*T( ILAST, ILAST ) ) + ELSE + ESHIFT = ESHIFT + ( ASCALE*H( ILAST, + $ ILAST-1 ) )/( BSCALE*T( ILAST-1, ILAST-1 ) ) + END IF SHIFT = ESHIFT END IF * From 3165c915b6a0cd8f5104cd012d3189bdef206d63 Mon Sep 17 00:00:00 2001 From: xoviat Date: Wed, 27 Jan 2021 15:24:49 -0600 Subject: [PATCH 086/681] fix test helpers --- CMakeLists.txt | 2 +- lapack-netlib/TESTING/CMakeLists.txt | 198 ++++++++++++++------------- test/CMakeLists.txt | 4 +- 3 files changed, 109 insertions(+), 95 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c5ba3ceed..9c992a08b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -229,7 +229,7 @@ if (NOT NO_CBLAS) add_subdirectory(utest) endif() -if (NOT MSVC AND NOT NOFORTRAN) +if (NOT NOFORTRAN) # Build test and ctest add_subdirectory(test) if(NOT NO_CBLAS) diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index 80e6b3232..b4e2223f7 100644 --- a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -174,7 +174,20 @@ if(PYTHONINTERP_FOUND) endif() - +if(WIN32) +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1 +"if (Test-Path $args[2]) { Remove-Item -Force $args[2] } \n" +"$ErrorActionPreference = \"Stop\"\n" +"Get-Content $args[1] | & \"$($args[0]).exe\" | Out-File $args[2]\n" +"If ((Get-Content $args[2] | %{$_ -match \"FATAL\"}) -contains $true) {\n" +"echo Error\n" +"exit 1\n" +"} else {\n" +"exit 0\n" +"}\n" +) +set(helper_prefix powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1") +else() # $1 exec, $2 input, $3 output_result FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "rm -f $3\n" @@ -187,51 +200,52 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "exit 0\n" "fi\n" ) - +set(helper_prefix sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh") +endif() add_test(NAME "REAL_LAPACK_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out" ) add_test(NAME "COMPLEX_LAPACK_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out" ) add_test(NAME 
"DOUBLE_PRECISION_LAPACK_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out" ) add_test(NAME "COMPLEX16_LAPACK_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out" ) add_test(NAME "SINGLE-DOUBLE_PRECISION_LAPACK_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" " ${CMAKE_CURRENT_BINARY_DIR}/dstest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" " ${CMAKE_CURRENT_BINARY_DIR}/dstest.out" ) # ======== COMPLEX-COMPLEX16 LIN TESTS ======================== add_test(NAME "Testing_COMPLEX-COMPLEX16_LAPACK_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" " ${CMAKE_CURRENT_BINARY_DIR}/zctest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" " ${CMAKE_CURRENT_BINARY_DIR}/zctest.out" ) # ======== SINGLE RFP LIN TESTS ======================== add_test(NAME "Testing_REAL_LAPACK_RFP_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out" ) # ======== COMPLEX16 RFP LIN TESTS ======================== add_test(NAME "Testing_DOUBLE_PRECISION_LAPACK_RFP_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out" ) # ======== COMPLEX16 RFP LIN TESTS ======================== add_test(NAME "Testing_COMPLEX_LAPACK_RFP_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out" ) # ======== COMPLEX16 RFP LIN TESTS ======================== add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" - COMMAND sh 
"${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out" ) # # @@ -239,327 +253,327 @@ add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" # add_test(NAME "SNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/snep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/snep.out" ) add_test(NAME "SSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out" ) add_test(NAME "SSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out" ) add_test(NAME "SSVD:_Testing_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out" ) add_test(NAME "SSEC:_Testing_REAL_Eigen_Condition_Routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out" ) add_test(NAME "SSEV:_Testing_REAL_Nonsymmetric_Eigenvalue_Driver" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out" ) add_test(NAME "SGG:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out" ) add_test(NAME 
"SGD:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out" ) add_test(NAME "SSB:_Testing_REAL_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out" ) add_test(NAME "SSG:_Testing_REAL_Symmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out" ) add_test(NAME "SGEBAL:_Testing_the_balancing_of_a_REAL_general_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out" ) add_test(NAME "SGEBAK:_Testing_the_back_transformation_of_a_REAL_balanced_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out" ) add_test(NAME "SGGBAL:_Testing_the_balancing_of_a_pair_of_REAL_general_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out" ) add_test(NAME "SGGBAK:_Testing_the_back_transformation_of_a_pair_of_REAL_balanced_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbak.out" ) add_test(NAME "SBB:_Testing_banded_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out" ) add_test(NAME 
"SGLM:_Testing_Generalized_Linear_Regression_Model_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out" ) add_test(NAME "SGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out" ) add_test(NAME "SGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out" ) add_test(NAME "SCSD:_Testing_CS_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out" ) add_test(NAME "SLSE:_Testing_Constrained_Linear_Least_Squares_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out" ) # ======== COMPLEX EIG TESTS =========================== add_test(NAME "CNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out" ) add_test(NAME "CSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out" ) add_test(NAME "CSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out" ) add_test(NAME "CSVD:_Testing_Singular_Value_Decomposition_routines" - 
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out" ) add_test(NAME "CEC:_Testing_COMPLEX_Eigen_Condition_Routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out" ) add_test(NAME "CES:_Testing_COMPLEX_Nonsymmetric_Schur_Form_Driver" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out" ) add_test(NAME "CGG:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgg.out" ) add_test(NAME "CGD:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out" ) add_test(NAME "CHB:_Testing_Hermitian_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out" ) add_test(NAME "CSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out" ) add_test(NAME "CGEBAL:_Testing_the_balancing_of_a_COMPLEX_general_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out" ) add_test(NAME "CGEBAK:_Testing_the_back_transformation_of_a_COMPLEX_balanced_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" 
"${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out" ) add_test(NAME "CGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out" ) add_test(NAME "CGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX_balanced_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out" ) add_test(NAME "CBB:_Testing_banded_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out" ) add_test(NAME "CGLM:_Testing_Generalized_Linear_Regression_Model_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out" ) add_test(NAME "CGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out" ) add_test(NAME "CGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out" ) add_test(NAME "CCSD:_Testing_CS_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out" ) add_test(NAME "CLSE:_Testing_Constrained_Linear_Least_Squares_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out" ) # ======== DOUBLE EIG TESTS =========================== add_test(NAME "DNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out" ) add_test(NAME "DSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out" ) add_test(NAME "DSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out" ) add_test(NAME "DSVD:_Testing_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out" ) add_test(NAME "DEC:_Testing_DOUBLE_PRECISION_Eigen_Condition_Routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out" ) add_test(NAME "DEV:_Testing_DOUBLE_PRECISION_Nonsymmetric_Eigenvalue_Driver" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out" ) add_test(NAME "DGG:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out" ) add_test(NAME "DGD:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out" ) add_test(NAME "DSB:_Testing_DOUBLE_PRECISION_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out" ) add_test(NAME "DSG:_Testing_DOUBLE_PRECISION_Symmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out" ) add_test(NAME "DGEBAL:_Testing_the_balancing_of_a_DOUBLE_PRECISION_general_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out" ) add_test(NAME "DGEBAK:_Testing_the_back_transformation_of_a_DOUBLE_PRECISION_balanced_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out" ) add_test(NAME "DGGBAL:_Testing_the_balancing_of_a_pair_of_DOUBLE_PRECISION_general_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out" ) add_test(NAME "DGGBAK:_Testing_the_back_transformation_of_a_pair_of_DOUBLE_PRECISION_balanced_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out" ) add_test(NAME "DBB:_Testing_banded_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out" ) add_test(NAME "DGLM:_Testing_Generalized_Linear_Regression_Model_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" 
"${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out" ) add_test(NAME "DGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out" ) add_test(NAME "DGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out" ) add_test(NAME "DCSD:_Testing_CS_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out" ) add_test(NAME "DLSE:_Testing_Constrained_Linear_Least_Squares_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out" ) # ======== COMPLEX16 EIG TESTS =========================== add_test(NAME "ZNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out" ) add_test(NAME "ZSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out" ) add_test(NAME "ZSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out" ) add_test(NAME "ZSVD:_Testing_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out" ) add_test(NAME "ZEC:_Testing_COMPLEX16_Eigen_Condition_Routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out" ) add_test(NAME "ZES:_Testing_COMPLEX16_Nonsymmetric_Schur_Form_Driver" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out" ) add_test(NAME "ZGG:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out" ) add_test(NAME "ZGD:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out" ) add_test(NAME "ZHB:_Testing_Hermitian_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out" ) add_test(NAME "ZSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out" ) add_test(NAME "ZGEBAL:_Testing_the_balancing_of_a_COMPLEX16_general_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out" ) add_test(NAME "ZGEBAK:_Testing_the_back_transformation_of_a_COMPLEX16_balanced_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " 
${CMAKE_CURRENT_BINARY_DIR}/zbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out" ) add_test(NAME "ZGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out" ) add_test(NAME "ZGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX16_balanced_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out" ) add_test(NAME "ZBB:_Testing_banded_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out" ) add_test(NAME "ZGLM:_Testing_Generalized_Linear_Regression_Model_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out" ) add_test(NAME "ZGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out" ) add_test(NAME "ZGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out" ) add_test(NAME "ZCSD:_Testing_CS_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out" ) add_test(NAME "Constrained_Linear_Least_Squares_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out" + COMMAND ${helper_prefix} 
"${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out" ) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ccd1175a3..d338242ff 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -24,10 +24,10 @@ endforeach() # $1 exec, $2 input, $3 output_result if(WIN32) FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1 -"Remove-Item -Force $args[2]\n" +"if (Test-Path $args[2]) { Remove-Item -Force $args[2] } \n" "$ErrorActionPreference = \"Stop\"\n" "Get-Content $args[1] | & $args[0]\n" -"If (Get-Content $args[2] | %{$_ -match \"FATAL\"}) {\n" +"If ((Get-Content $args[2] | %{$_ -match \"FATAL\"}) -contains $true) {\n" "echo Error\n" "exit 1\n" "} else {\n" From 3dfecaaf7cd86932117870dc6764a38c4006ed85 Mon Sep 17 00:00:00 2001 From: xoviat Date: Wed, 27 Jan 2021 16:39:15 -0600 Subject: [PATCH 087/681] require nofortran to be set on msvc --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9c992a08b..4f34d5337 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,9 @@ include(GNUInstallDirs) include(CMakePackageConfigHelpers) +if(MSVC AND NOT DEFINED NOFORTRAN) + set(NOFORTRAN ON) +endif() ####### if(MSVC) From 609ea8027632bbe878be8e5db08be08996062732 Mon Sep 17 00:00:00 2001 From: xoviat Date: Wed, 27 Jan 2021 16:39:52 -0600 Subject: [PATCH 088/681] enable testing --- appveyor.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 1936059d5..a18a41960 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -76,7 +76,5 @@ build_script: - cmake --build . test_script: - - echo Running Test - - cd utest - - openblas_utest + - ctest -j2 From f87842483eee9d158f44d51d4c09662c3cff7526 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 29 Jan 2021 09:56:12 +0100 Subject: [PATCH 089/681] fix calculation of non-exceptional shift (from Reference-LAPACK PR 477) --- lapack-netlib/SRC/chgeqz.f | 27 +++++++++++++++++---------- lapack-netlib/SRC/zhgeqz.f | 27 +++++++++++++++++---------- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/lapack-netlib/SRC/chgeqz.f b/lapack-netlib/SRC/chgeqz.f index 1616840ec..0d3787915 100644 --- a/lapack-netlib/SRC/chgeqz.f +++ b/lapack-netlib/SRC/chgeqz.f @@ -320,12 +320,13 @@ $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, - $ U12, X + $ U12, X, ABI12, Y * .. * .. External Functions .. + COMPLEX CLADIV LOGICAL LSAME REAL CLANHS, SLAMCH - EXTERNAL LSAME, CLANHS, SLAMCH + EXTERNAL CLADIV, LLSAME, CLANHS, SLAMCH * .. * .. External Subroutines .. 
EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA @@ -729,15 +730,21 @@ AD22 = ( ASCALE*H( ILAST, ILAST ) ) / $ ( BSCALE*T( ILAST, ILAST ) ) ABI22 = AD22 - U12*AD21 + ABI12 = AD12 - U12*AD11 * - T1 = HALF*( AD11+ABI22 ) - RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 ) - TEMP = REAL( T1-ABI22 )*REAL( RTDISC ) + - $ AIMAG( T1-ABI22 )*AIMAG( RTDISC ) - IF( TEMP.LE.ZERO ) THEN - SHIFT = T1 + RTDISC - ELSE - SHIFT = T1 - RTDISC + SHIFT = ABI22 + CTEMP = SQRT( ABI12 )*SQRT( AD21 ) + TEMP = ABS1( CTEMP ) + IF( CTEMP.NE.ZERO ) THEN + X = HALF*( AD11-SHIFT ) + TEMP2 = ABS1( X ) + TEMP = MAX( TEMP, ABS1( X ) ) + Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 ) + IF( TEMP2.GT.ZERO ) THEN + IF( DBLE( X / TEMP2 )*DBLE( Y )+ + $ DIMAG( X / TEMP2 )*DIMAG( Y ).LT.ZERO )Y = -Y + END IF + SHIFT = SHIFT - CTEMP*CLADIV( CTEMP, ( X+Y ) ) END IF ELSE * diff --git a/lapack-netlib/SRC/zhgeqz.f b/lapack-netlib/SRC/zhgeqz.f index b21199e9e..b28ae47a4 100644 --- a/lapack-netlib/SRC/zhgeqz.f +++ b/lapack-netlib/SRC/zhgeqz.f @@ -320,12 +320,13 @@ $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, - $ U12, X + $ U12, X, ABI12, Y * .. * .. External Functions .. + COMPLEX*16 ZLADIV LOGICAL LSAME DOUBLE PRECISION DLAMCH, ZLANHS - EXTERNAL LSAME, DLAMCH, ZLANHS + EXTERNAL ZLADIV, LSAME, DLAMCH, ZLANHS * .. * .. External Subroutines .. EXTERNAL XERBLA, ZLARTG, ZLASET, ZROT, ZSCAL @@ -730,15 +731,21 @@ AD22 = ( ASCALE*H( ILAST, ILAST ) ) / $ ( BSCALE*T( ILAST, ILAST ) ) ABI22 = AD22 - U12*AD21 + ABI12 = AD12 - U12*AD11 * - T1 = HALF*( AD11+ABI22 ) - RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 ) - TEMP = DBLE( T1-ABI22 )*DBLE( RTDISC ) + - $ DIMAG( T1-ABI22 )*DIMAG( RTDISC ) - IF( TEMP.LE.ZERO ) THEN - SHIFT = T1 + RTDISC - ELSE - SHIFT = T1 - RTDISC + SHIFT = ABI22 + CTEMP = SQRT( ABI12 )*SQRT( AD21 ) + TEMP = ABS1( CTEMP ) + IF( CTEMP.NE.ZERO ) THEN + X = HALF*( AD11-SHIFT ) + TEMP2 = ABS1( X ) + TEMP = MAX( TEMP, ABS1( X ) ) + Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 ) + IF( TEMP2.GT.ZERO ) THEN + IF( DBLE( X / TEMP2 )*DBLE( Y )+ + $ DIMAG( X / TEMP2 )*DIMAG( Y ).LT.ZERO )Y = -Y + END IF + SHIFT = SHIFT - CTEMP*ZLADIV( CTEMP, ( X+Y ) ) END IF ELSE * From c4b5abbe43d7c22215ef36ef4f7c1413c975678c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 29 Jan 2021 10:45:36 +0100 Subject: [PATCH 090/681] fix data type --- lapack-netlib/SRC/chgeqz.f | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/SRC/chgeqz.f b/lapack-netlib/SRC/chgeqz.f index 0d3787915..4725e7169 100644 --- a/lapack-netlib/SRC/chgeqz.f +++ b/lapack-netlib/SRC/chgeqz.f @@ -741,8 +741,8 @@ TEMP = MAX( TEMP, ABS1( X ) ) Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 ) IF( TEMP2.GT.ZERO ) THEN - IF( DBLE( X / TEMP2 )*DBLE( Y )+ - $ DIMAG( X / TEMP2 )*DIMAG( Y ).LT.ZERO )Y = -Y + IF( REAL( X / TEMP2 )*REAL( Y )+ + $ AIMAG( X / TEMP2 )*AIMAG( Y ).LT.ZERO )Y = -Y END IF SHIFT = SHIFT - CTEMP*CLADIV( CTEMP, ( X+Y ) ) END IF From 2056ffc227d85c5a72622baae26427493c5b0bbc Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 29 Jan 2021 13:51:43 -0600 Subject: [PATCH 091/681] Optimize cscal function for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. 
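Concretely, each vectorized iteration applies the standard complex scaling x[k] := x[k] * (alpha_r + i*alpha_i) to interleaved (real, imaginary) single-precision pairs: the kernel broadcasts alpha_r, multiplies, permutes each vector to swap the real and imaginary lanes, multiplies the permuted copy by (-alpha_i, alpha_i), and adds the two products. A minimal scalar C sketch of the computation being vectorized (illustrative only; the helper name cscal_ref is not part of the patch):

    /* Reference sketch, not the optimized kernel: scales n complex
       floats stored as interleaved (real, imaginary) pairs in x. */
    static void cscal_ref(long n, float *x, float alpha_r, float alpha_i)
    {
        for (long k = 0; k < n; k++) {
            float r  = x[2 * k];
            float im = x[2 * k + 1];
            x[2 * k]     = r * alpha_r - im * alpha_i; /* real part      */
            x[2 * k + 1] = r * alpha_i + im * alpha_r; /* imaginary part */
        }
    }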
--- kernel/power/cscal_microk_power10.c | 176 ++++++++++++++++++++++++++++ kernel/power/zscal.c | 12 +- 2 files changed, 187 insertions(+), 1 deletion(-) create mode 100644 kernel/power/cscal_microk_power10.c diff --git a/kernel/power/cscal_microk_power10.c b/kernel/power/cscal_microk_power10.c new file mode 100644 index 000000000..70b50809e --- /dev/null +++ b/kernel/power/cscal_microk_power10.c @@ -0,0 +1,176 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) +{ + __vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i}; + __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; + __asm__ + ( + "dcbt 0, %2 \n\t" + "xscvdpspn 32, %x3 \n\t" + "xxspltw 32, 32, 0 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addic. 
%1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmulsp 49, 41, 32 \n\t" + "xvmulsp 50, 42, 32 \n\t" + "xvmulsp 51, 43, 32 \n\t" + "xvmulsp 52, 44, 32 \n\t" + "xvmulsp 53, 45, 32 \n\t" + "xvmulsp 54, 46, 32 \n\t" + "xvmulsp 55, 47, 32 \n\t" + + "xxperm 34, 40, %x5 \n\t" + "xxperm 35, 41, %x5 \n\t" + "xxperm 36, 42, %x5 \n\t" + "xxperm 37, 43, %x5 \n\t" + "xxperm 38, 44, %x5 \n\t" + "xxperm 39, 45, %x5 \n\t" + "xxperm 56, 46, %x5 \n\t" + "xxperm 57, 47, %x5 \n\t" + + "xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmulsp 35, 35, %x4 \n\t" + + "lxvp 40, 128(%2) \n\t" + + "xvmulsp 36, 36, %x4 \n\t" + "xvmulsp 37, 37, %x4 \n\t" + + "lxvp 42, 160(%2) \n\t" + + "xvmulsp 38, 38, %x4 \n\t" + "xvmulsp 39, 39, %x4 \n\t" + + "lxvp 44, 192(%2) \n\t" + + "xvmulsp 56, 56, %x4 \n\t" + "xvmulsp 57, 57, %x4 \n\t" + + "lxvp 46, 224(%2) \n\t" + + "xvaddsp 48, 48, 34 \n\t" + "xvaddsp 49, 49, 35 \n\t" + "xvaddsp 50, 50, 36 \n\t" + "xvaddsp 51, 51, 37 \n\t" + + "stxvp 48, 0(%2) \n\t" + + "xvaddsp 52, 52, 38 \n\t" + "xvaddsp 53, 53, 39 \n\t" + + "stxvp 50, 32(%2) \n\t" + + "xvaddsp 54, 54, 56 \n\t" + "xvaddsp 55, 55, 57 \n\t" + + "stxvp 52, 64(%2) \n\t" + "stxvp 54, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmulsp 49, 41, 32 \n\t" + "xvmulsp 50, 42, 32 \n\t" + "xvmulsp 51, 43, 32 \n\t" + "xvmulsp 52, 44, 32 \n\t" + "xvmulsp 53, 45, 32 \n\t" + "xvmulsp 54, 46, 32 \n\t" + "xvmulsp 55, 47, 32 \n\t" + + "xxperm 34, 40, %x5 \n\t" + "xxperm 35, 41, %x5 \n\t" + "xxperm 36, 42, %x5 \n\t" + "xxperm 37, 43, %x5 \n\t" + "xxperm 38, 44, %x5 \n\t" + "xxperm 39, 45, %x5 \n\t" + "xxperm 56, 46, %x5 \n\t" + "xxperm 57, 47, %x5 \n\t" + + + "xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmulsp 35, 35, %x4 \n\t" + "xvmulsp 36, 36, %x4 \n\t" + "xvmulsp 37, 37, %x4 \n\t" + "xvmulsp 38, 38, %x4 \n\t" + "xvmulsp 39, 39, %x4 \n\t" + "xvmulsp 56, 56, %x4 \n\t" + "xvmulsp 57, 57, %x4 \n\t" + + "xvaddsp 48, 48, 34 \n\t" + "xvaddsp 49, 49, 35 \n\t" + "xvaddsp 50, 50, 36 \n\t" + "xvaddsp 51, 51, 37 \n\t" + + "stxvp 48, 0(%2) \n\t" + + "xvaddsp 52, 52, 38 \n\t" + "xvaddsp 53, 53, 39 \n\t" + + "stxvp 50, 32(%2) \n\t" + + "xvaddsp 54, 54, 56 \n\t" + "xvaddsp 55, 55, 57 \n\t" + + "stxvp 52, 64(%2) \n\t" + "stxvp 54, 96(%2) \n\t" + + "#n=%1 x=%0=%2 alpha=(%3,%4)\n" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + "f" (alpha_r), // 3 + "wa" (t0), // 4 + "wa" (mask) // 5 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57" + ); +} diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 5526f4d67..31b3682b9 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -38,11 +38,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif +#elif defined(POWER10) +#if defined(DOUBLE) +#include "zscal_microk_power8.c" +#else +#include "cscal_microk_power10.c" +#endif #endif #endif @@ -145,7 +151,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F { +#if defined(DOUBLE) n1 = n & -8; +#else + n1 = n & -16; +#endif if ( n1 > 0 ) { zscal_kernel_8(n1, x, da_r, da_i); From bd906e341005fc0bf460ebcf3f6d31433ecef0be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 30 Jan 2021 16:46:25 +0100 Subject: [PATCH 092/681] fix copy-paste error in build rules for cblas_crotg and cblas_zrotg --- interface/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index fab403c82..3252601d2 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1634,10 +1634,10 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) -cblas_crotg.$(SUFFIX) crotg.$(PSUFFIX): zrotg.c +cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) -cblas_zrotg.$(SUFFIX) zrotg.$(PSUFFIX): zrotg.c +cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c From 6fa9860dbe39757cd42c472dc0f2e00a552355b7 Mon Sep 17 00:00:00 2001 From: xoviat <49173759+xoviat@users.noreply.github.com> Date: Sat, 30 Jan 2021 21:28:12 -0600 Subject: [PATCH 093/681] appveyor: cleanup and add openmp run --- appveyor.yml | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 1936059d5..1db95d220 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -30,10 +30,11 @@ environment: CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 matrix: - COMPILER: clang-cl - WITH_FORTRAN: yes + WITH_FORTRAN: ON + USE_OPENMP: ON - COMPILER: clang-cl DYNAMIC_ARCH: ON - WITH_FORTRAN: no + WITH_FORTRAN: OFF - COMPILER: cl - COMPILER: MinGW64-gcc-7.2.0-mingw DYNAMIC_ARCH: OFF @@ -47,12 +48,7 @@ environment: install: - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force - - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake - - - if [%WITH_FORTRAN%]==[no] conda install --yes --quiet ninja - - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet -c isuruf kitware-ninja - - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet flang - + - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1 - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 - if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%" - if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%" @@ -68,8 +64,9 @@ before_build: - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. 
- - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. + - if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. + - if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. + - if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. build_script: From eb1d2344f7809c63a9cb5ae4ce05e255b15ec2c7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 1 Feb 2021 19:45:25 +0100 Subject: [PATCH 094/681] Fix compiler version check for Intel Cooperlake support (clang-cl does not accept -dumpversion) --- cmake/system.cmake | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 66e95c6d3..1d4e62463 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -148,16 +148,20 @@ endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") if (DEFINED TARGET) if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) -# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.1 OR ${CMAKE_C_COMPILER_VERSION} VERSION_EQUAL 10.1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") else() set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() -# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") -# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") -# endif() + endif() endif() if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") From 774b9f86534c74403cfb417cde906dd034cd707e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 1 Feb 2021 20:18:53 +0100 Subject: [PATCH 095/681] handle AppleClang in Cooperlake support condition --- cmake/system.cmake | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 1d4e62463..1336e19a2 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -4,6 +4,13 @@ ## set(NETLIB_LAPACK_DIR "${PROJECT_SOURCE_DIR}/lapack-netlib") +1 + +## + +2 + +## Author: Hank Anderson # System detection, via CMake. 
include("${PROJECT_SOURCE_DIR}/cmake/system_check.cmake") @@ -150,12 +157,12 @@ if (DEFINED TARGET) if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.1 OR ${CMAKE_C_COMPILER_VERSION} VERSION_EQUAL 10.1) + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") else() set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() - elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") else() From 99ac042702da18bcf7627c410c4d7eb36213bd6f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 1 Feb 2021 21:02:53 +0100 Subject: [PATCH 096/681] remove spurious lines (probably editor malfunction) --- cmake/system.cmake | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 1336e19a2..a5996b9be 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -4,13 +4,6 @@ ## set(NETLIB_LAPACK_DIR "${PROJECT_SOURCE_DIR}/lapack-netlib") -1 - -## - -2 - -## Author: Hank Anderson # System detection, via CMake. include("${PROJECT_SOURCE_DIR}/cmake/system_check.cmake") From 95e19e2e231b01f104e0acc68bfd7589c39c4213 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Feb 2021 10:53:46 +0100 Subject: [PATCH 097/681] fix case in compiler name check Co-authored-by: xoviat <49173759+xoviat@users.noreply.github.com> --- cmake/system.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index a5996b9be..d52af3aa1 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -155,7 +155,7 @@ if (DEFINED TARGET) else() set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() - elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") else() From d7a77091a3468354ac57ee76a682c04ac9c5ad03 Mon Sep 17 00:00:00 2001 From: Jake Arkinstall <65358059+jake-arkinstall@users.noreply.github.com> Date: Wed, 10 Feb 2021 12:11:17 +0000 Subject: [PATCH 098/681] Addressed issue #3100, removing an unnecessary write to the include directory --- cmake/lapacke.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index f10905c4d..54a583887 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -2499,6 +2499,5 @@ foreach (Utils_FILE ${Utils_SRC}) endforeach () set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include") -configure_file("${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h" COPYONLY) include_directories(${lapacke_include_dir}) set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") From ece3ce581e3ec530eaccfe7f284c52e115ec7aa9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 10 Feb 2021 14:22:59 +0100 Subject: [PATCH 099/681] Strip parenthesized (pkgversion) data from GCC version string to avoid misinterpretation --- f_check 
| 1 + 1 file changed, 1 insertion(+) diff --git a/f_check b/f_check index e9aca4ff9..ffe9c6b46 100644 --- a/f_check +++ b/f_check @@ -75,6 +75,7 @@ if ($compiler eq "") { } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { + $data =~ s/\(+.*?\)+//g; $data =~ /(\d+)\.(\d+).(\d+)/; $major = $1; $minor = $2; From db348dcff2b3267e40de634bda9173370dd6b001 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 11 Feb 2021 09:23:05 +0100 Subject: [PATCH 100/681] Enable optimized srot/drot kernels from Haswell --- kernel/x86_64/KERNEL.ZEN | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN index 7bb308fea..a66394be3 100644 --- a/kernel/x86_64/KERNEL.ZEN +++ b/kernel/x86_64/KERNEL.ZEN @@ -97,3 +97,5 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c +SROTKERNEL = srot.c +DROTKERNEL = drot.c From 46509953a9dd1907f05465e2212d4477cb26b14c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 11 Feb 2021 09:24:16 +0100 Subject: [PATCH 101/681] Use Haswell optimizations for Zen as well --- kernel/x86_64/drot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c index 66e9ff907..ab5048bd1 100644 --- a/kernel/x86_64/drot.c +++ b/kernel/x86_64/drot.c @@ -2,7 +2,7 @@ #if defined(SKYLAKEX) #include "drot_microk_skylakex-2.c" -#elif defined(HASWELL) +#elif defined(HASWELL) || defined(ZEN) #include "drot_microk_haswell-2.c" #endif From 950c047b49c159fd8a8804ecae351cccc2865d02 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 11 Feb 2021 09:24:51 +0100 Subject: [PATCH 102/681] Use Haswell optimizations for Zen as well --- kernel/x86_64/srot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c index 3264d251a..587cf8e40 100644 --- a/kernel/x86_64/srot.c +++ b/kernel/x86_64/srot.c @@ -2,7 +2,7 @@ #if defined(SKYLAKEX) #include "srot_microk_skylakex-2.c" -#elif defined(HASWELL) +#elif defined(HASWELL) || defined(ZEN) #include "srot_microk_haswell-2.c" #endif From ce7ddd8921fa784079face668eab93c778623cac Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 11 Feb 2021 09:25:36 +0100 Subject: [PATCH 103/681] Use Haswell optimizations for Zen as well --- kernel/x86_64/sasum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c index d0cea9bee..a021741c7 100644 --- a/kernel/x86_64/sasum.c +++ b/kernel/x86_64/sasum.c @@ -11,7 +11,7 @@ #if defined(SKYLAKEX) #include "sasum_microk_skylakex-2.c" -#elif defined(HASWELL) +#elif defined(HASWELL) || defined(ZEN) #include "sasum_microk_haswell-2.c" #endif From 47691c031fa128ed65f630dd009a943465a2d92f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 11 Feb 2021 09:26:15 +0100 Subject: [PATCH 104/681] Use Haswell optimizations for Zen as well --- kernel/x86_64/dasum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c index 534f257d2..8af9e798b 100644 --- a/kernel/x86_64/dasum.c +++ b/kernel/x86_64/dasum.c @@ -6,7 +6,7 @@ #if defined(SKYLAKEX) #include "dasum_microk_skylakex-2.c" -#elif defined(HASWELL) +#elif defined(HASWELL) || defined(ZEN) #include "dasum_microk_haswell-2.c" #endif From ae53e3e23343739e61439e39cbcac1f0d684b134 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 11 Feb 2021 20:16:27 +0100 Subject: [PATCH 105/681] Recognize Intel Tiger Lake as SkylakeX --- cpuid_x86.c 
| 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index aca37da45..44704fcd9 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1418,6 +1418,15 @@ int get_cpuname(void){ case 9: case 8: switch (model) { + case 12: // Tiger Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; case 14: // Kaby Lake and refreshes if(support_avx2()) return CPUTYPE_HASWELL; @@ -2124,6 +2133,16 @@ int get_coretype(void){ break; case 9: case 8: + if (model == 12) { // Tiger Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } if (model == 14) { // Kaby Lake if(support_avx()) #ifndef NO_AVX2 From e4e5042e3859583387eb43c143c57bab671002a9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 11 Feb 2021 20:17:11 +0100 Subject: [PATCH 106/681] Recognize Intel Tiger Lake as SkylakeX --- driver/others/dynamic.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 7845d6951..158e1b3da 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -644,6 +644,21 @@ static gotoblas_t *get_coretype(void){ return NULL; case 9: case 8: + if (model == 12) { // Tiger Lake + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } if (model == 14 ) { // Kaby Lake, Coffee Lake if(support_avx2()) return &gotoblas_HASWELL; From 63fa6c832ea142ecac3c61e2ce542949ae8ccdcb Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 11 Feb 2021 21:28:03 -0600 Subject: [PATCH 107/681] Fix build issue on POWER8 with DYNAMIC_ARCH Running make DYNAMIC_ARCH=1 on POWER 8 BE with gcc10.2 version, gives the following error due to the difference in UNROLL_M/N. 'No rule to make target 'dgemm_incopy_POWER10.o', needed by kernel' --- param.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/param.h b/param.h index 6a790ab61..9ba25de6a 100644 --- a/param.h +++ b/param.h @@ -2443,8 +2443,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define DGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_N 4 +#else #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 8 +#endif #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 From b0bded3f2f3da67a1e8ac1ab10a04a73838a13cd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 18 Feb 2021 11:14:05 +0100 Subject: [PATCH 108/681] Fix get_num_procs() in the USE_TLS branch for non-glibc systems --- driver/others/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 0d4b2ff31..75203a7b0 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -222,11 +222,11 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; + +#if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; int ret; - -#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) From dbbf92c1d120c22c0ce7d5b8e1d7ec35f9bace34 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Thu, 18 Feb 2021 13:46:50 -0500 Subject: [PATCH 109/681] Fix race in blas_thread_shutdown. blas_server_avail was read without holding server_lock. If multiple threads call blas_thread_shutdown simultaneously, for example, by calling fork(), then they can attempt to shut down multiple times. This can lead to a segmentation fault. --- driver/others/blas_server.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 5e0943c2e..fa07a1ea4 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -1024,38 +1024,39 @@ int BLASFUNC(blas_thread_shutdown)(void){ int i; - if (!blas_server_avail) return 0; - LOCK_COMMAND(&server_lock); - for (i = 0; i < blas_num_threads - 1; i++) { + if (blas_server_avail) { + for (i = 0; i < blas_num_threads - 1; i++) { - pthread_mutex_lock (&thread_status[i].lock); - atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1); - thread_status[i].status = THREAD_STATUS_WAKEUP; - pthread_cond_signal (&thread_status[i].wakeup); + pthread_mutex_lock (&thread_status[i].lock); - pthread_mutex_unlock(&thread_status[i].lock); + atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1); + thread_status[i].status = THREAD_STATUS_WAKEUP; + pthread_cond_signal (&thread_status[i].wakeup); - } + pthread_mutex_unlock(&thread_status[i].lock); - for(i = 0; i < blas_num_threads - 1; i++){ - pthread_join(blas_threads[i], NULL); - } + } - for(i = 0; i < blas_num_threads - 1; i++){ - pthread_mutex_destroy(&thread_status[i].lock); - pthread_cond_destroy (&thread_status[i].wakeup); - } + for(i = 0; i < blas_num_threads - 1; i++){ + pthread_join(blas_threads[i], NULL); + } + + for(i = 0; i < blas_num_threads - 1; i++){ + pthread_mutex_destroy(&thread_status[i].lock); + pthread_cond_destroy (&thread_status[i].wakeup); + } #ifdef NEED_STACKATTR - pthread_attr_destory(&attr); + pthread_attr_destroy(&attr); #endif - blas_server_avail = 0; + blas_server_avail = 0; + } UNLOCK_COMMAND(&server_lock); return 0; From 1a3ad4b670e2d8b28ce8616202970c3b6359e407 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 22 Feb 2021 19:40:36 +0100 Subject: [PATCH 110/681] Fix signatures of the TLS-mode dll_callback and p_process_term functions for Win64 --- 
driver/others/memory.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 75203a7b0..63fa6a566 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1619,10 +1619,12 @@ static int on_process_term(void) #else #pragma data_seg(".CRT$XLB") #endif -static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; + #ifdef _WIN64 +static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma const_seg() #else +static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma data_seg() #endif @@ -1631,10 +1633,12 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI #else #pragma data_seg(".CRT$XTU") #endif -static int(*p_process_term)(void) = on_process_term; + #ifdef _WIN64 +static const int(*p_process_term)(void) = on_process_term; #pragma const_seg() #else +static int(*p_process_term)(void) = on_process_term; #pragma data_seg() #endif #endif From b1eed27a542019a102f97647aa77a219a5124783 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 22 Feb 2021 21:35:42 +0100 Subject: [PATCH 111/681] Replace naive omatcopy_rt with 4x4 blocked implementation as suggested by MigMuc in issue 2532 --- kernel/arm/omatcopy_rt.c | 224 ++++++++++++++++++++++++++++++++++----- 1 file changed, 198 insertions(+), 26 deletions(-) diff --git a/kernel/arm/omatcopy_rt.c b/kernel/arm/omatcopy_rt.c index 9d58350d5..d6a3df619 100644 --- a/kernel/arm/omatcopy_rt.c +++ b/kernel/arm/omatcopy_rt.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project +Copyright (c) 2021, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,36 +27,208 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -/***************************************************** - * 2014/06/09 Saar - * - * Order rowMajor - * Trans - * -******************************************************/ - int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) -{ - BLASLONG i,j; - FLOAT *aptr,*bptr; - if ( rows <= 0 ) return(0); - if ( cols <= 0 ) return(0); + BLASLONG i, j; + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; - aptr = a; + if (rows <= 0) return 0; + if (cols <= 0) return 0; - for ( i=0; i> 2); + if (i > 0) { + do { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 4; + + j = (cols >> 2); + if (j > 0) { + do { + /* Column 1 of MAT_B */ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + /* Column 2 of MAT_B */ + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + /* Column 3 of MAT_B */ + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + *(b_offset3 + 2) = *(a_offset3 + 2)*alpha; + *(b_offset4 + 2) = *(a_offset3 + 3)*alpha; + + /* Column 4 of MAT_B */ + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + *(b_offset3 + 3) = *(a_offset4 + 2)*alpha; + *(b_offset4 + 3) = *(a_offset4 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } // if(j > 0) + + + if (cols & 2) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + b_offset1 += ldb*2; + + } + + if (cols & 1) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + } + + i--; + } while (i > 0); + } -} + if (rows & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 2; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + 
b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + b_offset1 += ldb*2; + + } + + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + } + } // if (rows & 2) + + + if (rows & 1) { + a_offset1 = a_offset; + a_offset += lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + a_offset1 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + a_offset1 += 2; + b_offset1 += ldb * 2; + } + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + } + } + + return 0; +} From 0a4546b742104580cee77fe8f01d9cbb20d4161b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 23 Feb 2021 13:14:35 +0100 Subject: [PATCH 112/681] Typo fix --- kernel/arm/omatcopy_rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm/omatcopy_rt.c b/kernel/arm/omatcopy_rt.c index d6a3df619..3d90ac6e4 100644 --- a/kernel/arm/omatcopy_rt.c +++ b/kernel/arm/omatcopy_rt.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) - +{ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; From cceeee7806a6647ef06044fd74c4349565eeb1f5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 Feb 2021 09:00:54 +0100 Subject: [PATCH 113/681] Add optimized omatcopy_rt --- kernel/x86_64/omatcopy_rt.c | 371 ++++++++++++++++++++++++++++++++++++ 1 file changed, 371 insertions(+) create mode 100644 kernel/x86_64/omatcopy_rt.c diff --git a/kernel/x86_64/omatcopy_rt.c b/kernel/x86_64/omatcopy_rt.c new file mode 100644 index 000000000..ac25ea74b --- /dev/null +++ b/kernel/x86_64/omatcopy_rt.c @@ -0,0 +1,371 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#ifdef HAVE_AVX + +/* +r: %0 = src, %1 = dst, %2 = src_ld, %3 = dst_ld, %4 = dst_tmp */ +/* m: %5 = num_rows, %6 = alpha */ +/* xmm15 = alpha */ +#define TRANS_4x4(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\ + "vunpcklps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t1_no"; vunpckhps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t2_no";"\ + "vunpcklps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t3_no"; vunpckhps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t4_no";"\ + "vunpcklpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a1_no"; vunpckhpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a2_no";"\ + "vunpcklpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a3_no"; vunpckhpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a4_no";" + +#define TRANS_4x8(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\ + "vunpcklps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t1_no"; vunpckhps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t2_no";"\ + "vunpcklps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t3_no"; vunpckhps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t4_no";"\ + "vunpcklpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a1_no"; vunpckhpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a2_no";"\ + "vunpcklpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a3_no"; vunpckhpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a4_no";" + +#define SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\ + "vmovups %%xmm"#b1_no",(%4); vmovups %%xmm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vmovups %%xmm"#b3_no",(%4); vmovups %%xmm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define SAVE_4x8(b1_no,b2_no,b3_no,b4_no) SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\ + "vextractf128 $1,%%ymm"#b1_no",(%4); vextractf128 $1,%%ymm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vextractf128 $1,%%ymm"#b3_no",(%4); vextractf128 $1,%%ymm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define COPY_4x16 "movq %1,%4; addq $16,%1;"\ + "vmulps (%0),%%ymm15,%%ymm0; vmulps 32(%0),%%ymm15,%%ymm4; vmulps (%0,%2,1),%%ymm15,%%ymm1; vmulps 32(%0,%2,1),%%ymm15,%%ymm5; leaq (%0,%2,2),%0;"\ + "vmulps (%0),%%ymm15,%%ymm2; vmulps 32(%0),%%ymm15,%%ymm6; vmulps (%0,%2,1),%%ymm15,%%ymm3; vmulps 32(%0,%2,1),%%ymm15,%%ymm7; leaq (%0,%2,2),%0;"\ + TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3)\ + TRANS_4x8(4,5,6,7,8,9,10,11) SAVE_4x8(4,5,6,7) + +#define COPY_4x8 "movq %1,%4; addq $16,%1;"\ + "vmulps (%0),%%ymm15,%%ymm0; vmulps (%0,%2,1),%%ymm15,%%ymm1; leaq (%0,%2,2),%0;"\ + "vmulps (%0),%%ymm15,%%ymm2; vmulps (%0,%2,1),%%ymm15,%%ymm3; leaq (%0,%2,2),%0;"\ + TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3) + +#define COPY_4x4 "movq %1,%4; addq $16,%1;"\ + "vmulps (%0),%%xmm15,%%xmm0; vmulps (%0,%2,1),%%xmm15,%%xmm1; leaq (%0,%2,2),%0;"\ + "vmulps (%0),%%xmm15,%%xmm2; vmulps (%0,%2,1),%%xmm15,%%xmm3; leaq (%0,%2,2),%0;"\ + TRANS_4x4(0,1,2,3,8,9,10,11) SAVE_4x4(0,1,2,3) + +#define COPY_4x2 \ + "vmovsd 
(%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ + "vmovsd (%0),%%xmm1; vmovhpd (%0,%2,1),%%xmm1,%%xmm1; vmulps %%xmm15,%%xmm1,%%xmm1; leaq (%0,%2,2),%0;"\ + "vpermilps $216,%%xmm0,%%xmm0; vpermilps $216,%%xmm1,%%xmm1; vunpcklpd %%xmm1,%%xmm0,%%xmm2; vunpckhpd %%xmm1,%%xmm0,%%xmm3;"\ + "vmovups %%xmm2,(%1); vmovups %%xmm3,(%1,%3,1); addq $16,%1;" + +#define COPY_4x1 \ + "vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ + "vinsertps $32,(%0),%%xmm0,%%xmm0; vinsertps $48,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ + "vmulps %%xmm15,%%xmm0,%%xmm0; vmovups %%xmm0,(%1); addq $16,%1;" + +#define SAVE_2x4(c1_no,c2_no,t1_no,t2_no) \ + "vunpcklps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t1_no"; vmulps %%xmm15,%%xmm"#t1_no",%%xmm"#t1_no";"\ + "vmovsd %%xmm"#t1_no",(%4); vmovhpd %%xmm"#t1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vunpckhps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t2_no"; vmulps %%xmm15,%%xmm"#t2_no",%%xmm"#t2_no";"\ + "vmovsd %%xmm"#t2_no",(%4); vmovhpd %%xmm"#t2_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define COPY_2x16 "movq %1,%4; addq $8,%1;"\ + "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm2; vmovups (%0,%2,1),%%ymm1; vmovups 32(%0,%2,1),%%ymm3; leaq (%0,%2,2),%0;"\ + "vextractf128 $1,%%ymm0,%%xmm4; vextractf128 $1,%%ymm2,%%xmm6; vextractf128 $1,%%ymm1,%%xmm5; vextractf128 $1,%%ymm3,%%xmm7;"\ + SAVE_2x4(0,1,8,9) SAVE_2x4(4,5,8,9) SAVE_2x4(2,3,8,9) SAVE_2x4(6,7,8,9) + +#define COPY_2x8 "movq %1,%4; addq $8,%1;"\ + "vmovups (%0),%%ymm0; vmovups (%0,%2,1),%%ymm1; leaq (%0,%2,2),%0;"\ + "vextractf128 $1,%%ymm0,%%xmm2; vextractf128 $1,%%ymm1,%%xmm3;"\ + SAVE_2x4(0,1,4,5) SAVE_2x4(2,3,4,5) + +#define COPY_2x4 "movq %1,%4; addq $8,%1;"\ + "vmovups (%0),%%xmm0; vmovups (%0,%2,1),%%xmm1; leaq (%0,%2,2),%0;"\ + SAVE_2x4(0,1,4,5) + +#define COPY_2x2 \ + "vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vpermilps $216,%%xmm0,%%xmm0;"\ + "vmovsd %%xmm0,(%1); vmovhpd %%xmm0,(%1,%3,1); addq $8,%1;" + +#define COPY_2x1 \ + "vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vmovsd %%xmm0,(%1); addq $8,%1;" + +#define SAVE_1x4(c1_no)\ + "vmulps %%xmm15,%%xmm"#c1_no",%%xmm"#c1_no"; vmovss %%xmm"#c1_no",(%4); vextractps $1,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vextractps $2,%%xmm"#c1_no",(%4); vextractps $3,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define COPY_1x16 "movq %1,%4; addq $4,%1;"\ + "vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2)\ + "vmovups 32(%0),%%xmm1;" SAVE_1x4(1) "vmovups 48(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;" + +#define COPY_1x8 "movq %1,%4; addq $4,%1;"\ + "vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;" + +#define COPY_1x4 "movq %1,%4; addq $4,%1; vmovups (%0),%%xmm1;" SAVE_1x4(1) "addq %2,%0;" + +#define COPY_1x2 "vmovsd (%0),%%xmm1; addq %2,%0; vmulps %%xmm15,%%xmm1,%%xmm1; vmovss %%xmm1,(%1); vextractps $1,%%xmm1,(%1,%3,1); addq $4,%1;" + +#define COPY_1x1 "vmulss (%0),%%xmm15,%%xmm1; vmovss %%xmm1,(%1); addq %2,%0; addq $4,%1;" + +#define COMPUTE(ndim){\ + src = src_base; dst = dst_base;\ + __asm__ __volatile__(\ + "vbroadcastss %6,%%ymm15; movq %5,%%r11; cmpq $4,%%r11; jb "#ndim"32f;"\ + #ndim"31:\n\t"\ + COPY_4x##ndim "subq $4,%%r11; cmpq $4,%%r11; jnb "#ndim"31b;"\ + #ndim"32:\n\t"\ + "cmpq $2,%%r11; jb "#ndim"33f;"\ + COPY_2x##ndim "subq $2,%%r11;"\ + #ndim"33:\n\t"\ + "testq %%r11,%%r11; jz "#ndim"34f;"\ + COPY_1x##ndim 
"subq $1,%%r11;"\ + #ndim"34:\n\t"\ + :"+r"(src),"+r"(dst),"+r"(src_ld_bytes),"+r"(dst_ld_bytes),"+r"(dst_tmp):"m"(num_rows),"m"(ALPHA):"r11","cc","memory"\ + ,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ +} +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){ + float *src, *dst, *dst_tmp, *src_base, *dst_base; + uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_rows = 0; + BLASLONG cols_left, rows_done; float ALPHA = alpha; + if(ALPHA==0.0){ + dst_base = b; + for(cols_left=cols;cols_left>0;cols_left--) {memset(dst_base,0,rows*sizeof(float)); dst_base += ldb;} + return 0; + } + for(rows_done=0;rows_done ROWS_OF_BLOCK) num_rows = ROWS_OF_BLOCK; + cols_left = cols; src_base = a + (int64_t)lda * (int64_t)rows_done; dst_base = b + rows_done; + if(ldb%1024>3 && ldb%1024<1021) for(;cols_left>15;cols_left-=16){COMPUTE(16) src_base += 16; dst_base += 16 * ldb;} + for(;cols_left>7;cols_left-=8){COMPUTE(8) src_base += 8; dst_base += 8 * ldb;} + for(;cols_left>3;cols_left-=4){COMPUTE(4) src_base += 4; dst_base += 4 * ldb;} + for(;cols_left>1;cols_left-=2){COMPUTE(2) src_base += 2; dst_base += 2 * ldb;} + if(cols_left>0){COMPUTE(1) src_base ++; dst_base += ldb;} + } + return 0; +} + +#else + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) + + BLASLONG i, j; + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; + + if (rows <= 0) return 0; + if (cols <= 0) return 0; + + a_offset = a; + b_offset = b; + + i = (rows >> 2); + if (i > 0) { + do { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 4; + + j = (cols >> 2); + if (j > 0) { + do { + /* Column 1 of MAT_B */ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + /* Column 2 of MAT_B */ + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + /* Column 3 of MAT_B */ + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + *(b_offset3 + 2) = *(a_offset3 + 2)*alpha; + *(b_offset4 + 2) = *(a_offset3 + 3)*alpha; + + /* Column 4 of MAT_B */ + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + *(b_offset3 + 3) = *(a_offset4 + 2)*alpha; + *(b_offset4 + 3) = *(a_offset4 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } // if(j > 0) + + + if (cols & 2) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + 
*(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + b_offset1 += ldb*2; + + } + + if (cols & 1) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + } + + i--; + } while (i > 0); + } + + + if (rows & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 2; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + b_offset1 += ldb*2; + + } + + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + } + } // if (rows & 2) + + + if (rows & 1) { + a_offset1 = a_offset; + a_offset += lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + a_offset1 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + a_offset1 += 2; + b_offset1 += ldb * 2; + } + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + } + } + + return 0; +} + +#endif From 6f5667b4d4f395a5ccc8458abd053a35c7744f1d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 Feb 2021 09:03:41 +0100 Subject: [PATCH 114/681] Enable optimized S/D OMATCOPY_RT --- kernel/x86_64/KERNEL | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index b92f480e9..5da79cc3f 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -489,3 +489,6 @@ XGEMM3MKERNEL = xgemm3m_kernel_2x2.S SSUMKERNEL = ../arm/sum.c DSUMKERNEL = ../arm/sum.c + +SOMATCOPY_RT = omatcopy_rt.c +DOMATCOPY_RT = omatcopy_rt.c From 325b398e3cefa8d04e6cb7e949d047e41e417271 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 Feb 2021 09:13:12 +0100 Subject: [PATCH 115/681] Update omatcopy_rt.c --- kernel/x86_64/omatcopy_rt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/omatcopy_rt.c b/kernel/x86_64/omatcopy_rt.c index ac25ea74b..e8cef22c1 100644 --- a/kernel/x86_64/omatcopy_rt.c +++ b/kernel/x86_64/omatcopy_rt.c @@ -29,7 +29,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
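
The hunk below adds the ROWS_OF_BLOCK constant that the blocking loop in CNAME above already references, so until this patch the HAVE_AVX path could not compile. A minimal standalone sketch of the strip-mining pattern that constant controls (process_strips is a hypothetical name used only for illustration; in the kernel the strip body is the COMPUTE(16/8/4/2/1) dispatch):

    #define ROWS_OF_BLOCK 384   /* strip height introduced by this patch */

    /* Walk the rows in strips of at most ROWS_OF_BLOCK, so the working
     * set of each transpose-and-scale pass stays modest. */
    static void process_strips(long rows)
    {
        long rows_done, num_rows;
        for (rows_done = 0; rows_done < rows; rows_done += num_rows) {
            num_rows = rows - rows_done;            /* rows still left */
            if (num_rows > ROWS_OF_BLOCK)
                num_rows = ROWS_OF_BLOCK;           /* cap at one strip */
            /* per-strip work goes here (COMPUTE(...) in the kernel) */
        }
    }
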
#ifdef HAVE_AVX -/* +r: %0 = src, %1 = dst, %2 = src_ld, %3 = dst_ld, %4 = dst_tmp */ +#define ROWS_OF_BLOCK 384 + + /* +r: %0 = src, %1 = dst, %2 = src_ld, %3 = dst_ld, %4 = dst_tmp */ /* m: %5 = num_rows, %6 = alpha */ /* xmm15 = alpha */ #define TRANS_4x4(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\ From 292d1af1a04c60a24219dcb5db25de003171a97f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 Feb 2021 09:34:14 +0100 Subject: [PATCH 116/681] Update omatcopy_rt.c --- kernel/x86_64/omatcopy_rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/omatcopy_rt.c b/kernel/x86_64/omatcopy_rt.c index e8cef22c1..e695f00c5 100644 --- a/kernel/x86_64/omatcopy_rt.c +++ b/kernel/x86_64/omatcopy_rt.c @@ -166,7 +166,7 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLO #else int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) - +{ BLASLONG i, j; FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; From ec6b354c32e939605331e2081590815a86413ca8 Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Wed, 24 Feb 2021 14:07:20 +0100 Subject: [PATCH 117/681] use /usr/bin/env perl --- c_check | 2 +- exports/gensymbol | 2 +- f_check | 2 +- interface/create | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/c_check b/c_check index 9c8b1abac..e24943a29 100644 --- a/c_check +++ b/c_check @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl #use File::Basename; # use File::Temp qw(tempfile); diff --git a/exports/gensymbol b/exports/gensymbol index 857a17a9e..e7210a030 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Changelog # 2017/09/03 staticfloat diff --git a/f_check b/f_check index ffe9c6b46..d044f2547 100644 --- a/f_check +++ b/f_check @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); diff --git a/interface/create b/interface/create index b7be8ab6e..0b9cefa2b 100755 --- a/interface/create +++ b/interface/create @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl $count = 0; From 441c1161058feaa7119e84b86eb2d2a69929cc5c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 25 Feb 2021 13:47:34 +0100 Subject: [PATCH 118/681] fix undefined CC again --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index d044f2547..2664e2d4a 100644 --- a/f_check +++ b/f_check @@ -330,7 +330,7 @@ if ($link ne "") { $flags =~ s/\@/\,/g; $linker_L .= "-Wl,". $flags . " " ; } - if ($flags =~ /-lgomp/ && $CC =~ /clang/) { + if ($flags =~ /-lgomp/ && $ENV("CC") =~ /clang/) { $flags = "-lomp"; } From 736f0146c33cbfa8e256ba85a28c304e1af7c620 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 26 Feb 2021 04:18:04 +0100 Subject: [PATCH 119/681] Revert "Fix undefined CC in f_check (again)" --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index 2664e2d4a..d044f2547 100644 --- a/f_check +++ b/f_check @@ -330,7 +330,7 @@ if ($link ne "") { $flags =~ s/\@/\,/g; $linker_L .= "-Wl,". $flags . 
" " ; } - if ($flags =~ /-lgomp/ && $ENV("CC") =~ /clang/) { + if ($flags =~ /-lgomp/ && $CC =~ /clang/) { $flags = "-lomp"; } From 2d369bd916355e1b2c9612d962554948b6c5bb5f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 26 Feb 2021 09:09:43 +0100 Subject: [PATCH 120/681] fix undefined CC variable --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index d044f2547..fe947bf66 100644 --- a/f_check +++ b/f_check @@ -330,7 +330,7 @@ if ($link ne "") { $flags =~ s/\@/\,/g; $linker_L .= "-Wl,". $flags . " " ; } - if ($flags =~ /-lgomp/ && $CC =~ /clang/) { + if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) { $flags = "-lomp"; } From 0571c3187b12afd7e55dfdd482743bf3134edc82 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 26 Feb 2021 20:56:34 -0600 Subject: [PATCH 121/681] POWER10: Rename mma builtins The LLVM and GCC teams agreed to rename the __builtin_mma_assemble_pair and __builtin_mma_disassemble_pair built-ins to __builtin_vsx_assemble_pair and __builtin_vsx_disassemble_pair respectively. This patch is to make corresponding changes in dgemm kernel. Also made changes in inputs to those builtins to avoid some potential typecasting issues. Reference gcc commit id:77ef995c1fbcab76a2a69b9f4700bcfd005d8e62 --- kernel/power/dgemm_kernel_power10.c | 77 ++++++++++++++--------------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index b531799a6..e918e61c3 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -29,7 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); -typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !__has_builtin(__builtin_vsx_disassemble_pair) +#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair +#endif #ifdef TRMMKERNEL #define SAVE_ACC(ACC, J) \ @@ -186,8 +192,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; vec_t *rb = (vec_t *) & BO[0]; __vector_pair rowB, rowB1; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -200,8 +206,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, { rowA = (vec_t *) & AO[l << 3]; rb = (vec_t *) & BO[l << 3]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -242,8 +248,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + 
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -252,8 +258,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, { rowA = (vec_t *) & AO[l << 2]; rb = (vec_t *) & BO[l << 3]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -286,16 +292,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; rb = (vec_t *) & BO[l << 3]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); } @@ -398,7 +404,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); @@ -407,7 +413,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, { rowA = (vec_t *) & AO[l << 3]; rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); @@ -440,14 +446,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 2]; rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); } @@ -476,13 +482,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); for (l = 1; l < temp; 
l++) { rowA = (vec_t *) & AO[l << 1]; rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } SAVE_ACC (&acc0, 0); @@ -562,11 +568,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); vec_t *rowA = (vec_t *) & AO[0]; __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); @@ -574,9 +578,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); for (l = 1; l < temp; l++) { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); rowA = (vec_t *) & AO[l << 3]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -607,19 +610,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t result[4]; __vector_quad acc0, acc1; BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); vec_t *rowA = (vec_t *) & AO[0]; __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); for (l = 1; l < temp; l++) { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); rowA = (vec_t *) & AO[l << 2]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -646,18 +646,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t result[4]; __vector_quad acc0; BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); vec_t *rowA = (vec_t *) & AO[0]; __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); for (l = 1; l < temp; l++) { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); rowA = (vec_t *) & AO[l << 1]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } From 0197519dd71eb894e8ce02b78383242032b8c207 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Feb 2021 18:46:08 +0100 Subject: [PATCH 122/681] Delete cchkee.f --- lapack-netlib/TESTING/EIG/cchkee.f | 2507 ---------------------------- 1 file changed, 2507 deletions(-) delete mode 100644 lapack-netlib/TESTING/EIG/cchkee.f diff --git a/lapack-netlib/TESTING/EIG/cchkee.f b/lapack-netlib/TESTING/EIG/cchkee.f deleted file mode 100644 index f2a5f8d41..000000000 --- a/lapack-netlib/TESTING/EIG/cchkee.f +++ /dev/null @@ -1,2507 
+0,0 @@ -*> \brief \b CCHKEE -* -* =========== DOCUMENTATION =========== -* -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -* Definition: -* =========== -* -* PROGRAM CCHKEE -* -* -*> \par Purpose: -* ============= -*> -*> \verbatim -*> -*> CCHKEE tests the COMPLEX LAPACK subroutines for the matrix -*> eigenvalue problem. The test paths in this version are -*> -*> NEP (Nonsymmetric Eigenvalue Problem): -*> Test CGEHRD, CUNGHR, CHSEQR, CTREVC, CHSEIN, and CUNMHR -*> -*> SEP (Hermitian Eigenvalue Problem): -*> Test CHETRD, CUNGTR, CSTEQR, CSTERF, CSTEIN, CSTEDC, -*> and drivers CHEEV(X), CHBEV(X), CHPEV(X), -*> CHEEVD, CHBEVD, CHPEVD -*> -*> SVD (Singular Value Decomposition): -*> Test CGEBRD, CUNGBR, and CBDSQR -*> and the drivers CGESVD, CGESDD -*> -*> CEV (Nonsymmetric Eigenvalue/eigenvector Driver): -*> Test CGEEV -*> -*> CES (Nonsymmetric Schur form Driver): -*> Test CGEES -*> -*> CVX (Nonsymmetric Eigenvalue/eigenvector Expert Driver): -*> Test CGEEVX -*> -*> CSX (Nonsymmetric Schur form Expert Driver): -*> Test CGEESX -*> -*> CGG (Generalized Nonsymmetric Eigenvalue Problem): -*> Test CGGHD3, CGGBAL, CGGBAK, CHGEQZ, and CTGEVC -*> -*> CGS (Generalized Nonsymmetric Schur form Driver): -*> Test CGGES -*> -*> CGV (Generalized Nonsymmetric Eigenvalue/eigenvector Driver): -*> Test CGGEV -*> -*> CGX (Generalized Nonsymmetric Schur form Expert Driver): -*> Test CGGESX -*> -*> CXV (Generalized Nonsymmetric Eigenvalue/eigenvector Expert Driver): -*> Test CGGEVX -*> -*> CSG (Hermitian Generalized Eigenvalue Problem): -*> Test CHEGST, CHEGV, CHEGVD, CHEGVX, CHPGST, CHPGV, CHPGVD, -*> CHPGVX, CHBGST, CHBGV, CHBGVD, and CHBGVX -*> -*> CHB (Hermitian Band Eigenvalue Problem): -*> Test CHBTRD -*> -*> CBB (Band Singular Value Decomposition): -*> Test CGBBRD -*> -*> CEC (Eigencondition estimation): -*> Test CTRSYL, CTREXC, CTRSNA, and CTRSEN -*> -*> CBL (Balancing a general matrix) -*> Test CGEBAL -*> -*> CBK (Back transformation on a balanced matrix) -*> Test CGEBAK -*> -*> CGL (Balancing a matrix pair) -*> Test CGGBAL -*> -*> CGK (Back transformation on a matrix pair) -*> Test CGGBAK -*> -*> GLM (Generalized Linear Regression Model): -*> Tests CGGGLM -*> -*> GQR (Generalized QR and RQ factorizations): -*> Tests CGGQRF and CGGRQF -*> -*> GSV (Generalized Singular Value Decomposition): -*> Tests CGGSVD, CGGSVP, CTGSJA, CLAGS2, CLAPLL, and CLAPMT -*> -*> CSD (CS decomposition): -*> Tests CUNCSD -*> -*> LSE (Constrained Linear Least Squares): -*> Tests CGGLSE -*> -*> Each test path has a different set of inputs, but the data sets for -*> the driver routines xEV, xES, xVX, and xSX can be concatenated in a -*> single input file. The first line of input should contain one of the -*> 3-character path names in columns 1-3. The number of remaining lines -*> depends on what is found on the first line. -*> -*> The number of matrix types used in testing is often controllable from -*> the input file. 
The number of matrix types for each path, and the -*> test routine that describes them, is as follows: -*> -*> Path name(s) Types Test routine -*> -*> CHS or NEP 21 CCHKHS -*> CST or SEP 21 CCHKST (routines) -*> 18 CDRVST (drivers) -*> CBD or SVD 16 CCHKBD (routines) -*> 5 CDRVBD (drivers) -*> CEV 21 CDRVEV -*> CES 21 CDRVES -*> CVX 21 CDRVVX -*> CSX 21 CDRVSX -*> CGG 26 CCHKGG (routines) -*> CGS 26 CDRGES -*> CGX 5 CDRGSX -*> CGV 26 CDRGEV -*> CXV 2 CDRGVX -*> CSG 21 CDRVSG -*> CHB 15 CCHKHB -*> CBB 15 CCHKBB -*> CEC - CCHKEC -*> CBL - CCHKBL -*> CBK - CCHKBK -*> CGL - CCHKGL -*> CGK - CCHKGK -*> GLM 8 CCKGLM -*> GQR 8 CCKGQR -*> GSV 8 CCKGSV -*> CSD 3 CCKCSD -*> LSE 8 CCKLSE -*> -*>----------------------------------------------------------------------- -*> -*> NEP input file: -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix dimension N. -*> -*> line 4: NPARMS, INTEGER -*> Number of values of the parameters NB, NBMIN, NX, NS, and -*> MAXB. -*> -*> line 5: NBVAL, INTEGER array, dimension (NPARMS) -*> The values for the blocksize NB. -*> -*> line 6: NBMIN, INTEGER array, dimension (NPARMS) -*> The values for the minimum blocksize NBMIN. -*> -*> line 7: NXVAL, INTEGER array, dimension (NPARMS) -*> The values for the crossover point NX. -*> -*> line 8: INMIN, INTEGER array, dimension (NPARMS) -*> LAHQR vs TTQRE crossover point, >= 11 -*> -*> line 9: INWIN, INTEGER array, dimension (NPARMS) -*> recommended deflation window size -*> -*> line 10: INIBL, INTEGER array, dimension (NPARMS) -*> nibble crossover point -*> -*> line 11: ISHFTS, INTEGER array, dimension (NPARMS) -*> number of simultaneous shifts) -*> -*> line 12: IACC22, INTEGER array, dimension (NPARMS) -*> select structured matrix multiply: 0, 1 or 2) -*> -*> line 13: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. To have all of the test -*> ratios printed, use THRESH = 0.0 . -*> -*> line 14: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 14 was 2: -*> -*> line 15: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 15-EOF: The remaining lines occur in sets of 1 or 2 and allow -*> the user to specify the matrix types. Each line contains -*> a 3-character path name in columns 1-3, and the number -*> of matrix types must be the first nonblank item in columns -*> 4-80. If the number of matrix types is at least 1 but is -*> less than the maximum number of possible types, a second -*> line will be read to get the numbers of the matrix types to -*> be used. For example, -*> NEP 21 -*> requests all of the matrix types for the nonsymmetric -*> eigenvalue problem, while -*> NEP 4 -*> 9 10 11 12 -*> requests only matrices of type 9, 10, 11, and 12. -*> -*> The valid 3-character path names are 'NEP' or 'CHS' for the -*> nonsymmetric eigenvalue routines. -*> -*>----------------------------------------------------------------------- -*> -*> SEP or CSG input file: -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix dimension N. 
-*> -*> line 4: NPARMS, INTEGER -*> Number of values of the parameters NB, NBMIN, and NX. -*> -*> line 5: NBVAL, INTEGER array, dimension (NPARMS) -*> The values for the blocksize NB. -*> -*> line 6: NBMIN, INTEGER array, dimension (NPARMS) -*> The values for the minimum blocksize NBMIN. -*> -*> line 7: NXVAL, INTEGER array, dimension (NPARMS) -*> The values for the crossover point NX. -*> -*> line 8: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 9: TSTCHK, LOGICAL -*> Flag indicating whether or not to test the LAPACK routines. -*> -*> line 10: TSTDRV, LOGICAL -*> Flag indicating whether or not to test the driver routines. -*> -*> line 11: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 12: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 12 was 2: -*> -*> line 13: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 13-EOF: Lines specifying matrix types, as for NEP. -*> The valid 3-character path names are 'SEP' or 'CST' for the -*> Hermitian eigenvalue routines and driver routines, and -*> 'CSG' for the routines for the Hermitian generalized -*> eigenvalue problem. -*> -*>----------------------------------------------------------------------- -*> -*> SVD input file: -*> -*> line 2: NN, INTEGER -*> Number of values of M and N. -*> -*> line 3: MVAL, INTEGER array, dimension (NN) -*> The values for the matrix row dimension M. -*> -*> line 4: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix column dimension N. -*> -*> line 5: NPARMS, INTEGER -*> Number of values of the parameter NB, NBMIN, NX, and NRHS. -*> -*> line 6: NBVAL, INTEGER array, dimension (NPARMS) -*> The values for the blocksize NB. -*> -*> line 7: NBMIN, INTEGER array, dimension (NPARMS) -*> The values for the minimum blocksize NBMIN. -*> -*> line 8: NXVAL, INTEGER array, dimension (NPARMS) -*> The values for the crossover point NX. -*> -*> line 9: NSVAL, INTEGER array, dimension (NPARMS) -*> The values for the number of right hand sides NRHS. -*> -*> line 10: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 11: TSTCHK, LOGICAL -*> Flag indicating whether or not to test the LAPACK routines. -*> -*> line 12: TSTDRV, LOGICAL -*> Flag indicating whether or not to test the driver routines. -*> -*> line 13: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 14: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 14 was 2: -*> -*> line 15: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 15-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path names are 'SVD' or 'CBD' for both the -*> SVD routines and the SVD driver routines. 
-*> -*>----------------------------------------------------------------------- -*> -*> CEV and CES data files: -*> -*> line 1: 'CEV' or 'CES' in columns 1 to 3. -*> -*> line 2: NSIZES, INTEGER -*> Number of sizes of matrices to use. Should be at least 0 -*> and at most 20. If NSIZES = 0, no testing is done -*> (although the remaining 3 lines are still read). -*> -*> line 3: NN, INTEGER array, dimension(NSIZES) -*> Dimensions of matrices to be tested. -*> -*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> These integer parameters determine how blocking is done -*> (see ILAENV for details) -*> NB : block size -*> NBMIN : minimum block size -*> NX : minimum dimension for blocking -*> NS : number of shifts in xHSEQR -*> NBCOL : minimum column dimension for blocking -*> -*> line 5: THRESH, REAL -*> The test threshold against which computed residuals are -*> compared. Should generally be in the range from 10. to 20. -*> If it is 0., all test case data will be printed. -*> -*> line 6: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 6 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 8 and following: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'CEV' to test CGEEV, or -*> 'CES' to test CGEES. -*> -*>----------------------------------------------------------------------- -*> -*> The CVX data has two parts. The first part is identical to CEV, -*> and the second part consists of test matrices with precomputed -*> solutions. -*> -*> line 1: 'CVX' in columns 1-3. -*> -*> line 2: NSIZES, INTEGER -*> If NSIZES = 0, no testing of randomly generated examples -*> is done, but any precomputed examples are tested. -*> -*> line 3: NN, INTEGER array, dimension(NSIZES) -*> -*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> -*> line 5: THRESH, REAL -*> -*> line 6: NEWSD, INTEGER -*> -*> If line 6 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> -*> lines 8 and following: The first line contains 'CVX' in columns 1-3 -*> followed by the number of matrix types, possibly with -*> a second line to specify certain matrix types. -*> If the number of matrix types = 0, no testing of randomly -*> generated examples is done, but any precomputed examples -*> are tested. -*> -*> remaining lines : Each matrix is stored on 1+N+N**2 lines, where N is -*> its dimension. The first line contains the dimension N and -*> ISRT (two integers). ISRT indicates whether the last N lines -*> are sorted by increasing real part of the eigenvalue -*> (ISRT=0) or by increasing imaginary part (ISRT=1). The next -*> N**2 lines contain the matrix rowwise, one entry per line. -*> The last N lines correspond to each eigenvalue. Each of -*> these last N lines contains 4 real values: the real part of -*> the eigenvalues, the imaginary part of the eigenvalue, the -*> reciprocal condition number of the eigenvalues, and the -*> reciprocal condition number of the vector eigenvector. The -*> end of data is indicated by dimension N=0. Even if no data -*> is to be tested, there must be at least one line containing -*> N=0. -*> -*>----------------------------------------------------------------------- -*> -*> The CSX data is like CVX. 
The first part is identical to CEV, and the -*> second part consists of test matrices with precomputed solutions. -*> -*> line 1: 'CSX' in columns 1-3. -*> -*> line 2: NSIZES, INTEGER -*> If NSIZES = 0, no testing of randomly generated examples -*> is done, but any precomputed examples are tested. -*> -*> line 3: NN, INTEGER array, dimension(NSIZES) -*> -*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> -*> line 5: THRESH, REAL -*> -*> line 6: NEWSD, INTEGER -*> -*> If line 6 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> -*> lines 8 and following: The first line contains 'CSX' in columns 1-3 -*> followed by the number of matrix types, possibly with -*> a second line to specify certain matrix types. -*> If the number of matrix types = 0, no testing of randomly -*> generated examples is done, but any precomputed examples -*> are tested. -*> -*> remaining lines : Each matrix is stored on 3+N**2 lines, where N is -*> its dimension. The first line contains the dimension N, the -*> dimension M of an invariant subspace, and ISRT. The second -*> line contains M integers, identifying the eigenvalues in the -*> invariant subspace (by their position in a list of -*> eigenvalues ordered by increasing real part (if ISRT=0) or -*> by increasing imaginary part (if ISRT=1)). The next N**2 -*> lines contain the matrix rowwise. The last line contains the -*> reciprocal condition number for the average of the selected -*> eigenvalues, and the reciprocal condition number for the -*> corresponding right invariant subspace. The end of data in -*> indicated by a line containing N=0, M=0, and ISRT = 0. Even -*> if no data is to be tested, there must be at least one line -*> containing N=0, M=0 and ISRT=0. -*> -*>----------------------------------------------------------------------- -*> -*> CGG input file: -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix dimension N. -*> -*> line 4: NPARMS, INTEGER -*> Number of values of the parameters NB, NBMIN, NBCOL, NS, and -*> MAXB. -*> -*> line 5: NBVAL, INTEGER array, dimension (NPARMS) -*> The values for the blocksize NB. -*> -*> line 6: NBMIN, INTEGER array, dimension (NPARMS) -*> The values for NBMIN, the minimum row dimension for blocks. -*> -*> line 7: NSVAL, INTEGER array, dimension (NPARMS) -*> The values for the number of shifts. -*> -*> line 8: MXBVAL, INTEGER array, dimension (NPARMS) -*> The values for MAXB, used in determining minimum blocksize. -*> -*> line 9: IACC22, INTEGER array, dimension (NPARMS) -*> select structured matrix multiply: 1 or 2) -*> -*> line 10: NBCOL, INTEGER array, dimension (NPARMS) -*> The values for NBCOL, the minimum column dimension for -*> blocks. -*> -*> line 11: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 12: TSTCHK, LOGICAL -*> Flag indicating whether or not to test the LAPACK routines. -*> -*> line 13: TSTDRV, LOGICAL -*> Flag indicating whether or not to test the driver routines. -*> -*> line 14: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 15: NEWSD, INTEGER -*> A code indicating how to set the random number seed. 
-*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 15 was 2: -*> -*> line 16: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 17-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'CGG' for the generalized -*> eigenvalue problem routines and driver routines. -*> -*>----------------------------------------------------------------------- -*> -*> CGS and CGV input files: -*> -*> line 1: 'CGS' or 'CGV' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension(NN) -*> Dimensions of matrices to be tested. -*> -*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> These integer parameters determine how blocking is done -*> (see ILAENV for details) -*> NB : block size -*> NBMIN : minimum block size -*> NX : minimum dimension for blocking -*> NS : number of shifts in xHGEQR -*> NBCOL : minimum column dimension for blocking -*> -*> line 5: THRESH, REAL -*> The test threshold against which computed residuals are -*> compared. Should generally be in the range from 10. to 20. -*> If it is 0., all test case data will be printed. -*> -*> line 6: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits. -*> -*> line 7: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 17 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 7-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'CGS' for the generalized -*> eigenvalue problem routines and driver routines. -*> -*>----------------------------------------------------------------------- -*> -*> CGX input file: -*> line 1: 'CGX' in columns 1 to 3. -*> -*> line 2: N, INTEGER -*> Value of N. -*> -*> line 3: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> These integer parameters determine how blocking is done -*> (see ILAENV for details) -*> NB : block size -*> NBMIN : minimum block size -*> NX : minimum dimension for blocking -*> NS : number of shifts in xHGEQR -*> NBCOL : minimum column dimension for blocking -*> -*> line 4: THRESH, REAL -*> The test threshold against which computed residuals are -*> compared. Should generally be in the range from 10. to 20. -*> Information will be printed about each test for which the -*> test ratio is greater than or equal to the threshold. -*> -*> line 5: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 6: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 6 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> If line 2 was 0: -*> -*> line 7-EOF: Precomputed examples are tested. -*> -*> remaining lines : Each example is stored on 3+2*N*N lines, where N is -*> its dimension. The first line contains the dimension (a -*> single integer). 
The next line contains an integer k such -*> that only the last k eigenvalues will be selected and appear -*> in the leading diagonal blocks of $A$ and $B$. The next N*N -*> lines contain the matrix A, one element per line. The next N*N -*> lines contain the matrix B. The last line contains the -*> reciprocal of the eigenvalue cluster condition number and the -*> reciprocal of the deflating subspace (associated with the -*> selected eigencluster) condition number. The end of data is -*> indicated by dimension N=0. Even if no data is to be tested, -*> there must be at least one line containing N=0. -*> -*>----------------------------------------------------------------------- -*> -*> CXV input files: -*> line 1: 'CXV' in columns 1 to 3. -*> -*> line 2: N, INTEGER -*> Value of N. -*> -*> line 3: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> These integer parameters determine how blocking is done -*> (see ILAENV for details) -*> NB : block size -*> NBMIN : minimum block size -*> NX : minimum dimension for blocking -*> NS : number of shifts in xHGEQR -*> NBCOL : minimum column dimension for blocking -*> -*> line 4: THRESH, REAL -*> The test threshold against which computed residuals are -*> compared. Should generally be in the range from 10. to 20. -*> Information will be printed about each test for which the -*> test ratio is greater than or equal to the threshold. -*> -*> line 5: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 6: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 6 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> If line 2 was 0: -*> -*> line 7-EOF: Precomputed examples are tested. -*> -*> remaining lines : Each example is stored on 3+2*N*N lines, where N is -*> its dimension. The first line contains the dimension (a -*> single integer). The next N*N lines contain the matrix A, one -*> element per line. The next N*N lines contain the matrix B. -*> The next line contains the reciprocals of the eigenvalue -*> condition numbers. The last line contains the reciprocals of -*> the eigenvector condition numbers. The end of data is -*> indicated by dimension N=0. Even if no data is to be tested, -*> there must be at least one line containing N=0. -*> -*>----------------------------------------------------------------------- -*> -*> CHB input file: -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix dimension N. -*> -*> line 4: NK, INTEGER -*> Number of values of K. -*> -*> line 5: KVAL, INTEGER array, dimension (NK) -*> The values for the matrix dimension K. -*> -*> line 6: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: NEWSD, INTEGER -*> A code indicating how to set the random number seed. 
-*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 7 was 2: -*> -*> line 8: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 8-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'CHB'. -*> -*>----------------------------------------------------------------------- -*> -*> CBB input file: -*> -*> line 2: NN, INTEGER -*> Number of values of M and N. -*> -*> line 3: MVAL, INTEGER array, dimension (NN) -*> The values for the matrix row dimension M. -*> -*> line 4: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix column dimension N. -*> -*> line 4: NK, INTEGER -*> Number of values of K. -*> -*> line 5: KVAL, INTEGER array, dimension (NK) -*> The values for the matrix bandwidth K. -*> -*> line 6: NPARMS, INTEGER -*> Number of values of the parameter NRHS -*> -*> line 7: NSVAL, INTEGER array, dimension (NPARMS) -*> The values for the number of right hand sides NRHS. -*> -*> line 8: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 9: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 9 was 2: -*> -*> line 10: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 10-EOF: Lines specifying matrix types, as for SVD. -*> The 3-character path name is 'CBB'. -*> -*>----------------------------------------------------------------------- -*> -*> CEC input file: -*> -*> line 2: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> lines 3-EOF: -*> -*> Input for testing the eigencondition routines consists of a set of -*> specially constructed test cases and their solutions. The data -*> format is not intended to be modified by the user. -*> -*>----------------------------------------------------------------------- -*> -*> CBL and CBK input files: -*> -*> line 1: 'CBL' in columns 1-3 to test CGEBAL, or 'CBK' in -*> columns 1-3 to test CGEBAK. -*> -*> The remaining lines consist of specially constructed test cases. -*> -*>----------------------------------------------------------------------- -*> -*> CGL and CGK input files: -*> -*> line 1: 'CGL' in columns 1-3 to test CGGBAL, or 'CGK' in -*> columns 1-3 to test CGGBAK. -*> -*> The remaining lines consist of specially constructed test cases. -*> -*>----------------------------------------------------------------------- -*> -*> GLM data file: -*> -*> line 1: 'GLM' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NN) -*> Values of M (row dimension). -*> -*> line 4: PVAL, INTEGER array, dimension(NN) -*> Values of P (row dimension). -*> -*> line 5: NVAL, INTEGER array, dimension(NN) -*> Values of N (column dimension), note M <= N <= M+P. -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. 
Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'GLM' for the generalized -*> linear regression model routines. -*> -*>----------------------------------------------------------------------- -*> -*> GQR data file: -*> -*> line 1: 'GQR' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NN) -*> Values of M. -*> -*> line 4: PVAL, INTEGER array, dimension(NN) -*> Values of P. -*> -*> line 5: NVAL, INTEGER array, dimension(NN) -*> Values of N. -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'GQR' for the generalized -*> QR and RQ routines. -*> -*>----------------------------------------------------------------------- -*> -*> GSV data file: -*> -*> line 1: 'GSV' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NN) -*> Values of M (row dimension). -*> -*> line 4: PVAL, INTEGER array, dimension(NN) -*> Values of P (row dimension). -*> -*> line 5: NVAL, INTEGER array, dimension(NN) -*> Values of N (column dimension). -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'GSV' for the generalized -*> SVD routines. 
-*> -*>----------------------------------------------------------------------- -*> -*> CSD data file: -*> -*> line 1: 'CSD' in columns 1 to 3. -*> -*> line 2: NM, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NM) -*> Values of M (row and column dimension of orthogonal matrix). -*> -*> line 4: PVAL, INTEGER array, dimension(NM) -*> Values of P (row dimension of top-left block). -*> -*> line 5: NVAL, INTEGER array, dimension(NM) -*> Values of N (column dimension of top-left block). -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'CSD' for the CSD routine. -*> -*>----------------------------------------------------------------------- -*> -*> LSE data file: -*> -*> line 1: 'LSE' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NN) -*> Values of M. -*> -*> line 4: PVAL, INTEGER array, dimension(NN) -*> Values of P. -*> -*> line 5: NVAL, INTEGER array, dimension(NN) -*> Values of N, note P <= N <= P+M. -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'GSV' for the generalized -*> SVD routines. -*> -*>----------------------------------------------------------------------- -*> -*> NMAX is currently set to 132 and must be at least 12 for some of the -*> precomputed examples, and LWORK = NMAX*(5*NMAX+20) in the parameter -*> statements below. For SVD, we assume NRHS may be as big as N. The -*> parameter NEED is set to 14 to allow for 14 N-by-N matrices for CGG. -*> \endverbatim -* -* Arguments: -* ========== -* -* -* Authors: -* ======== -* -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. -* -*> \date June 2016 -* -*> \ingroup complex_eig -* -* ===================================================================== - PROGRAM CCHKEE -* -* -- LAPACK test routine (version 3.7.0) -- -* -- LAPACK is a software package provided by Univ. of Tennessee, -- -* -- Univ. 
of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* June 2016 -* -* ===================================================================== -* -* .. Parameters .. - INTEGER NMAX - PARAMETER ( NMAX = 132 ) - INTEGER NCMAX - PARAMETER ( NCMAX = 20 ) - INTEGER NEED - PARAMETER ( NEED = 14 ) - INTEGER LWORK - PARAMETER ( LWORK = NMAX*( 5*NMAX+20 ) ) - INTEGER LIWORK - PARAMETER ( LIWORK = NMAX*( NMAX+20 ) ) - INTEGER MAXIN - PARAMETER ( MAXIN = 20 ) - INTEGER MAXT - PARAMETER ( MAXT = 30 ) - INTEGER NIN, NOUT - PARAMETER ( NIN = 5, NOUT = 6 ) -* .. -* .. Local Scalars .. - LOGICAL CBB, CBK, CBL, CES, CEV, CGG, CGK, CGL, CGS, - $ CGV, CGX, CHB, CSD, CSX, CVX, CXV, FATAL, GLM, - $ GQR, GSV, LSE, NEP, SEP, SVD, TSTCHK, TSTDIF, - $ TSTDRV, TSTERR - CHARACTER C1 - CHARACTER*3 C3, PATH - CHARACTER*32 VNAME - CHARACTER*10 INTSTR - CHARACTER*80 LINE - INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, - $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - REAL EPS, S1, S2, THRESH, THRSHN -* .. -* .. Local Arrays .. - LOGICAL DOTYPE( MAXT ), LOGWRK( NMAX ) - INTEGER IOLDSD( 4 ), ISEED( 4 ), IWORK( LIWORK ), - $ KVAL( MAXIN ), MVAL( MAXIN ), MXBVAL( MAXIN ), - $ NBCOL( MAXIN ), NBMIN( MAXIN ), NBVAL( MAXIN ), - $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), - $ PVAL( MAXIN ) - INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), - $ ISHFTS( MAXIN ), IACC22( MAXIN ) - REAL ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), - $ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX ) - COMPLEX A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ), - $ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ), - $ X( 5*NMAX ) -* .. -* .. External Functions .. - LOGICAL LSAMEN - REAL SECOND, SLAMCH - EXTERNAL LSAMEN, SECOND, SLAMCH -* .. -* .. External Subroutines .. - EXTERNAL ALAREQ, CCHKBB, CCHKBD, CCHKBK, CCHKBL, CCHKEC, - $ CCHKGG, CCHKGK, CCHKGL, CCHKHB, CCHKHS, CCHKST, - $ CCKCSD, CCKGLM, CCKGQR, CCKGSV, CCKLSE, CDRGES, - $ CDRGEV, CDRGSX, CDRGVX, CDRVBD, CDRVES, CDRVEV, - $ CDRVSG, CDRVST, CDRVSX, CDRVVX, CERRBD, - $ CERRED, CERRGG, CERRHS, CERRST, ILAVER, XLAENV, - $ CDRGES3, CDRGEV3, - $ CCHKST2STG, CDRVST2STG, CCHKHB2STG -* .. -* .. Intrinsic Functions .. - INTRINSIC LEN, MIN -* .. -* .. Scalars in Common .. - LOGICAL LERR, OK - CHARACTER*32 SRNAMT - INTEGER INFOT, MAXB, NPROC, NSHIFT, NUNIT, SELDIM, - $ SELOPT -* .. -* .. Arrays in Common .. - LOGICAL SELVAL( 20 ) - INTEGER IPARMS( 100 ) - REAL SELWI( 20 ), SELWR( 20 ) -* .. -* .. Common blocks .. - COMMON / CENVIR / NPROC, NSHIFT, MAXB - COMMON / CLAENV / IPARMS - COMMON / INFOC / INFOT, NUNIT, OK, LERR - COMMON / SRNAMC / SRNAMT - COMMON / SSLCT / SELOPT, SELDIM, SELVAL, SELWR, SELWI -* .. -* .. Data statements .. - DATA INTSTR / '0123456789' / - DATA IOLDSD / 0, 0, 0, 1 / -* .. -* .. Executable Statements .. -* - A = 0.0 - B = 0.0 - C = 0.0 - DC = 0.0 - S1 = SECOND( ) - FATAL = .FALSE. - NUNIT = NOUT -* -* Return to here to read multiple sets of data -* - 10 CONTINUE -* -* Read the first line and set the 3-character test path -* - READ( NIN, FMT = '(A80)', END = 380 )LINE - PATH = LINE( 1: 3 ) - NEP = LSAMEN( 3, PATH, 'NEP' ) .OR. LSAMEN( 3, PATH, 'CHS' ) - SEP = LSAMEN( 3, PATH, 'SEP' ) .OR. LSAMEN( 3, PATH, 'CST' ) .OR. - $ LSAMEN( 3, PATH, 'CSG' ) .OR. LSAMEN( 3, PATH, 'SE2' ) - SVD = LSAMEN( 3, PATH, 'SVD' ) .OR. 
LSAMEN( 3, PATH, 'CBD' ) - CEV = LSAMEN( 3, PATH, 'CEV' ) - CES = LSAMEN( 3, PATH, 'CES' ) - CVX = LSAMEN( 3, PATH, 'CVX' ) - CSX = LSAMEN( 3, PATH, 'CSX' ) - CGG = LSAMEN( 3, PATH, 'CGG' ) - CGS = LSAMEN( 3, PATH, 'CGS' ) - CGX = LSAMEN( 3, PATH, 'CGX' ) - CGV = LSAMEN( 3, PATH, 'CGV' ) - CXV = LSAMEN( 3, PATH, 'CXV' ) - CHB = LSAMEN( 3, PATH, 'CHB' ) - CBB = LSAMEN( 3, PATH, 'CBB' ) - GLM = LSAMEN( 3, PATH, 'GLM' ) - GQR = LSAMEN( 3, PATH, 'GQR' ) .OR. LSAMEN( 3, PATH, 'GRQ' ) - GSV = LSAMEN( 3, PATH, 'GSV' ) - CSD = LSAMEN( 3, PATH, 'CSD' ) - LSE = LSAMEN( 3, PATH, 'LSE' ) - CBL = LSAMEN( 3, PATH, 'CBL' ) - CBK = LSAMEN( 3, PATH, 'CBK' ) - CGL = LSAMEN( 3, PATH, 'CGL' ) - CGK = LSAMEN( 3, PATH, 'CGK' ) -* -* Report values of parameters. -* - IF( PATH.EQ.' ' ) THEN - GO TO 10 - ELSE IF( NEP ) THEN - WRITE( NOUT, FMT = 9987 ) - ELSE IF( SEP ) THEN - WRITE( NOUT, FMT = 9986 ) - ELSE IF( SVD ) THEN - WRITE( NOUT, FMT = 9985 ) - ELSE IF( CEV ) THEN - WRITE( NOUT, FMT = 9979 ) - ELSE IF( CES ) THEN - WRITE( NOUT, FMT = 9978 ) - ELSE IF( CVX ) THEN - WRITE( NOUT, FMT = 9977 ) - ELSE IF( CSX ) THEN - WRITE( NOUT, FMT = 9976 ) - ELSE IF( CGG ) THEN - WRITE( NOUT, FMT = 9975 ) - ELSE IF( CGS ) THEN - WRITE( NOUT, FMT = 9964 ) - ELSE IF( CGX ) THEN - WRITE( NOUT, FMT = 9965 ) - ELSE IF( CGV ) THEN - WRITE( NOUT, FMT = 9963 ) - ELSE IF( CXV ) THEN - WRITE( NOUT, FMT = 9962 ) - ELSE IF( CHB ) THEN - WRITE( NOUT, FMT = 9974 ) - ELSE IF( CBB ) THEN - WRITE( NOUT, FMT = 9967 ) - ELSE IF( GLM ) THEN - WRITE( NOUT, FMT = 9971 ) - ELSE IF( GQR ) THEN - WRITE( NOUT, FMT = 9970 ) - ELSE IF( GSV ) THEN - WRITE( NOUT, FMT = 9969 ) - ELSE IF( CSD ) THEN - WRITE( NOUT, FMT = 9960 ) - ELSE IF( LSE ) THEN - WRITE( NOUT, FMT = 9968 ) - ELSE IF( CBL ) THEN -* -* CGEBAL: Balancing -* - CALL CCHKBL( NIN, NOUT ) - GO TO 380 - ELSE IF( CBK ) THEN -* -* CGEBAK: Back transformation -* - CALL CCHKBK( NIN, NOUT ) - GO TO 380 - ELSE IF( CGL ) THEN -* -* CGGBAL: Balancing -* - CALL CCHKGL( NIN, NOUT ) - GO TO 380 - ELSE IF( CGK ) THEN -* -* CGGBAK: Back transformation -* - CALL CCHKGK( NIN, NOUT ) - GO TO 380 - ELSE IF( LSAMEN( 3, PATH, 'CEC' ) ) THEN -* -* CEC: Eigencondition estimation -* - READ( NIN, FMT = * )THRESH - CALL XLAENV( 1, 1 ) - CALL XLAENV( 12, 1 ) - TSTERR = .TRUE. - CALL CCHKEC( THRESH, TSTERR, NIN, NOUT ) - GO TO 380 - ELSE - WRITE( NOUT, FMT = 9992 )PATH - GO TO 380 - END IF - CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) - WRITE( NOUT, FMT = 9972 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH - WRITE( NOUT, FMT = 9984 ) -* -* Read the number of values of M, P, and N. -* - READ( NIN, FMT = * )NN - IF( NN.LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' NN ', NN, 1 - NN = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9988 )' NN ', NN, MAXIN - NN = 0 - FATAL = .TRUE. - END IF -* -* Read the values of M -* - IF( .NOT.( CGX .OR. CXV ) ) THEN - READ( NIN, FMT = * )( MVAL( I ), I = 1, NN ) - IF( SVD ) THEN - VNAME = ' M ' - ELSE - VNAME = ' N ' - END IF - DO 20 I = 1, NN - IF( MVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )VNAME, MVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( MVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )VNAME, MVAL( I ), NMAX - FATAL = .TRUE. - END IF - 20 CONTINUE - WRITE( NOUT, FMT = 9983 )'M: ', ( MVAL( I ), I = 1, NN ) - END IF -* -* Read the values of P -* - IF( GLM .OR. GQR .OR. GSV .OR. CSD .OR. LSE ) THEN - READ( NIN, FMT = * )( PVAL( I ), I = 1, NN ) - DO 30 I = 1, NN - IF( PVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' P ', PVAL( I ), 0 - FATAL = .TRUE. 
- ELSE IF( PVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' P ', PVAL( I ), NMAX - FATAL = .TRUE. - END IF - 30 CONTINUE - WRITE( NOUT, FMT = 9983 )'P: ', ( PVAL( I ), I = 1, NN ) - END IF -* -* Read the values of N -* - IF( SVD .OR. CBB .OR. GLM .OR. GQR .OR. GSV .OR. CSD .OR. - $ LSE ) THEN - READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) - DO 40 I = 1, NN - IF( NVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' N ', NVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' N ', NVAL( I ), NMAX - FATAL = .TRUE. - END IF - 40 CONTINUE - ELSE - DO 50 I = 1, NN - NVAL( I ) = MVAL( I ) - 50 CONTINUE - END IF - IF( .NOT.( CGX .OR. CXV ) ) THEN - WRITE( NOUT, FMT = 9983 )'N: ', ( NVAL( I ), I = 1, NN ) - ELSE - WRITE( NOUT, FMT = 9983 )'N: ', NN - END IF -* -* Read the number of values of K, followed by the values of K -* - IF( CHB .OR. CBB ) THEN - READ( NIN, FMT = * )NK - READ( NIN, FMT = * )( KVAL( I ), I = 1, NK ) - DO 60 I = 1, NK - IF( KVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' K ', KVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( KVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' K ', KVAL( I ), NMAX - FATAL = .TRUE. - END IF - 60 CONTINUE - WRITE( NOUT, FMT = 9983 )'K: ', ( KVAL( I ), I = 1, NK ) - END IF -* - IF( CEV .OR. CES .OR. CVX .OR. CSX ) THEN -* -* For the nonsymmetric QR driver routines, only one set of -* parameters is allowed. -* - READ( NIN, FMT = * )NBVAL( 1 ), NBMIN( 1 ), NXVAL( 1 ), - $ INMIN( 1 ), INWIN( 1 ), INIBL(1), ISHFTS(1), IACC22(1) - IF( NBVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NBMIN( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NXVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( INMIN( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' INMIN ', INMIN( 1 ), 1 - FATAL = .TRUE. - ELSE IF( INWIN( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' INWIN ', INWIN( 1 ), 1 - FATAL = .TRUE. - ELSE IF( INIBL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' INIBL ', INIBL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( ISHFTS( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' ISHFTS ', ISHFTS( 1 ), 1 - FATAL = .TRUE. - ELSE IF( IACC22( 1 ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' IACC22 ', IACC22( 1 ), 0 - FATAL = .TRUE. - END IF - CALL XLAENV( 1, NBVAL( 1 ) ) - CALL XLAENV( 2, NBMIN( 1 ) ) - CALL XLAENV( 3, NXVAL( 1 ) ) - CALL XLAENV(12, MAX( 11, INMIN( 1 ) ) ) - CALL XLAENV(13, INWIN( 1 ) ) - CALL XLAENV(14, INIBL( 1 ) ) - CALL XLAENV(15, ISHFTS( 1 ) ) - CALL XLAENV(16, IACC22( 1 ) ) - WRITE( NOUT, FMT = 9983 )'NB: ', NBVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'NBMIN:', NBMIN( 1 ) - WRITE( NOUT, FMT = 9983 )'NX: ', NXVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'INMIN: ', INMIN( 1 ) - WRITE( NOUT, FMT = 9983 )'INWIN: ', INWIN( 1 ) - WRITE( NOUT, FMT = 9983 )'INIBL: ', INIBL( 1 ) - WRITE( NOUT, FMT = 9983 )'ISHFTS: ', ISHFTS( 1 ) - WRITE( NOUT, FMT = 9983 )'IACC22: ', IACC22( 1 ) -* - ELSE IF( CGS .OR. CGX .OR. CGV .OR. CXV ) THEN -* -* For the nonsymmetric generalized driver routines, only one set of -* parameters is allowed. -* - READ( NIN, FMT = * )NBVAL( 1 ), NBMIN( 1 ), NXVAL( 1 ), - $ NSVAL( 1 ), MXBVAL( 1 ) - IF( NBVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NBMIN( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NXVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( 1 ), 1 - FATAL = .TRUE. 
- ELSE IF( NSVAL( 1 ).LT.2 ) THEN - WRITE( NOUT, FMT = 9989 )' NS ', NSVAL( 1 ), 2 - FATAL = .TRUE. - ELSE IF( MXBVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' MAXB ', MXBVAL( 1 ), 1 - FATAL = .TRUE. - END IF - CALL XLAENV( 1, NBVAL( 1 ) ) - CALL XLAENV( 2, NBMIN( 1 ) ) - CALL XLAENV( 3, NXVAL( 1 ) ) - CALL XLAENV( 4, NSVAL( 1 ) ) - CALL XLAENV( 8, MXBVAL( 1 ) ) - WRITE( NOUT, FMT = 9983 )'NB: ', NBVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'NBMIN:', NBMIN( 1 ) - WRITE( NOUT, FMT = 9983 )'NX: ', NXVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'NS: ', NSVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'MAXB: ', MXBVAL( 1 ) - ELSE IF( .NOT.CHB .AND. .NOT.GLM .AND. .NOT.GQR .AND. .NOT. - $ GSV .AND. .NOT.CSD .AND. .NOT.LSE ) THEN -* -* For the other paths, the number of parameters can be varied -* from the input file. Read the number of parameter values. -* - READ( NIN, FMT = * )NPARMS - IF( NPARMS.LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )'NPARMS', NPARMS, 1 - NPARMS = 0 - FATAL = .TRUE. - ELSE IF( NPARMS.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9988 )'NPARMS', NPARMS, MAXIN - NPARMS = 0 - FATAL = .TRUE. - END IF -* -* Read the values of NB -* - IF( .NOT.CBB ) THEN - READ( NIN, FMT = * )( NBVAL( I ), I = 1, NPARMS ) - DO 70 I = 1, NPARMS - IF( NBVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NBVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' NB ', NBVAL( I ), NMAX - FATAL = .TRUE. - END IF - 70 CONTINUE - WRITE( NOUT, FMT = 9983 )'NB: ', - $ ( NBVAL( I ), I = 1, NPARMS ) - END IF -* -* Read the values of NBMIN -* - IF( NEP .OR. SEP .OR. SVD .OR. CGG ) THEN - READ( NIN, FMT = * )( NBMIN( I ), I = 1, NPARMS ) - DO 80 I = 1, NPARMS - IF( NBMIN( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( I ), 0 - FATAL = .TRUE. - ELSE IF( NBMIN( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )'NBMIN ', NBMIN( I ), NMAX - FATAL = .TRUE. - END IF - 80 CONTINUE - WRITE( NOUT, FMT = 9983 )'NBMIN:', - $ ( NBMIN( I ), I = 1, NPARMS ) - ELSE - DO 90 I = 1, NPARMS - NBMIN( I ) = 1 - 90 CONTINUE - END IF -* -* Read the values of NX -* - IF( NEP .OR. SEP .OR. SVD ) THEN - READ( NIN, FMT = * )( NXVAL( I ), I = 1, NPARMS ) - DO 100 I = 1, NPARMS - IF( NXVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NXVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' NX ', NXVAL( I ), NMAX - FATAL = .TRUE. - END IF - 100 CONTINUE - WRITE( NOUT, FMT = 9983 )'NX: ', - $ ( NXVAL( I ), I = 1, NPARMS ) - ELSE - DO 110 I = 1, NPARMS - NXVAL( I ) = 1 - 110 CONTINUE - END IF -* -* Read the values of NSHIFT (if CGG) or NRHS (if SVD -* or CBB). -* - IF( SVD .OR. CBB .OR. CGG ) THEN - READ( NIN, FMT = * )( NSVAL( I ), I = 1, NPARMS ) - DO 120 I = 1, NPARMS - IF( NSVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' NS ', NSVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NSVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' NS ', NSVAL( I ), NMAX - FATAL = .TRUE. - END IF - 120 CONTINUE - WRITE( NOUT, FMT = 9983 )'NS: ', - $ ( NSVAL( I ), I = 1, NPARMS ) - ELSE - DO 130 I = 1, NPARMS - NSVAL( I ) = 1 - 130 CONTINUE - END IF -* -* Read the values for MAXB. -* - IF( CGG ) THEN - READ( NIN, FMT = * )( MXBVAL( I ), I = 1, NPARMS ) - DO 140 I = 1, NPARMS - IF( MXBVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' MAXB ', MXBVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( MXBVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' MAXB ', MXBVAL( I ), NMAX - FATAL = .TRUE. 
- END IF - 140 CONTINUE - WRITE( NOUT, FMT = 9983 )'MAXB: ', - $ ( MXBVAL( I ), I = 1, NPARMS ) - ELSE - DO 150 I = 1, NPARMS - MXBVAL( I ) = 1 - 150 CONTINUE - END IF -* -* Read the values for INMIN. -* - IF( NEP ) THEN - READ( NIN, FMT = * )( INMIN( I ), I = 1, NPARMS ) - DO 540 I = 1, NPARMS - IF( INMIN( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' INMIN ', INMIN( I ), 0 - FATAL = .TRUE. - END IF - 540 CONTINUE - WRITE( NOUT, FMT = 9983 )'INMIN: ', - $ ( INMIN( I ), I = 1, NPARMS ) - ELSE - DO 550 I = 1, NPARMS - INMIN( I ) = 1 - 550 CONTINUE - END IF -* -* Read the values for INWIN. -* - IF( NEP ) THEN - READ( NIN, FMT = * )( INWIN( I ), I = 1, NPARMS ) - DO 560 I = 1, NPARMS - IF( INWIN( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' INWIN ', INWIN( I ), 0 - FATAL = .TRUE. - END IF - 560 CONTINUE - WRITE( NOUT, FMT = 9983 )'INWIN: ', - $ ( INWIN( I ), I = 1, NPARMS ) - ELSE - DO 570 I = 1, NPARMS - INWIN( I ) = 1 - 570 CONTINUE - END IF -* -* Read the values for INIBL. -* - IF( NEP ) THEN - READ( NIN, FMT = * )( INIBL( I ), I = 1, NPARMS ) - DO 580 I = 1, NPARMS - IF( INIBL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' INIBL ', INIBL( I ), 0 - FATAL = .TRUE. - END IF - 580 CONTINUE - WRITE( NOUT, FMT = 9983 )'INIBL: ', - $ ( INIBL( I ), I = 1, NPARMS ) - ELSE - DO 590 I = 1, NPARMS - INIBL( I ) = 1 - 590 CONTINUE - END IF -* -* Read the values for ISHFTS. -* - IF( NEP ) THEN - READ( NIN, FMT = * )( ISHFTS( I ), I = 1, NPARMS ) - DO 600 I = 1, NPARMS - IF( ISHFTS( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' ISHFTS ', ISHFTS( I ), 0 - FATAL = .TRUE. - END IF - 600 CONTINUE - WRITE( NOUT, FMT = 9983 )'ISHFTS: ', - $ ( ISHFTS( I ), I = 1, NPARMS ) - ELSE - DO 610 I = 1, NPARMS - ISHFTS( I ) = 1 - 610 CONTINUE - END IF -* -* Read the values for IACC22. -* - IF( NEP .OR. CGG ) THEN - READ( NIN, FMT = * )( IACC22( I ), I = 1, NPARMS ) - DO 620 I = 1, NPARMS - IF( IACC22( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' IACC22 ', IACC22( I ), 0 - FATAL = .TRUE. - END IF - 620 CONTINUE - WRITE( NOUT, FMT = 9983 )'IACC22: ', - $ ( IACC22( I ), I = 1, NPARMS ) - ELSE - DO 630 I = 1, NPARMS - IACC22( I ) = 1 - 630 CONTINUE - END IF -* -* Read the values for NBCOL. -* - IF( CGG ) THEN - READ( NIN, FMT = * )( NBCOL( I ), I = 1, NPARMS ) - DO 160 I = 1, NPARMS - IF( NBCOL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )'NBCOL ', NBCOL( I ), 0 - FATAL = .TRUE. - ELSE IF( NBCOL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )'NBCOL ', NBCOL( I ), NMAX - FATAL = .TRUE. - END IF - 160 CONTINUE - WRITE( NOUT, FMT = 9983 )'NBCOL:', - $ ( NBCOL( I ), I = 1, NPARMS ) - ELSE - DO 170 I = 1, NPARMS - NBCOL( I ) = 1 - 170 CONTINUE - END IF - END IF -* -* Calculate and print the machine dependent constants. -* - WRITE( NOUT, FMT = * ) - EPS = SLAMCH( 'Underflow threshold' ) - WRITE( NOUT, FMT = 9981 )'underflow', EPS - EPS = SLAMCH( 'Overflow threshold' ) - WRITE( NOUT, FMT = 9981 )'overflow ', EPS - EPS = SLAMCH( 'Epsilon' ) - WRITE( NOUT, FMT = 9981 )'precision', EPS -* -* Read the threshold value for the test ratios. -* - READ( NIN, FMT = * )THRESH - WRITE( NOUT, FMT = 9982 )THRESH - IF( SEP .OR. SVD .OR. CGG ) THEN -* -* Read the flag that indicates whether to test LAPACK routines. -* - READ( NIN, FMT = * )TSTCHK -* -* Read the flag that indicates whether to test driver routines. -* - READ( NIN, FMT = * )TSTDRV - END IF -* -* Read the flag that indicates whether to test the error exits. -* - READ( NIN, FMT = * )TSTERR -* -* Read the code describing how to set the random number seed. 
-* - READ( NIN, FMT = * )NEWSD -* -* If NEWSD = 2, read another line with 4 integers for the seed. -* - IF( NEWSD.EQ.2 ) - $ READ( NIN, FMT = * )( IOLDSD( I ), I = 1, 4 ) -* - DO 180 I = 1, 4 - ISEED( I ) = IOLDSD( I ) - 180 CONTINUE -* - IF( FATAL ) THEN - WRITE( NOUT, FMT = 9999 ) - STOP - END IF -* -* Read the input lines indicating the test path and its parameters. -* The first three characters indicate the test path, and the number -* of test matrix types must be the first nonblank item in columns -* 4-80. -* - 190 CONTINUE -* - IF( .NOT.( CGX .OR. CXV ) ) THEN -* - 200 CONTINUE - READ( NIN, FMT = '(A80)', END = 380 )LINE - C3 = LINE( 1: 3 ) - LENP = LEN( LINE ) - I = 3 - ITMP = 0 - I1 = 0 - 210 CONTINUE - I = I + 1 - IF( I.GT.LENP ) THEN - IF( I1.GT.0 ) THEN - GO TO 240 - ELSE - NTYPES = MAXT - GO TO 240 - END IF - END IF - IF( LINE( I: I ).NE.' ' .AND. LINE( I: I ).NE.',' ) THEN - I1 = I - C1 = LINE( I1: I1 ) -* -* Check that a valid integer was read -* - DO 220 K = 1, 10 - IF( C1.EQ.INTSTR( K: K ) ) THEN - IC = K - 1 - GO TO 230 - END IF - 220 CONTINUE - WRITE( NOUT, FMT = 9991 )I, LINE - GO TO 200 - 230 CONTINUE - ITMP = 10*ITMP + IC - GO TO 210 - ELSE IF( I1.GT.0 ) THEN - GO TO 240 - ELSE - GO TO 210 - END IF - 240 CONTINUE - NTYPES = ITMP -* -* Skip the tests if NTYPES is <= 0. -* - IF( .NOT.( CEV .OR. CES .OR. CVX .OR. CSX .OR. CGV .OR. - $ CGS ) .AND. NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - GO TO 200 - END IF -* - ELSE - IF( CGX ) - $ C3 = 'CGX' - IF( CXV ) - $ C3 = 'CXV' - END IF -* -* Reset the random number seed. -* - IF( NEWSD.EQ.0 ) THEN - DO 250 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 250 CONTINUE - END IF -* - IF( LSAMEN( 3, C3, 'CHS' ) .OR. LSAMEN( 3, C3, 'NEP' ) ) THEN -* -* ------------------------------------- -* NEP: Nonsymmetric Eigenvalue Problem -* ------------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NX = crossover point -* NS = number of shifts -* MAXB = minimum submatrix size -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL CERRHS( 'CHSEQR', NOUT ) - DO 270 I = 1, NPARMS - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 3, NXVAL( I ) ) - CALL XLAENV(12, MAX( 11, INMIN( I ) ) ) - CALL XLAENV(13, INWIN( I ) ) - CALL XLAENV(14, INIBL( I ) ) - CALL XLAENV(15, ISHFTS( I ) ) - CALL XLAENV(16, IACC22( I ) ) -* - IF( NEWSD.EQ.0 ) THEN - DO 260 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 260 CONTINUE - END IF - WRITE( NOUT, FMT = 9961 )C3, NBVAL( I ), NBMIN( I ), - $ NXVAL( I ), MAX( 11, INMIN(I)), - $ INWIN( I ), INIBL( I ), ISHFTS( I ), IACC22( I ) - CALL CCHKHS( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 5 ), NMAX, A( 1, 6 ), - $ A( 1, 7 ), DC( 1, 1 ), DC( 1, 2 ), A( 1, 8 ), - $ A( 1, 9 ), A( 1, 10 ), A( 1, 11 ), A( 1, 12 ), - $ DC( 1, 3 ), WORK, LWORK, RWORK, IWORK, LOGWRK, - $ RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CCHKHS', INFO - 270 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'CST' ) .OR. LSAMEN( 3, C3, 'SEP' ) - $ .OR. 
LSAMEN( 3, C3, 'SE2' ) ) THEN -* -* ---------------------------------- -* SEP: Symmetric Eigenvalue Problem -* ---------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NX = crossover point -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 1, 1 ) - CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL CERRST( 'CST', NOUT ) - DO 290 I = 1, NPARMS - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 3, NXVAL( I ) ) -* - IF( NEWSD.EQ.0 ) THEN - DO 280 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 280 CONTINUE - END IF - WRITE( NOUT, FMT = 9997 )C3, NBVAL( I ), NBMIN( I ), - $ NXVAL( I ) - IF( TSTCHK ) THEN - IF( LSAMEN( 3, C3, 'SE2' ) ) THEN - CALL CCHKST2STG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), - $ DR( 1, 1 ), DR( 1, 2 ), DR( 1, 3 ), - $ DR( 1, 4 ), DR( 1, 5 ), DR( 1, 6 ), - $ DR( 1, 7 ), DR( 1, 8 ), DR( 1, 9 ), - $ DR( 1, 10 ), DR( 1, 11 ), A( 1, 3 ), NMAX, - $ A( 1, 4 ), A( 1, 5 ), DC( 1, 1 ), A( 1, 6 ), - $ WORK, LWORK, RWORK, LWORK, IWORK, LIWORK, - $ RESULT, INFO ) - ELSE - CALL CCHKST( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), - $ DR( 1, 1 ), DR( 1, 2 ), DR( 1, 3 ), - $ DR( 1, 4 ), DR( 1, 5 ), DR( 1, 6 ), - $ DR( 1, 7 ), DR( 1, 8 ), DR( 1, 9 ), - $ DR( 1, 10 ), DR( 1, 11 ), A( 1, 3 ), NMAX, - $ A( 1, 4 ), A( 1, 5 ), DC( 1, 1 ), A( 1, 6 ), - $ WORK, LWORK, RWORK, LWORK, IWORK, LIWORK, - $ RESULT, INFO ) - ENDIF - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CCHKST', INFO - END IF - IF( TSTDRV ) THEN - IF( LSAMEN( 3, C3, 'SE2' ) ) THEN - CALL CDRVST2STG( NN, NVAL, 18, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, DR( 1, 3 ), DR( 1, 4 ), - $ DR( 1, 5 ), DR( 1, 8 ), DR( 1, 9 ), - $ DR( 1, 10 ), A( 1, 2 ), NMAX, A( 1, 3 ), - $ DC( 1, 1 ), A( 1, 4 ), WORK, LWORK, RWORK, - $ LWORK, IWORK, LIWORK, RESULT, INFO ) - ELSE - CALL CDRVST( NN, NVAL, 18, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, DR( 1, 3 ), DR( 1, 4 ), - $ DR( 1, 5 ), DR( 1, 8 ), DR( 1, 9 ), - $ DR( 1, 10 ), A( 1, 2 ), NMAX, A( 1, 3 ), - $ DC( 1, 1 ), A( 1, 4 ), WORK, LWORK, RWORK, - $ LWORK, IWORK, LIWORK, RESULT, INFO ) - ENDIF - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CDRVST', INFO - END IF - 290 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'CSG' ) ) THEN -* -* ---------------------------------------------- -* CSG: Hermitian Generalized Eigenvalue Problem -* ---------------------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NX = crossover point -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 9, 25 ) - DO 310 I = 1, NPARMS - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 3, NXVAL( I ) ) -* - IF( NEWSD.EQ.0 ) THEN - DO 300 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 300 CONTINUE - END IF - WRITE( NOUT, FMT = 9997 )C3, NBVAL( I ), NBMIN( I ), - $ NXVAL( I ) - IF( TSTCHK ) THEN -* CALL CDRVSG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, -* $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, -* $ DR( 1, 3 ), A( 1, 3 ), NMAX, A( 1, 4 ), -* $ A( 1, 5 ), A( 1, 6 ), A( 1, 7 ), WORK, -* $ LWORK, RWORK, LWORK, IWORK, LIWORK, RESULT, -* $ INFO ) - CALL CDRVSG2STG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, - $ DR( 1, 3 ), DR( 1, 4 ), A( 1, 3 ), NMAX, - $ A( 1, 4 ), A( 1, 5 ), A( 1, 6 ), - $ A( 1, 7 ), WORK, LWORK, RWORK, LWORK, - $ IWORK, LIWORK, 
RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CDRVSG', INFO - END IF - 310 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'CBD' ) .OR. LSAMEN( 3, C3, 'SVD' ) ) THEN -* -* ---------------------------------- -* SVD: Singular Value Decomposition -* ---------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NX = crossover point -* NRHS = number of right hand sides -* - MAXTYP = 16 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 9, 25 ) -* -* Test the error exits -* - CALL XLAENV( 1, 1 ) - IF( TSTERR .AND. TSTCHK ) - $ CALL CERRBD( 'CBD', NOUT ) - IF( TSTERR .AND. TSTDRV ) - $ CALL CERRED( 'CBD', NOUT ) -* - DO 330 I = 1, NPARMS - NRHS = NSVAL( I ) - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 3, NXVAL( I ) ) - IF( NEWSD.EQ.0 ) THEN - DO 320 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 320 CONTINUE - END IF - WRITE( NOUT, FMT = 9995 )C3, NBVAL( I ), NBMIN( I ), - $ NXVAL( I ), NRHS - IF( TSTCHK ) THEN - CALL CCHKBD( NN, MVAL, NVAL, MAXTYP, DOTYPE, NRHS, ISEED, - $ THRESH, A( 1, 1 ), NMAX, DR( 1, 1 ), - $ DR( 1, 2 ), DR( 1, 3 ), DR( 1, 4 ), - $ A( 1, 2 ), NMAX, A( 1, 3 ), A( 1, 4 ), - $ A( 1, 5 ), NMAX, A( 1, 6 ), NMAX, A( 1, 7 ), - $ A( 1, 8 ), WORK, LWORK, RWORK, NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CCHKBD', INFO - END IF - IF( TSTDRV ) - $ CALL CDRVBD( NN, MVAL, NVAL, MAXTYP, DOTYPE, ISEED, - $ THRESH, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, - $ A( 1, 3 ), NMAX, A( 1, 4 ), A( 1, 5 ), - $ A( 1, 6 ), DR( 1, 1 ), DR( 1, 2 ), - $ DR( 1, 3 ), WORK, LWORK, RWORK, IWORK, NOUT, - $ INFO ) - 330 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'CEV' ) ) THEN -* -* -------------------------------------------- -* CEV: Nonsymmetric Eigenvalue Problem Driver -* CGEEV (eigenvalues and eigenvectors) -* -------------------------------------------- -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL CERRED( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL CDRVEV( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), DC( 1, 1 ), - $ DC( 1, 2 ), A( 1, 3 ), NMAX, A( 1, 4 ), NMAX, - $ A( 1, 5 ), NMAX, RESULT, WORK, LWORK, RWORK, - $ IWORK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CGEEV', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'CES' ) ) THEN -* -* -------------------------------------------- -* CES: Nonsymmetric Eigenvalue Problem Driver -* CGEES (Schur form) -* -------------------------------------------- -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL CERRED( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL CDRVES( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ DC( 1, 1 ), DC( 1, 2 ), A( 1, 4 ), NMAX, - $ RESULT, WORK, LWORK, RWORK, IWORK, LOGWRK, - $ INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CGEES', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'CVX' ) ) THEN -* -* -------------------------------------------------------------- -* CVX: Nonsymmetric Eigenvalue Problem Expert Driver -* CGEEVX (eigenvalues, eigenvectors and condition numbers) -* -------------------------------------------------------------- -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LT.0 ) THEN - 
WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL CERRED( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL CDRVVX( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NIN, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), DC( 1, 1 ), - $ DC( 1, 2 ), A( 1, 3 ), NMAX, A( 1, 4 ), NMAX, - $ A( 1, 5 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), - $ DR( 1, 3 ), DR( 1, 4 ), DR( 1, 5 ), DR( 1, 6 ), - $ DR( 1, 7 ), DR( 1, 8 ), RESULT, WORK, LWORK, - $ RWORK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CGEEVX', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'CSX' ) ) THEN -* -* --------------------------------------------------- -* CSX: Nonsymmetric Eigenvalue Problem Expert Driver -* CGEESX (Schur form and condition numbers) -* --------------------------------------------------- -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LT.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL CERRED( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL CDRVSX( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NIN, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ DC( 1, 1 ), DC( 1, 2 ), DC( 1, 3 ), A( 1, 4 ), - $ NMAX, A( 1, 5 ), RESULT, WORK, LWORK, RWORK, - $ LOGWRK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CGEESX', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'CGG' ) ) THEN -* -* ------------------------------------------------- -* CGG: Generalized Nonsymmetric Eigenvalue Problem -* ------------------------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NS = number of shifts -* MAXB = minimum submatrix size -* IACC22: structured matrix multiply -* NBCOL = minimum column dimension for blocks -* - MAXTYP = 26 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV(1,1) - IF( TSTCHK .AND. TSTERR ) - $ CALL CERRGG( C3, NOUT ) - DO 350 I = 1, NPARMS - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 4, NSVAL( I ) ) - CALL XLAENV( 8, MXBVAL( I ) ) - CALL XLAENV( 16, IACC22( I ) ) - CALL XLAENV( 5, NBCOL( I ) ) -* - IF( NEWSD.EQ.0 ) THEN - DO 340 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 340 CONTINUE - END IF - WRITE( NOUT, FMT = 9996 )C3, NBVAL( I ), NBMIN( I ), - $ NSVAL( I ), MXBVAL( I ), IACC22( I ), NBCOL( I ) - TSTDIF = .FALSE. - THRSHN = 10. 
- IF( TSTCHK ) THEN - CALL CCHKGG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, - $ TSTDIF, THRSHN, NOUT, A( 1, 1 ), NMAX, - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ A( 1, 6 ), A( 1, 7 ), A( 1, 8 ), A( 1, 9 ), - $ NMAX, A( 1, 10 ), A( 1, 11 ), A( 1, 12 ), - $ DC( 1, 1 ), DC( 1, 2 ), DC( 1, 3 ), - $ DC( 1, 4 ), A( 1, 13 ), A( 1, 14 ), WORK, - $ LWORK, RWORK, LOGWRK, RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CCHKGG', INFO - END IF - 350 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'CGS' ) ) THEN -* -* ------------------------------------------------- -* CGS: Generalized Nonsymmetric Eigenvalue Problem -* CGGES (Schur form) -* ------------------------------------------------- -* - MAXTYP = 26 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL CERRGG( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL CDRGES( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), - $ DC( 1, 1 ), DC( 1, 2 ), WORK, LWORK, RWORK, - $ RESULT, LOGWRK, INFO ) -* - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CDRGES', INFO -* -* Blocked version -* - CALL XLAENV(16,2) - CALL CDRGES3( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), - $ DC( 1, 1 ), DC( 1, 2 ), WORK, LWORK, RWORK, - $ RESULT, LOGWRK, INFO ) -* - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CDRGES3', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - - GO TO 10 -* - ELSE IF( CGX ) THEN -* -* ------------------------------------------------- -* CGX Generalized Nonsymmetric Eigenvalue Problem -* CGGESX (Schur form and condition numbers) -* ------------------------------------------------- -* - MAXTYP = 5 - NTYPES = MAXTYP - IF( NN.LT.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL CERRGG( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 5, 2 ) - CALL CDRGSX( NN, NCMAX, THRESH, NIN, NOUT, A( 1, 1 ), NMAX, - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ A( 1, 6 ), DC( 1, 1 ), DC( 1, 2 ), C, - $ NCMAX*NCMAX, S, WORK, LWORK, RWORK, IWORK, - $ LIWORK, LOGWRK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CDRGSX', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'CGV' ) ) THEN -* -* ------------------------------------------------- -* CGV: Generalized Nonsymmetric Eigenvalue Problem -* CGGEV (Eigenvalue/vector form) -* ------------------------------------------------- -* - MAXTYP = 26 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL CERRGG( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL CDRGEV( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), - $ A( 1, 9 ), NMAX, DC( 1, 1 ), DC( 1, 2 ), - $ DC( 1, 3 ), DC( 1, 4 ), WORK, LWORK, RWORK, - $ RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CDRGEV', INFO -* -* Blocked version -* - CALL XLAENV(16,2) - CALL CDRGEV3( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), - $ A( 1, 9 ), NMAX, DC( 1, 1 ), DC( 1, 2 ), - $ DC( 1, 3 ), DC( 1, 4 ), WORK, LWORK, RWORK, - $ RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CDRGEV3', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - 
ELSE IF( CXV ) THEN -* -* ------------------------------------------------- -* CXV: Generalized Nonsymmetric Eigenvalue Problem -* CGGEVX (eigenvalue/vector with condition numbers) -* ------------------------------------------------- -* - MAXTYP = 2 - NTYPES = MAXTYP - IF( NN.LT.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL CERRGG( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL CDRGVX( NN, THRESH, NIN, NOUT, A( 1, 1 ), NMAX, - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), DC( 1, 1 ), - $ DC( 1, 2 ), A( 1, 5 ), A( 1, 6 ), IWORK( 1 ), - $ IWORK( 2 ), DR( 1, 1 ), DR( 1, 2 ), DR( 1, 3 ), - $ DR( 1, 4 ), DR( 1, 5 ), DR( 1, 6 ), WORK, - $ LWORK, RWORK, IWORK( 3 ), LIWORK-2, RESULT, - $ LOGWRK, INFO ) -* - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CDRGVX', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'CHB' ) ) THEN -* -* ------------------------------ -* CHB: Hermitian Band Reduction -* ------------------------------ -* - MAXTYP = 15 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - IF( TSTERR ) - $ CALL CERRST( 'CHB', NOUT ) -* CALL CCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, -* $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), -* $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, -* $ INFO ) - CALL CCHKHB2STG( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, - $ THRESH, NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), - $ DR( 1, 2 ), DR( 1, 3 ), DR( 1, 4 ), DR( 1, 5 ), - $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, - $ INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CCHKHB', INFO -* - ELSE IF( LSAMEN( 3, C3, 'CBB' ) ) THEN -* -* ------------------------------ -* CBB: General Band Reduction -* ------------------------------ -* - MAXTYP = 15 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - DO 370 I = 1, NPARMS - NRHS = NSVAL( I ) -* - IF( NEWSD.EQ.0 ) THEN - DO 360 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 360 CONTINUE - END IF - WRITE( NOUT, FMT = 9966 )C3, NRHS - CALL CCHKBB( NN, MVAL, NVAL, NK, KVAL, MAXTYP, DOTYPE, NRHS, - $ ISEED, THRESH, NOUT, A( 1, 1 ), NMAX, - $ A( 1, 2 ), 2*NMAX, DR( 1, 1 ), DR( 1, 2 ), - $ A( 1, 4 ), NMAX, A( 1, 5 ), NMAX, A( 1, 6 ), - $ NMAX, A( 1, 7 ), WORK, LWORK, RWORK, RESULT, - $ INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CCHKBB', INFO - 370 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'GLM' ) ) THEN -* -* ----------------------------------------- -* GLM: Generalized Linear Regression Model -* ----------------------------------------- -* - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL CERRGG( 'GLM', NOUT ) - CALL CCKGLM( NN, NVAL, MVAL, PVAL, NTYPES, ISEED, THRESH, NMAX, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), X, - $ WORK, DR( 1, 1 ), NIN, NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CCKGLM', INFO -* - ELSE IF( LSAMEN( 3, C3, 'GQR' ) ) THEN -* -* ------------------------------------------ -* GQR: Generalized QR and RQ factorizations -* ------------------------------------------ -* - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL CERRGG( 'GQR', NOUT ) - CALL CCKGQR( NN, MVAL, NN, PVAL, NN, NVAL, NTYPES, ISEED, - $ THRESH, NMAX, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), TAUA, B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ B( 1, 4 ), B( 1, 5 ), TAUB, WORK, DR( 1, 1 ), NIN, - $ NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CCKGQR', INFO -* - ELSE IF( LSAMEN( 3, C3, 'GSV' ) ) THEN -* -* ---------------------------------------------- -* GSV: Generalized Singular Value Decomposition 
-* ---------------------------------------------- -* - CALL XLAENV(1,1) - IF( TSTERR ) - $ CALL CERRGG( 'GSV', NOUT ) - CALL CCKGSV( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ A( 1, 3 ), B( 1, 3 ), A( 1, 4 ), ALPHA, BETA, - $ B( 1, 4 ), IWORK, WORK, DR( 1, 1 ), NIN, NOUT, - $ INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CCKGSV', INFO -* - ELSE IF( LSAMEN( 3, C3, 'CSD' ) ) THEN -* -* ---------------------------------------------- -* CSD: CS Decomposition -* ---------------------------------------------- -* - CALL XLAENV(1,1) - IF( TSTERR ) - $ CALL CERRGG( 'CSD', NOUT ) - CALL CCKCSD( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), - $ A( 1, 5 ), A( 1, 6 ), RWORK, IWORK, WORK, - $ DR( 1, 1 ), NIN, NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CCKCSD', INFO -* - ELSE IF( LSAMEN( 3, C3, 'LSE' ) ) THEN -* -* -------------------------------------- -* LSE: Constrained Linear Least Squares -* -------------------------------------- -* - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL CERRGG( 'LSE', NOUT ) - CALL CCKLSE( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), X, - $ WORK, DR( 1, 1 ), NIN, NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'CCKLSE', INFO - ELSE - WRITE( NOUT, FMT = * ) - WRITE( NOUT, FMT = * ) - WRITE( NOUT, FMT = 9992 )C3 - END IF - IF( .NOT.( CGX .OR. CXV ) ) - $ GO TO 190 - 380 CONTINUE - WRITE( NOUT, FMT = 9994 ) - S2 = SECOND( ) - WRITE( NOUT, FMT = 9993 )S2 - S1 -* - 9999 FORMAT( / ' Execution not attempted due to input errors' ) - 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) - 9996 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NS =', I4, - $ ', MAXB =', I4, ', IACC22 =', I4, ', NBCOL =', I4 ) - 9995 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4, - $ ', NRHS =', I4 ) - 9994 FORMAT( / / ' End of tests' ) - 9993 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) - 9992 FORMAT( 1X, A3, ': Unrecognized path name' ) - 9991 FORMAT( / / ' *** Invalid integer value in column ', I2, - $ ' of input', ' line:', / A79 ) - 9990 FORMAT( / / 1X, A3, ' routines were not tested' ) - 9989 FORMAT( ' Invalid input value: ', A, '=', I6, '; must be >=', - $ I6 ) - 9988 FORMAT( ' Invalid input value: ', A, '=', I6, '; must be <=', - $ I6 ) - 9987 FORMAT( ' Tests of the Nonsymmetric Eigenvalue Problem routines' ) - 9986 FORMAT( ' Tests of the Hermitian Eigenvalue Problem routines' ) - 9985 FORMAT( ' Tests of the Singular Value Decomposition routines' ) - 9984 FORMAT( / ' The following parameter values will be used:' ) - 9983 FORMAT( 4X, A, 10I6, / 10X, 10I6 ) - 9982 FORMAT( / ' Routines pass computational tests if test ratio is ', - $ 'less than', F8.2, / ) - 9981 FORMAT( ' Relative machine ', A, ' is taken to be', E16.6 ) - 9980 FORMAT( ' *** Error code from ', A, ' = ', I4 ) - 9979 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Driver', - $ / ' CGEEV (eigenvalues and eigevectors)' ) - 9978 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Driver', - $ / ' CGEES (Schur form)' ) - 9977 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Expert', - $ ' Driver', / ' CGEEVX (eigenvalues, eigenvectors and', - $ ' condition numbers)' ) - 9976 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Expert', - $ ' Driver', / ' CGEESX (Schur form and condition', - $ ' numbers)' ) - 9975 FORMAT( / ' Tests of the Generalized 
Nonsymmetric Eigenvalue ', - $ 'Problem routines' ) - 9974 FORMAT( ' Tests of CHBTRD', / ' (reduction of a Hermitian band ', - $ 'matrix to real tridiagonal form)' ) - 9973 FORMAT( / 1X, 71( '-' ) ) - 9972 FORMAT( / ' LAPACK VERSION ', I1, '.', I1, '.', I1 ) - 9971 FORMAT( / ' Tests of the Generalized Linear Regression Model ', - $ 'routines' ) - 9970 FORMAT( / ' Tests of the Generalized QR and RQ routines' ) - 9969 FORMAT( / ' Tests of the Generalized Singular Value', - $ ' Decomposition routines' ) - 9968 FORMAT( / ' Tests of the Linear Least Squares routines' ) - 9967 FORMAT( ' Tests of CGBBRD', / ' (reduction of a general band ', - $ 'matrix to real bidiagonal form)' ) - 9966 FORMAT( / / 1X, A3, ': NRHS =', I4 ) - 9965 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', - $ 'Problem Expert Driver CGGESX' ) - 9964 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', - $ 'Problem Driver CGGES' ) - 9963 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', - $ 'Problem Driver CGGEV' ) - 9962 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', - $ 'Problem Expert Driver CGGEVX' ) - 9961 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4, - $ ', INMIN=', I4, - $ ', INWIN =', I4, ', INIBL =', I4, ', ISHFTS =', I4, - $ ', IACC22 =', I4) - 9960 FORMAT( / ' Tests of the CS Decomposition routines' ) -* -* End of CCHKEE -* - END From ee16efff3cd5a4ee7b6c0efcc263964f1304a3a8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Feb 2021 18:46:38 +0100 Subject: [PATCH 123/681] Delete dchkee.f --- lapack-netlib/TESTING/EIG/dchkee.f | 2507 ---------------------------- 1 file changed, 2507 deletions(-) delete mode 100644 lapack-netlib/TESTING/EIG/dchkee.f diff --git a/lapack-netlib/TESTING/EIG/dchkee.f b/lapack-netlib/TESTING/EIG/dchkee.f deleted file mode 100644 index dc6f3205a..000000000 --- a/lapack-netlib/TESTING/EIG/dchkee.f +++ /dev/null @@ -1,2507 +0,0 @@ -*> \brief \b DCHKEE -* -* =========== DOCUMENTATION =========== -* -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -* Definition: -* =========== -* -* PROGRAM DCHKEE -* -* -*> \par Purpose: -* ============= -*> -*> \verbatim -*> -*> DCHKEE tests the DOUBLE PRECISION LAPACK subroutines for the matrix -*> eigenvalue problem. 
The test paths in this version are -*> -*> NEP (Nonsymmetric Eigenvalue Problem): -*> Test DGEHRD, DORGHR, DHSEQR, DTREVC, DHSEIN, and DORMHR -*> -*> SEP (Symmetric Eigenvalue Problem): -*> Test DSYTRD, DORGTR, DSTEQR, DSTERF, DSTEIN, DSTEDC, -*> and drivers DSYEV(X), DSBEV(X), DSPEV(X), DSTEV(X), -*> DSYEVD, DSBEVD, DSPEVD, DSTEVD -*> -*> SVD (Singular Value Decomposition): -*> Test DGEBRD, DORGBR, DBDSQR, DBDSDC -*> and the drivers DGESVD, DGESDD -*> -*> DEV (Nonsymmetric Eigenvalue/eigenvector Driver): -*> Test DGEEV -*> -*> DES (Nonsymmetric Schur form Driver): -*> Test DGEES -*> -*> DVX (Nonsymmetric Eigenvalue/eigenvector Expert Driver): -*> Test DGEEVX -*> -*> DSX (Nonsymmetric Schur form Expert Driver): -*> Test DGEESX -*> -*> DGG (Generalized Nonsymmetric Eigenvalue Problem): -*> Test DGGHD3, DGGBAL, DGGBAK, DHGEQZ, and DTGEVC -*> -*> DGS (Generalized Nonsymmetric Schur form Driver): -*> Test DGGES -*> -*> DGV (Generalized Nonsymmetric Eigenvalue/eigenvector Driver): -*> Test DGGEV -*> -*> DGX (Generalized Nonsymmetric Schur form Expert Driver): -*> Test DGGESX -*> -*> DXV (Generalized Nonsymmetric Eigenvalue/eigenvector Expert Driver): -*> Test DGGEVX -*> -*> DSG (Symmetric Generalized Eigenvalue Problem): -*> Test DSYGST, DSYGV, DSYGVD, DSYGVX, DSPGST, DSPGV, DSPGVD, -*> DSPGVX, DSBGST, DSBGV, DSBGVD, and DSBGVX -*> -*> DSB (Symmetric Band Eigenvalue Problem): -*> Test DSBTRD -*> -*> DBB (Band Singular Value Decomposition): -*> Test DGBBRD -*> -*> DEC (Eigencondition estimation): -*> Test DLALN2, DLASY2, DLAEQU, DLAEXC, DTRSYL, DTREXC, DTRSNA, -*> DTRSEN, and DLAQTR -*> -*> DBL (Balancing a general matrix) -*> Test DGEBAL -*> -*> DBK (Back transformation on a balanced matrix) -*> Test DGEBAK -*> -*> DGL (Balancing a matrix pair) -*> Test DGGBAL -*> -*> DGK (Back transformation on a matrix pair) -*> Test DGGBAK -*> -*> GLM (Generalized Linear Regression Model): -*> Tests DGGGLM -*> -*> GQR (Generalized QR and RQ factorizations): -*> Tests DGGQRF and DGGRQF -*> -*> GSV (Generalized Singular Value Decomposition): -*> Tests DGGSVD, DGGSVP, DTGSJA, DLAGS2, DLAPLL, and DLAPMT -*> -*> CSD (CS decomposition): -*> Tests DORCSD -*> -*> LSE (Constrained Linear Least Squares): -*> Tests DGGLSE -*> -*> Each test path has a different set of inputs, but the data sets for -*> the driver routines xEV, xES, xVX, and xSX can be concatenated in a -*> single input file. The first line of input should contain one of the -*> 3-character path names in columns 1-3. The number of remaining lines -*> depends on what is found on the first line. -*> -*> The number of matrix types used in testing is often controllable from -*> the input file. The number of matrix types for each path, and the -*> test routine that describes them, is as follows: -*> -*> Path name(s) Types Test routine -*> -*> DHS or NEP 21 DCHKHS -*> DST or SEP 21 DCHKST (routines) -*> 18 DDRVST (drivers) -*> DBD or SVD 16 DCHKBD (routines) -*> 5 DDRVBD (drivers) -*> DEV 21 DDRVEV -*> DES 21 DDRVES -*> DVX 21 DDRVVX -*> DSX 21 DDRVSX -*> DGG 26 DCHKGG (routines) -*> DGS 26 DDRGES -*> DGX 5 DDRGSX -*> DGV 26 DDRGEV -*> DXV 2 DDRGVX -*> DSG 21 DDRVSG -*> DSB 15 DCHKSB -*> DBB 15 DCHKBB -*> DEC - DCHKEC -*> DBL - DCHKBL -*> DBK - DCHKBK -*> DGL - DCHKGL -*> DGK - DCHKGK -*> GLM 8 DCKGLM -*> GQR 8 DCKGQR -*> GSV 8 DCKGSV -*> CSD 3 DCKCSD -*> LSE 8 DCKLSE -*> -*>----------------------------------------------------------------------- -*> -*> NEP input file: -*> -*> line 2: NN, INTEGER -*> Number of values of N. 
-*> -*> line 3: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix dimension N. -*> -*> line 4: NPARMS, INTEGER -*> Number of values of the parameters NB, NBMIN, NX, NS, and -*> MAXB. -*> -*> line 5: NBVAL, INTEGER array, dimension (NPARMS) -*> The values for the blocksize NB. -*> -*> line 6: NBMIN, INTEGER array, dimension (NPARMS) -*> The values for the minimum blocksize NBMIN. -*> -*> line 7: NXVAL, INTEGER array, dimension (NPARMS) -*> The values for the crossover point NX. -*> -*> line 8: INMIN, INTEGER array, dimension (NPARMS) -*> LAHQR vs TTQRE crossover point, >= 11 -*> -*> line 9: INWIN, INTEGER array, dimension (NPARMS) -*> recommended deflation window size -*> -*> line 10: INIBL, INTEGER array, dimension (NPARMS) -*> nibble crossover point -*> -*> line 11: ISHFTS, INTEGER array, dimension (NPARMS) -*> number of simultaneous shifts) -*> -*> line 12: IACC22, INTEGER array, dimension (NPARMS) -*> select structured matrix multiply: 0, 1 or 2) -*> -*> line 13: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. To have all of the test -*> ratios printed, use THRESH = 0.0 . -*> -*> line 14: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 14 was 2: -*> -*> line 15: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 15-EOF: The remaining lines occur in sets of 1 or 2 and allow -*> the user to specify the matrix types. Each line contains -*> a 3-character path name in columns 1-3, and the number -*> of matrix types must be the first nonblank item in columns -*> 4-80. If the number of matrix types is at least 1 but is -*> less than the maximum number of possible types, a second -*> line will be read to get the numbers of the matrix types to -*> be used. For example, -*> NEP 21 -*> requests all of the matrix types for the nonsymmetric -*> eigenvalue problem, while -*> NEP 4 -*> 9 10 11 12 -*> requests only matrices of type 9, 10, 11, and 12. -*> -*> The valid 3-character path names are 'NEP' or 'SHS' for the -*> nonsymmetric eigenvalue routines. -*> -*>----------------------------------------------------------------------- -*> -*> SEP or DSG input file: -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix dimension N. -*> -*> line 4: NPARMS, INTEGER -*> Number of values of the parameters NB, NBMIN, and NX. -*> -*> line 5: NBVAL, INTEGER array, dimension (NPARMS) -*> The values for the blocksize NB. -*> -*> line 6: NBMIN, INTEGER array, dimension (NPARMS) -*> The values for the minimum blocksize NBMIN. -*> -*> line 7: NXVAL, INTEGER array, dimension (NPARMS) -*> The values for the crossover point NX. -*> -*> line 8: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 9: TSTCHK, LOGICAL -*> Flag indicating whether or not to test the LAPACK routines. -*> -*> line 10: TSTDRV, LOGICAL -*> Flag indicating whether or not to test the driver routines. 
-*> -*> line 11: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 12: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 12 was 2: -*> -*> line 13: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 13-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path names are 'SEP' or 'SST' for the -*> symmetric eigenvalue routines and driver routines, and -*> 'DSG' for the routines for the symmetric generalized -*> eigenvalue problem. -*> -*>----------------------------------------------------------------------- -*> -*> SVD input file: -*> -*> line 2: NN, INTEGER -*> Number of values of M and N. -*> -*> line 3: MVAL, INTEGER array, dimension (NN) -*> The values for the matrix row dimension M. -*> -*> line 4: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix column dimension N. -*> -*> line 5: NPARMS, INTEGER -*> Number of values of the parameter NB, NBMIN, NX, and NRHS. -*> -*> line 6: NBVAL, INTEGER array, dimension (NPARMS) -*> The values for the blocksize NB. -*> -*> line 7: NBMIN, INTEGER array, dimension (NPARMS) -*> The values for the minimum blocksize NBMIN. -*> -*> line 8: NXVAL, INTEGER array, dimension (NPARMS) -*> The values for the crossover point NX. -*> -*> line 9: NSVAL, INTEGER array, dimension (NPARMS) -*> The values for the number of right hand sides NRHS. -*> -*> line 10: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 11: TSTCHK, LOGICAL -*> Flag indicating whether or not to test the LAPACK routines. -*> -*> line 12: TSTDRV, LOGICAL -*> Flag indicating whether or not to test the driver routines. -*> -*> line 13: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 14: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 14 was 2: -*> -*> line 15: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 15-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path names are 'SVD' or 'SBD' for both the -*> SVD routines and the SVD driver routines. -*> -*>----------------------------------------------------------------------- -*> -*> DEV and DES data files: -*> -*> line 1: 'DEV' or 'DES' in columns 1 to 3. -*> -*> line 2: NSIZES, INTEGER -*> Number of sizes of matrices to use. Should be at least 0 -*> and at most 20. If NSIZES = 0, no testing is done -*> (although the remaining 3 lines are still read). -*> -*> line 3: NN, INTEGER array, dimension(NSIZES) -*> Dimensions of matrices to be tested. 
-*> -*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> These integer parameters determine how blocking is done -*> (see ILAENV for details) -*> NB : block size -*> NBMIN : minimum block size -*> NX : minimum dimension for blocking -*> NS : number of shifts in xHSEQR -*> NBCOL : minimum column dimension for blocking -*> -*> line 5: THRESH, REAL -*> The test threshold against which computed residuals are -*> compared. Should generally be in the range from 10. to 20. -*> If it is 0., all test case data will be printed. -*> -*> line 6: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits. -*> -*> line 7: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 7 was 2: -*> -*> line 8: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9 and following: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'DEV' to test SGEEV, or -*> 'DES' to test SGEES. -*> -*>----------------------------------------------------------------------- -*> -*> The DVX data has two parts. The first part is identical to DEV, -*> and the second part consists of test matrices with precomputed -*> solutions. -*> -*> line 1: 'DVX' in columns 1-3. -*> -*> line 2: NSIZES, INTEGER -*> If NSIZES = 0, no testing of randomly generated examples -*> is done, but any precomputed examples are tested. -*> -*> line 3: NN, INTEGER array, dimension(NSIZES) -*> -*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> -*> line 5: THRESH, REAL -*> -*> line 6: TSTERR, LOGICAL -*> -*> line 7: NEWSD, INTEGER -*> -*> If line 7 was 2: -*> -*> line 8: INTEGER array, dimension (4) -*> -*> lines 9 and following: The first line contains 'DVX' in columns 1-3 -*> followed by the number of matrix types, possibly with -*> a second line to specify certain matrix types. -*> If the number of matrix types = 0, no testing of randomly -*> generated examples is done, but any precomputed examples -*> are tested. -*> -*> remaining lines : Each matrix is stored on 1+2*N lines, where N is -*> its dimension. The first line contains the dimension (a -*> single integer). The next N lines contain the matrix, one -*> row per line. The last N lines correspond to each -*> eigenvalue. Each of these last N lines contains 4 real -*> values: the real part of the eigenvalue, the imaginary -*> part of the eigenvalue, the reciprocal condition number of -*> the eigenvalues, and the reciprocal condition number of the -*> eigenvector. The end of data is indicated by dimension N=0. -*> Even if no data is to be tested, there must be at least one -*> line containing N=0. -*> -*>----------------------------------------------------------------------- -*> -*> The DSX data is like DVX. The first part is identical to DEV, and the -*> second part consists of test matrices with precomputed solutions. -*> -*> line 1: 'DSX' in columns 1-3. -*> -*> line 2: NSIZES, INTEGER -*> If NSIZES = 0, no testing of randomly generated examples -*> is done, but any precomputed examples are tested. 
-*> -*> line 3: NN, INTEGER array, dimension(NSIZES) -*> -*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> -*> line 5: THRESH, REAL -*> -*> line 6: TSTERR, LOGICAL -*> -*> line 7: NEWSD, INTEGER -*> -*> If line 7 was 2: -*> -*> line 8: INTEGER array, dimension (4) -*> -*> lines 9 and following: The first line contains 'DSX' in columns 1-3 -*> followed by the number of matrix types, possibly with -*> a second line to specify certain matrix types. -*> If the number of matrix types = 0, no testing of randomly -*> generated examples is done, but any precomputed examples -*> are tested. -*> -*> remaining lines : Each matrix is stored on 3+N lines, where N is its -*> dimension. The first line contains the dimension N and the -*> dimension M of an invariant subspace. The second line -*> contains M integers, identifying the eigenvalues in the -*> invariant subspace (by their position in a list of -*> eigenvalues ordered by increasing real part). The next N -*> lines contain the matrix. The last line contains the -*> reciprocal condition number for the average of the selected -*> eigenvalues, and the reciprocal condition number for the -*> corresponding right invariant subspace. The end of data is -*> indicated by a line containing N=0 and M=0. Even if no data -*> is to be tested, there must be at least one line containing -*> N=0 and M=0. -*> -*>----------------------------------------------------------------------- -*> -*> DGG input file: -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix dimension N. -*> -*> line 4: NPARMS, INTEGER -*> Number of values of the parameters NB, NBMIN, NS, MAXB, and -*> NBCOL. -*> -*> line 5: NBVAL, INTEGER array, dimension (NPARMS) -*> The values for the blocksize NB. -*> -*> line 6: NBMIN, INTEGER array, dimension (NPARMS) -*> The values for NBMIN, the minimum row dimension for blocks. -*> -*> line 7: NSVAL, INTEGER array, dimension (NPARMS) -*> The values for the number of shifts. -*> -*> line 8: MXBVAL, INTEGER array, dimension (NPARMS) -*> The values for MAXB, used in determining minimum blocksize. -*> -*> line 9: IACC22, INTEGER array, dimension (NPARMS) -*> select structured matrix multiply: 1 or 2) -*> -*> line 10: NBCOL, INTEGER array, dimension (NPARMS) -*> The values for NBCOL, the minimum column dimension for -*> blocks. -*> -*> line 11: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 12: TSTCHK, LOGICAL -*> Flag indicating whether or not to test the LAPACK routines. -*> -*> line 13: TSTDRV, LOGICAL -*> Flag indicating whether or not to test the driver routines. -*> -*> line 14: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 15: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 15 was 2: -*> -*> line 16: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 17-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'DGG' for the generalized -*> eigenvalue problem routines and driver routines. 
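*>
*> As a hedged illustration only (the parameter values below are
*> invented for this sketch and are not taken from the distributed
*> LAPACK test data), a minimal DGG input file following the layout
*> described above might look like:
*>
*>    DGG
*>    2
*>    10 20
*>    1
*>    1
*>    1
*>    2
*>    1
*>    1
*>    1
*>    20.0
*>    T
*>    T
*>    T
*>    0
*>    DGG 26
*>
*> Here NN = 2 with N = 10 and 20; NPARMS = 1 with NB = NBMIN = 1,
*> NS = 2, MAXB = 1, IACC22 = 1, and NBCOL = 1; THRESH = 20.0; the
*> three test flags TSTCHK, TSTDRV, and TSTERR are all enabled;
*> NEWSD = 0, so no seed line is read; and the final line requests
*> all 26 DGG matrix types, as for NEP.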
-*> -*>----------------------------------------------------------------------- -*> -*> DGS and DGV input files: -*> -*> line 1: 'DGS' or 'DGV' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension(NN) -*> Dimensions of matrices to be tested. -*> -*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> These integer parameters determine how blocking is done -*> (see ILAENV for details) -*> NB : block size -*> NBMIN : minimum block size -*> NX : minimum dimension for blocking -*> NS : number of shifts in xHGEQR -*> NBCOL : minimum column dimension for blocking -*> -*> line 5: THRESH, REAL -*> The test threshold against which computed residuals are -*> compared. Should generally be in the range from 10. to 20. -*> If it is 0., all test case data will be printed. -*> -*> line 6: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits. -*> -*> line 7: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 17 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 7-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'DGS' for the generalized -*> eigenvalue problem routines and driver routines. -*> -*>----------------------------------------------------------------------- -*> -*> DXV input files: -*> -*> line 1: 'DXV' in columns 1 to 3. -*> -*> line 2: N, INTEGER -*> Value of N. -*> -*> line 3: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> These integer parameters determine how blocking is done -*> (see ILAENV for details) -*> NB : block size -*> NBMIN : minimum block size -*> NX : minimum dimension for blocking -*> NS : number of shifts in xHGEQR -*> NBCOL : minimum column dimension for blocking -*> -*> line 4: THRESH, REAL -*> The test threshold against which computed residuals are -*> compared. Should generally be in the range from 10. to 20. -*> Information will be printed about each test for which the -*> test ratio is greater than or equal to the threshold. -*> -*> line 5: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 6: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 6 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> If line 2 was 0: -*> -*> line 7-EOF: Precomputed examples are tested. -*> -*> remaining lines : Each example is stored on 3+2*N lines, where N is -*> its dimension. The first line contains the dimension (a -*> single integer). The next N lines contain the matrix A, one -*> row per line. The next N lines contain the matrix B. The -*> next line contains the reciprocals of the eigenvalue -*> condition numbers. The last line contains the reciprocals of -*> the eigenvector condition numbers. The end of data is -*> indicated by dimension N=0. Even if no data is to be tested, -*> there must be at least one line containing N=0. 
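*>
*> For illustration, a hypothetical DXV data file containing a single
*> precomputed 2-by-2 example (the numeric values below are made up
*> purely to show the shape of the data, not meaningful test data)
*> could read:
*>
*>    DXV
*>    0
*>    1 1 1 2 1
*>    20.0
*>    T
*>    0
*>    2
*>    1.0 0.0
*>    0.0 2.0
*>    1.0 0.0
*>    0.0 1.0
*>    1.0 1.0
*>    1.0 1.0
*>    0
*>
*> Because line 2 is 0, only the precomputed example is tested. The
*> example occupies 3+2*N = 7 lines: the dimension, the N rows of A,
*> the N rows of B, the reciprocal eigenvalue condition numbers, and
*> the reciprocal eigenvector condition numbers. The final line
*> containing N=0 marks the end of the data.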
-*> -*>----------------------------------------------------------------------- -*> -*> DGX input files: -*> -*> line 1: 'DGX' in columns 1 to 3. -*> -*> line 2: N, INTEGER -*> Value of N. -*> -*> line 3: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> These integer parameters determine how blocking is done -*> (see ILAENV for details) -*> NB : block size -*> NBMIN : minimum block size -*> NX : minimum dimension for blocking -*> NS : number of shifts in xHGEQR -*> NBCOL : minimum column dimension for blocking -*> -*> line 4: THRESH, REAL -*> The test threshold against which computed residuals are -*> compared. Should generally be in the range from 10. to 20. -*> Information will be printed about each test for which the -*> test ratio is greater than or equal to the threshold. -*> -*> line 5: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 6: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 6 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> If line 2 was 0: -*> -*> line 7-EOF: Precomputed examples are tested. -*> -*> remaining lines : Each example is stored on 3+2*N lines, where N is -*> its dimension. The first line contains the dimension (a -*> single integer). The next line contains an integer k such -*> that only the last k eigenvalues will be selected and appear -*> in the leading diagonal blocks of $A$ and $B$. The next N -*> lines contain the matrix A, one row per line. The next N -*> lines contain the matrix B. The last line contains the -*> reciprocal of the eigenvalue cluster condition number and the -*> reciprocal of the deflating subspace (associated with the -*> selected eigencluster) condition number. The end of data is -*> indicated by dimension N=0. Even if no data is to be tested, -*> there must be at least one line containing N=0. -*> -*>----------------------------------------------------------------------- -*> -*> DSB input file: -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix dimension N. -*> -*> line 4: NK, INTEGER -*> Number of values of K. -*> -*> line 5: KVAL, INTEGER array, dimension (NK) -*> The values for the matrix dimension K. -*> -*> line 6: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 7 was 2: -*> -*> line 8: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 8-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'DSB'. -*> -*>----------------------------------------------------------------------- -*> -*> DBB input file: -*> -*> line 2: NN, INTEGER -*> Number of values of M and N. -*> -*> line 3: MVAL, INTEGER array, dimension (NN) -*> The values for the matrix row dimension M. 
-*> -*> line 4: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix column dimension N. -*> -*> line 5: NK, INTEGER -*> Number of values of K. -*> -*> line 6: KVAL, INTEGER array, dimension (NK) -*> The values for the matrix bandwidth K. -*> -*> line 7: NPARMS, INTEGER -*> Number of values of the parameter NRHS -*> -*> line 8: NSVAL, INTEGER array, dimension (NPARMS) -*> The values for the number of right hand sides NRHS. -*> -*> line 9: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 10: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 10 was 2: -*> -*> line 11: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 11-EOF: Lines specifying matrix types, as for SVD. -*> The 3-character path name is 'DBB'. -*> -*>----------------------------------------------------------------------- -*> -*> DEC input file: -*> -*> line 2: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> lines 3-EOF: -*> -*> Input for testing the eigencondition routines consists of a set of -*> specially constructed test cases and their solutions. The data -*> format is not intended to be modified by the user. -*> -*>----------------------------------------------------------------------- -*> -*> DBL and DBK input files: -*> -*> line 1: 'DBL' in columns 1-3 to test DGEBAL, or 'DBK' in -*> columns 1-3 to test DGEBAK. -*> -*> The remaining lines consist of specially constructed test cases. -*> -*>----------------------------------------------------------------------- -*> -*> DGL and DGK input files: -*> -*> line 1: 'DGL' in columns 1-3 to test DGGBAL, or 'DGK' in -*> columns 1-3 to test DGGBAK. -*> -*> The remaining lines consist of specially constructed test cases. -*> -*>----------------------------------------------------------------------- -*> -*> GLM data file: -*> -*> line 1: 'GLM' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NN) -*> Values of M (row dimension). -*> -*> line 4: PVAL, INTEGER array, dimension(NN) -*> Values of P (row dimension). -*> -*> line 5: NVAL, INTEGER array, dimension(NN) -*> Values of N (column dimension), note M <= N <= M+P. -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP.
-*> The 3-character path name is 'GLM' for the generalized -*> linear regression model routines. -*> -*>----------------------------------------------------------------------- -*> -*> GQR data file: -*> -*> line 1: 'GQR' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NN) -*> Values of M. -*> -*> line 4: PVAL, INTEGER array, dimension(NN) -*> Values of P. -*> -*> line 5: NVAL, INTEGER array, dimension(NN) -*> Values of N. -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'GQR' for the generalized -*> QR and RQ routines. -*> -*>----------------------------------------------------------------------- -*> -*> GSV data file: -*> -*> line 1: 'GSV' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NN) -*> Values of M (row dimension). -*> -*> line 4: PVAL, INTEGER array, dimension(NN) -*> Values of P (row dimension). -*> -*> line 5: NVAL, INTEGER array, dimension(NN) -*> Values of N (column dimension). -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'GSV' for the generalized -*> SVD routines. -*> -*>----------------------------------------------------------------------- -*> -*> CSD data file: -*> -*> line 1: 'CSD' in columns 1 to 3. -*> -*> line 2: NM, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NM) -*> Values of M (row and column dimension of orthogonal matrix). -*> -*> line 4: PVAL, INTEGER array, dimension(NM) -*> Values of P (row dimension of top-left block). -*> -*> line 5: NVAL, INTEGER array, dimension(NM) -*> Values of N (column dimension of top-left block). -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. 
-*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'CSD' for the CSD routine. -*> -*>----------------------------------------------------------------------- -*> -*> LSE data file: -*> -*> line 1: 'LSE' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NN) -*> Values of M. -*> -*> line 4: PVAL, INTEGER array, dimension(NN) -*> Values of P. -*> -*> line 5: NVAL, INTEGER array, dimension(NN) -*> Values of N, note P <= N <= P+M. -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'LSE' for the constrained -*> linear least squares routines. -*> -*>----------------------------------------------------------------------- -*> -*> NMAX is currently set to 132 and must be at least 12 for some of the -*> precomputed examples, and LWORK = NMAX*(5*NMAX+5)+1 in the parameter -*> statements below. For SVD, we assume NRHS may be as big as N. The -*> parameter NEED is set to 14 to allow for 14 N-by-N matrices for DGG. -*> \endverbatim -* -* Arguments: -* ========== -* -* -* Authors: -* ======== -* -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. -* -*> \date June 2016 -* -*> \ingroup double_eig -* -* ===================================================================== - PROGRAM DCHKEE -* -* -- LAPACK test routine (version 3.7.0) -- -* -- LAPACK is a software package provided by Univ. of Tennessee, -- -* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* June 2016 -* -* ===================================================================== -* -* .. Parameters .. - INTEGER NMAX - PARAMETER ( NMAX = 132 ) - INTEGER NCMAX - PARAMETER ( NCMAX = 20 ) - INTEGER NEED - PARAMETER ( NEED = 14 ) - INTEGER LWORK - PARAMETER ( LWORK = NMAX*( 5*NMAX+5 )+1 ) - INTEGER LIWORK - PARAMETER ( LIWORK = NMAX*( 5*NMAX+20 ) ) - INTEGER MAXIN - PARAMETER ( MAXIN = 20 ) - INTEGER MAXT - PARAMETER ( MAXT = 30 ) - INTEGER NIN, NOUT - PARAMETER ( NIN = 5, NOUT = 6 ) -* .. -* .. Local Scalars ..
- LOGICAL CSD, DBB, DGG, DSB, FATAL, GLM, GQR, GSV, LSE, - $ NEP, DBK, DBL, SEP, DES, DEV, DGK, DGL, DGS, - $ DGV, DGX, DSX, SVD, DVX, DXV, TSTCHK, TSTDIF, - $ TSTDRV, TSTERR - CHARACTER C1 - CHARACTER*3 C3, PATH - CHARACTER*32 VNAME - CHARACTER*10 INTSTR - CHARACTER*80 LINE - INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, - $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN -* .. -* .. Local Arrays .. - LOGICAL DOTYPE( MAXT ), LOGWRK( NMAX ) - INTEGER IOLDSD( 4 ), ISEED( 4 ), IWORK( LIWORK ), - $ KVAL( MAXIN ), MVAL( MAXIN ), MXBVAL( MAXIN ), - $ NBCOL( MAXIN ), NBMIN( MAXIN ), NBVAL( MAXIN ), - $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), - $ PVAL( MAXIN ) - INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), - $ ISHFTS( MAXIN ), IACC22( MAXIN ) - DOUBLE PRECISION A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ), - $ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ), - $ WORK( LWORK ), X( 5*NMAX ) -* .. -* .. External Functions .. - LOGICAL LSAMEN - DOUBLE PRECISION DLAMCH, DSECND - EXTERNAL LSAMEN, DLAMCH, DSECND -* .. -* .. External Subroutines .. - EXTERNAL ALAREQ, DCHKBB, DCHKBD, DCHKBK, DCHKBL, DCHKEC, - $ DCHKGG, DCHKGK, DCHKGL, DCHKHS, DCHKSB, DCHKST, - $ DCKCSD, DCKGLM, DCKGQR, DCKGSV, DCKLSE, DDRGES, - $ DDRGEV, DDRGSX, DDRGVX, DDRVBD, DDRVES, DDRVEV, - $ DDRVSG, DDRVST, DDRVSX, DDRVVX, DERRBD, - $ DERRED, DERRGG, DERRHS, DERRST, ILAVER, XLAENV, - $ DDRGES3, DDRGEV3, - $ DCHKST2STG, DDRVST2STG, DCHKSB2STG, DDRVSG2STG -* .. -* .. Intrinsic Functions .. - INTRINSIC LEN, MIN -* .. -* .. Scalars in Common .. - LOGICAL LERR, OK - CHARACTER*32 SRNAMT - INTEGER INFOT, MAXB, NPROC, NSHIFT, NUNIT, SELDIM, - $ SELOPT -* .. -* .. Arrays in Common .. - LOGICAL SELVAL( 20 ) - INTEGER IPARMS( 100 ) - DOUBLE PRECISION SELWI( 20 ), SELWR( 20 ) -* .. -* .. Common blocks .. - COMMON / CENVIR / NPROC, NSHIFT, MAXB - COMMON / INFOC / INFOT, NUNIT, OK, LERR - COMMON / SRNAMC / SRNAMT - COMMON / SSLCT / SELOPT, SELDIM, SELVAL, SELWR, SELWI - COMMON / CLAENV / IPARMS -* .. -* .. Data statements .. - DATA INTSTR / '0123456789' / - DATA IOLDSD / 0, 0, 0, 1 / -* .. -* .. Executable Statements .. -* - A = 0.0 - B = 0.0 - C = 0.0 - D = 0.0 - S1 = DSECND( ) - FATAL = .FALSE. - NUNIT = NOUT -* -* Return to here to read multiple sets of data -* - 10 CONTINUE -* -* Read the first line and set the 3-character test path -* - READ( NIN, FMT = '(A80)', END = 380 )LINE - PATH = LINE( 1: 3 ) - NEP = LSAMEN( 3, PATH, 'NEP' ) .OR. LSAMEN( 3, PATH, 'DHS' ) - SEP = LSAMEN( 3, PATH, 'SEP' ) .OR. LSAMEN( 3, PATH, 'DST' ) .OR. - $ LSAMEN( 3, PATH, 'DSG' ) .OR. LSAMEN( 3, PATH, 'SE2' ) - SVD = LSAMEN( 3, PATH, 'SVD' ) .OR. LSAMEN( 3, PATH, 'DBD' ) - DEV = LSAMEN( 3, PATH, 'DEV' ) - DES = LSAMEN( 3, PATH, 'DES' ) - DVX = LSAMEN( 3, PATH, 'DVX' ) - DSX = LSAMEN( 3, PATH, 'DSX' ) - DGG = LSAMEN( 3, PATH, 'DGG' ) - DGS = LSAMEN( 3, PATH, 'DGS' ) - DGX = LSAMEN( 3, PATH, 'DGX' ) - DGV = LSAMEN( 3, PATH, 'DGV' ) - DXV = LSAMEN( 3, PATH, 'DXV' ) - DSB = LSAMEN( 3, PATH, 'DSB' ) - DBB = LSAMEN( 3, PATH, 'DBB' ) - GLM = LSAMEN( 3, PATH, 'GLM' ) - GQR = LSAMEN( 3, PATH, 'GQR' ) .OR. LSAMEN( 3, PATH, 'GRQ' ) - GSV = LSAMEN( 3, PATH, 'GSV' ) - CSD = LSAMEN( 3, PATH, 'CSD' ) - LSE = LSAMEN( 3, PATH, 'LSE' ) - DBL = LSAMEN( 3, PATH, 'DBL' ) - DBK = LSAMEN( 3, PATH, 'DBK' ) - DGL = LSAMEN( 3, PATH, 'DGL' ) - DGK = LSAMEN( 3, PATH, 'DGK' ) -* -* Report values of parameters. -* - IF( PATH.EQ.' 
' ) THEN - GO TO 10 - ELSE IF( NEP ) THEN - WRITE( NOUT, FMT = 9987 ) - ELSE IF( SEP ) THEN - WRITE( NOUT, FMT = 9986 ) - ELSE IF( SVD ) THEN - WRITE( NOUT, FMT = 9985 ) - ELSE IF( DEV ) THEN - WRITE( NOUT, FMT = 9979 ) - ELSE IF( DES ) THEN - WRITE( NOUT, FMT = 9978 ) - ELSE IF( DVX ) THEN - WRITE( NOUT, FMT = 9977 ) - ELSE IF( DSX ) THEN - WRITE( NOUT, FMT = 9976 ) - ELSE IF( DGG ) THEN - WRITE( NOUT, FMT = 9975 ) - ELSE IF( DGS ) THEN - WRITE( NOUT, FMT = 9964 ) - ELSE IF( DGX ) THEN - WRITE( NOUT, FMT = 9965 ) - ELSE IF( DGV ) THEN - WRITE( NOUT, FMT = 9963 ) - ELSE IF( DXV ) THEN - WRITE( NOUT, FMT = 9962 ) - ELSE IF( DSB ) THEN - WRITE( NOUT, FMT = 9974 ) - ELSE IF( DBB ) THEN - WRITE( NOUT, FMT = 9967 ) - ELSE IF( GLM ) THEN - WRITE( NOUT, FMT = 9971 ) - ELSE IF( GQR ) THEN - WRITE( NOUT, FMT = 9970 ) - ELSE IF( GSV ) THEN - WRITE( NOUT, FMT = 9969 ) - ELSE IF( CSD ) THEN - WRITE( NOUT, FMT = 9960 ) - ELSE IF( LSE ) THEN - WRITE( NOUT, FMT = 9968 ) - ELSE IF( DBL ) THEN -* -* DGEBAL: Balancing -* - CALL DCHKBL( NIN, NOUT ) - GO TO 10 - ELSE IF( DBK ) THEN -* -* DGEBAK: Back transformation -* - CALL DCHKBK( NIN, NOUT ) - GO TO 10 - ELSE IF( DGL ) THEN -* -* DGGBAL: Balancing -* - CALL DCHKGL( NIN, NOUT ) - GO TO 10 - ELSE IF( DGK ) THEN -* -* DGGBAK: Back transformation -* - CALL DCHKGK( NIN, NOUT ) - GO TO 10 - ELSE IF( LSAMEN( 3, PATH, 'DEC' ) ) THEN -* -* DEC: Eigencondition estimation -* - READ( NIN, FMT = * )THRESH - CALL XLAENV( 1, 1 ) - CALL XLAENV( 12, 11 ) - CALL XLAENV( 13, 2 ) - CALL XLAENV( 14, 0 ) - CALL XLAENV( 15, 2 ) - CALL XLAENV( 16, 2 ) - TSTERR = .TRUE. - CALL DCHKEC( THRESH, TSTERR, NIN, NOUT ) - GO TO 10 - ELSE - WRITE( NOUT, FMT = 9992 )PATH - GO TO 10 - END IF - CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) - WRITE( NOUT, FMT = 9972 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH - WRITE( NOUT, FMT = 9984 ) -* -* Read the number of values of M, P, and N. -* - READ( NIN, FMT = * )NN - IF( NN.LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' NN ', NN, 1 - NN = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9988 )' NN ', NN, MAXIN - NN = 0 - FATAL = .TRUE. - END IF -* -* Read the values of M -* - IF( .NOT.( DGX .OR. DXV ) ) THEN - READ( NIN, FMT = * )( MVAL( I ), I = 1, NN ) - IF( SVD ) THEN - VNAME = ' M ' - ELSE - VNAME = ' N ' - END IF - DO 20 I = 1, NN - IF( MVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )VNAME, MVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( MVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )VNAME, MVAL( I ), NMAX - FATAL = .TRUE. - END IF - 20 CONTINUE - WRITE( NOUT, FMT = 9983 )'M: ', ( MVAL( I ), I = 1, NN ) - END IF -* -* Read the values of P -* - IF( GLM .OR. GQR .OR. GSV .OR. CSD .OR. LSE ) THEN - READ( NIN, FMT = * )( PVAL( I ), I = 1, NN ) - DO 30 I = 1, NN - IF( PVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' P ', PVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( PVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' P ', PVAL( I ), NMAX - FATAL = .TRUE. - END IF - 30 CONTINUE - WRITE( NOUT, FMT = 9983 )'P: ', ( PVAL( I ), I = 1, NN ) - END IF -* -* Read the values of N -* - IF( SVD .OR. DBB .OR. GLM .OR. GQR .OR. GSV .OR. CSD .OR. - $ LSE ) THEN - READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) - DO 40 I = 1, NN - IF( NVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' N ', NVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' N ', NVAL( I ), NMAX - FATAL = .TRUE. - END IF - 40 CONTINUE - ELSE - DO 50 I = 1, NN - NVAL( I ) = MVAL( I ) - 50 CONTINUE - END IF - IF( .NOT.( DGX .OR. 
DXV ) ) THEN - WRITE( NOUT, FMT = 9983 )'N: ', ( NVAL( I ), I = 1, NN ) - ELSE - WRITE( NOUT, FMT = 9983 )'N: ', NN - END IF -* -* Read the number of values of K, followed by the values of K -* - IF( DSB .OR. DBB ) THEN - READ( NIN, FMT = * )NK - READ( NIN, FMT = * )( KVAL( I ), I = 1, NK ) - DO 60 I = 1, NK - IF( KVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' K ', KVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( KVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' K ', KVAL( I ), NMAX - FATAL = .TRUE. - END IF - 60 CONTINUE - WRITE( NOUT, FMT = 9983 )'K: ', ( KVAL( I ), I = 1, NK ) - END IF -* - IF( DEV .OR. DES .OR. DVX .OR. DSX ) THEN -* -* For the nonsymmetric QR driver routines, only one set of -* parameters is allowed. -* - READ( NIN, FMT = * )NBVAL( 1 ), NBMIN( 1 ), NXVAL( 1 ), - $ INMIN( 1 ), INWIN( 1 ), INIBL(1), ISHFTS(1), IACC22(1) - IF( NBVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NBMIN( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NXVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( INMIN( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' INMIN ', INMIN( 1 ), 1 - FATAL = .TRUE. - ELSE IF( INWIN( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' INWIN ', INWIN( 1 ), 1 - FATAL = .TRUE. - ELSE IF( INIBL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' INIBL ', INIBL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( ISHFTS( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' ISHFTS ', ISHFTS( 1 ), 1 - FATAL = .TRUE. - ELSE IF( IACC22( 1 ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' IACC22 ', IACC22( 1 ), 0 - FATAL = .TRUE. - END IF - CALL XLAENV( 1, NBVAL( 1 ) ) - CALL XLAENV( 2, NBMIN( 1 ) ) - CALL XLAENV( 3, NXVAL( 1 ) ) - CALL XLAENV(12, MAX( 11, INMIN( 1 ) ) ) - CALL XLAENV(13, INWIN( 1 ) ) - CALL XLAENV(14, INIBL( 1 ) ) - CALL XLAENV(15, ISHFTS( 1 ) ) - CALL XLAENV(16, IACC22( 1 ) ) - WRITE( NOUT, FMT = 9983 )'NB: ', NBVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'NBMIN:', NBMIN( 1 ) - WRITE( NOUT, FMT = 9983 )'NX: ', NXVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'INMIN: ', INMIN( 1 ) - WRITE( NOUT, FMT = 9983 )'INWIN: ', INWIN( 1 ) - WRITE( NOUT, FMT = 9983 )'INIBL: ', INIBL( 1 ) - WRITE( NOUT, FMT = 9983 )'ISHFTS: ', ISHFTS( 1 ) - WRITE( NOUT, FMT = 9983 )'IACC22: ', IACC22( 1 ) -* - ELSEIF( DGS .OR. DGX .OR. DGV .OR. DXV ) THEN -* -* For the nonsymmetric generalized driver routines, only one set -* of parameters is allowed. -* - READ( NIN, FMT = * )NBVAL( 1 ), NBMIN( 1 ), NXVAL( 1 ), - $ NSVAL( 1 ), MXBVAL( 1 ) - IF( NBVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NBMIN( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NXVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NSVAL( 1 ).LT.2 ) THEN - WRITE( NOUT, FMT = 9989 )' NS ', NSVAL( 1 ), 2 - FATAL = .TRUE. - ELSE IF( MXBVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' MAXB ', MXBVAL( 1 ), 1 - FATAL = .TRUE. - END IF - CALL XLAENV( 1, NBVAL( 1 ) ) - CALL XLAENV( 2, NBMIN( 1 ) ) - CALL XLAENV( 3, NXVAL( 1 ) ) - CALL XLAENV( 4, NSVAL( 1 ) ) - CALL XLAENV( 8, MXBVAL( 1 ) ) - WRITE( NOUT, FMT = 9983 )'NB: ', NBVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'NBMIN:', NBMIN( 1 ) - WRITE( NOUT, FMT = 9983 )'NX: ', NXVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'NS: ', NSVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'MAXB: ', MXBVAL( 1 ) -* - ELSE IF( .NOT.DSB .AND. .NOT.GLM .AND. .NOT.GQR .AND. .NOT. - $ GSV .AND. 
.NOT.CSD .AND. .NOT.LSE ) THEN -* -* For the other paths, the number of parameters can be varied -* from the input file. Read the number of parameter values. -* - READ( NIN, FMT = * )NPARMS - IF( NPARMS.LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )'NPARMS', NPARMS, 1 - NPARMS = 0 - FATAL = .TRUE. - ELSE IF( NPARMS.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9988 )'NPARMS', NPARMS, MAXIN - NPARMS = 0 - FATAL = .TRUE. - END IF -* -* Read the values of NB -* - IF( .NOT.DBB ) THEN - READ( NIN, FMT = * )( NBVAL( I ), I = 1, NPARMS ) - DO 70 I = 1, NPARMS - IF( NBVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NBVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' NB ', NBVAL( I ), NMAX - FATAL = .TRUE. - END IF - 70 CONTINUE - WRITE( NOUT, FMT = 9983 )'NB: ', - $ ( NBVAL( I ), I = 1, NPARMS ) - END IF -* -* Read the values of NBMIN -* - IF( NEP .OR. SEP .OR. SVD .OR. DGG ) THEN - READ( NIN, FMT = * )( NBMIN( I ), I = 1, NPARMS ) - DO 80 I = 1, NPARMS - IF( NBMIN( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( I ), 0 - FATAL = .TRUE. - ELSE IF( NBMIN( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )'NBMIN ', NBMIN( I ), NMAX - FATAL = .TRUE. - END IF - 80 CONTINUE - WRITE( NOUT, FMT = 9983 )'NBMIN:', - $ ( NBMIN( I ), I = 1, NPARMS ) - ELSE - DO 90 I = 1, NPARMS - NBMIN( I ) = 1 - 90 CONTINUE - END IF -* -* Read the values of NX -* - IF( NEP .OR. SEP .OR. SVD ) THEN - READ( NIN, FMT = * )( NXVAL( I ), I = 1, NPARMS ) - DO 100 I = 1, NPARMS - IF( NXVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NXVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' NX ', NXVAL( I ), NMAX - FATAL = .TRUE. - END IF - 100 CONTINUE - WRITE( NOUT, FMT = 9983 )'NX: ', - $ ( NXVAL( I ), I = 1, NPARMS ) - ELSE - DO 110 I = 1, NPARMS - NXVAL( I ) = 1 - 110 CONTINUE - END IF -* -* Read the values of NSHIFT (if DGG) or NRHS (if SVD -* or DBB). -* - IF( SVD .OR. DBB .OR. DGG ) THEN - READ( NIN, FMT = * )( NSVAL( I ), I = 1, NPARMS ) - DO 120 I = 1, NPARMS - IF( NSVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' NS ', NSVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NSVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' NS ', NSVAL( I ), NMAX - FATAL = .TRUE. - END IF - 120 CONTINUE - WRITE( NOUT, FMT = 9983 )'NS: ', - $ ( NSVAL( I ), I = 1, NPARMS ) - ELSE - DO 130 I = 1, NPARMS - NSVAL( I ) = 1 - 130 CONTINUE - END IF -* -* Read the values for MAXB. -* - IF( DGG ) THEN - READ( NIN, FMT = * )( MXBVAL( I ), I = 1, NPARMS ) - DO 140 I = 1, NPARMS - IF( MXBVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' MAXB ', MXBVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( MXBVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' MAXB ', MXBVAL( I ), NMAX - FATAL = .TRUE. - END IF - 140 CONTINUE - WRITE( NOUT, FMT = 9983 )'MAXB: ', - $ ( MXBVAL( I ), I = 1, NPARMS ) - ELSE - DO 150 I = 1, NPARMS - MXBVAL( I ) = 1 - 150 CONTINUE - END IF -* -* Read the values for INMIN. -* - IF( NEP ) THEN - READ( NIN, FMT = * )( INMIN( I ), I = 1, NPARMS ) - DO 540 I = 1, NPARMS - IF( INMIN( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' INMIN ', INMIN( I ), 0 - FATAL = .TRUE. - END IF - 540 CONTINUE - WRITE( NOUT, FMT = 9983 )'INMIN: ', - $ ( INMIN( I ), I = 1, NPARMS ) - ELSE - DO 550 I = 1, NPARMS - INMIN( I ) = 1 - 550 CONTINUE - END IF -* -* Read the values for INWIN. -* - IF( NEP ) THEN - READ( NIN, FMT = * )( INWIN( I ), I = 1, NPARMS ) - DO 560 I = 1, NPARMS - IF( INWIN( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' INWIN ', INWIN( I ), 0 - FATAL = .TRUE. 
- END IF - 560 CONTINUE - WRITE( NOUT, FMT = 9983 )'INWIN: ', - $ ( INWIN( I ), I = 1, NPARMS ) - ELSE - DO 570 I = 1, NPARMS - INWIN( I ) = 1 - 570 CONTINUE - END IF -* -* Read the values for INIBL. -* - IF( NEP ) THEN - READ( NIN, FMT = * )( INIBL( I ), I = 1, NPARMS ) - DO 580 I = 1, NPARMS - IF( INIBL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' INIBL ', INIBL( I ), 0 - FATAL = .TRUE. - END IF - 580 CONTINUE - WRITE( NOUT, FMT = 9983 )'INIBL: ', - $ ( INIBL( I ), I = 1, NPARMS ) - ELSE - DO 590 I = 1, NPARMS - INIBL( I ) = 1 - 590 CONTINUE - END IF -* -* Read the values for ISHFTS. -* - IF( NEP ) THEN - READ( NIN, FMT = * )( ISHFTS( I ), I = 1, NPARMS ) - DO 600 I = 1, NPARMS - IF( ISHFTS( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' ISHFTS ', ISHFTS( I ), 0 - FATAL = .TRUE. - END IF - 600 CONTINUE - WRITE( NOUT, FMT = 9983 )'ISHFTS: ', - $ ( ISHFTS( I ), I = 1, NPARMS ) - ELSE - DO 610 I = 1, NPARMS - ISHFTS( I ) = 1 - 610 CONTINUE - END IF -* -* Read the values for IACC22. -* - IF( NEP .OR. DGG ) THEN - READ( NIN, FMT = * )( IACC22( I ), I = 1, NPARMS ) - DO 620 I = 1, NPARMS - IF( IACC22( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' IACC22 ', IACC22( I ), 0 - FATAL = .TRUE. - END IF - 620 CONTINUE - WRITE( NOUT, FMT = 9983 )'IACC22: ', - $ ( IACC22( I ), I = 1, NPARMS ) - ELSE - DO 630 I = 1, NPARMS - IACC22( I ) = 1 - 630 CONTINUE - END IF -* -* Read the values for NBCOL. -* - IF( DGG ) THEN - READ( NIN, FMT = * )( NBCOL( I ), I = 1, NPARMS ) - DO 160 I = 1, NPARMS - IF( NBCOL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )'NBCOL ', NBCOL( I ), 0 - FATAL = .TRUE. - ELSE IF( NBCOL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )'NBCOL ', NBCOL( I ), NMAX - FATAL = .TRUE. - END IF - 160 CONTINUE - WRITE( NOUT, FMT = 9983 )'NBCOL:', - $ ( NBCOL( I ), I = 1, NPARMS ) - ELSE - DO 170 I = 1, NPARMS - NBCOL( I ) = 1 - 170 CONTINUE - END IF - END IF -* -* Calculate and print the machine dependent constants. -* - WRITE( NOUT, FMT = * ) - EPS = DLAMCH( 'Underflow threshold' ) - WRITE( NOUT, FMT = 9981 )'underflow', EPS - EPS = DLAMCH( 'Overflow threshold' ) - WRITE( NOUT, FMT = 9981 )'overflow ', EPS - EPS = DLAMCH( 'Epsilon' ) - WRITE( NOUT, FMT = 9981 )'precision', EPS -* -* Read the threshold value for the test ratios. -* - READ( NIN, FMT = * )THRESH - WRITE( NOUT, FMT = 9982 )THRESH - IF( SEP .OR. SVD .OR. DGG ) THEN -* -* Read the flag that indicates whether to test LAPACK routines. -* - READ( NIN, FMT = * )TSTCHK -* -* Read the flag that indicates whether to test driver routines. -* - READ( NIN, FMT = * )TSTDRV - END IF -* -* Read the flag that indicates whether to test the error exits. -* - READ( NIN, FMT = * )TSTERR -* -* Read the code describing how to set the random number seed. -* - READ( NIN, FMT = * )NEWSD -* -* If NEWSD = 2, read another line with 4 integers for the seed. -* - IF( NEWSD.EQ.2 ) - $ READ( NIN, FMT = * )( IOLDSD( I ), I = 1, 4 ) -* - DO 180 I = 1, 4 - ISEED( I ) = IOLDSD( I ) - 180 CONTINUE -* - IF( FATAL ) THEN - WRITE( NOUT, FMT = 9999 ) - STOP - END IF -* -* Read the input lines indicating the test path and its parameters. -* The first three characters indicate the test path, and the number -* of test matrix types must be the first nonblank item in columns -* 4-80. -* - 190 CONTINUE -* - IF( .NOT.( DGX .OR. 
DXV ) ) THEN -* - 200 CONTINUE - READ( NIN, FMT = '(A80)', END = 380 )LINE - C3 = LINE( 1: 3 ) - LENP = LEN( LINE ) - I = 3 - ITMP = 0 - I1 = 0 - 210 CONTINUE - I = I + 1 - IF( I.GT.LENP ) THEN - IF( I1.GT.0 ) THEN - GO TO 240 - ELSE - NTYPES = MAXT - GO TO 240 - END IF - END IF - IF( LINE( I: I ).NE.' ' .AND. LINE( I: I ).NE.',' ) THEN - I1 = I - C1 = LINE( I1: I1 ) -* -* Check that a valid integer was read -* - DO 220 K = 1, 10 - IF( C1.EQ.INTSTR( K: K ) ) THEN - IC = K - 1 - GO TO 230 - END IF - 220 CONTINUE - WRITE( NOUT, FMT = 9991 )I, LINE - GO TO 200 - 230 CONTINUE - ITMP = 10*ITMP + IC - GO TO 210 - ELSE IF( I1.GT.0 ) THEN - GO TO 240 - ELSE - GO TO 210 - END IF - 240 CONTINUE - NTYPES = ITMP -* -* Skip the tests if NTYPES is <= 0. -* - IF( .NOT.( DEV .OR. DES .OR. DVX .OR. DSX .OR. DGV .OR. - $ DGS ) .AND. NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - GO TO 200 - END IF -* - ELSE - IF( DXV ) - $ C3 = 'DXV' - IF( DGX ) - $ C3 = 'DGX' - END IF -* -* Reset the random number seed. -* - IF( NEWSD.EQ.0 ) THEN - DO 250 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 250 CONTINUE - END IF -* - IF( LSAMEN( 3, C3, 'DHS' ) .OR. LSAMEN( 3, C3, 'NEP' ) ) THEN -* -* ------------------------------------- -* NEP: Nonsymmetric Eigenvalue Problem -* ------------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NX = crossover point -* NS = number of shifts -* MAXB = minimum submatrix size -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL DERRHS( 'DHSEQR', NOUT ) - DO 270 I = 1, NPARMS - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 3, NXVAL( I ) ) - CALL XLAENV(12, MAX( 11, INMIN( I ) ) ) - CALL XLAENV(13, INWIN( I ) ) - CALL XLAENV(14, INIBL( I ) ) - CALL XLAENV(15, ISHFTS( I ) ) - CALL XLAENV(16, IACC22( I ) ) -* - IF( NEWSD.EQ.0 ) THEN - DO 260 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 260 CONTINUE - END IF - WRITE( NOUT, FMT = 9961 )C3, NBVAL( I ), NBMIN( I ), - $ NXVAL( I ), MAX( 11, INMIN(I)), - $ INWIN( I ), INIBL( I ), ISHFTS( I ), IACC22( I ) - CALL DCHKHS( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 5 ), NMAX, A( 1, 6 ), - $ A( 1, 7 ), D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), - $ D( 1, 4 ), D( 1, 5 ), D( 1, 6 ), A( 1, 8 ), - $ A( 1, 9 ), A( 1, 10 ), A( 1, 11 ), A( 1, 12 ), - $ D( 1, 7 ), WORK, LWORK, IWORK, LOGWRK, RESULT, - $ INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DCHKHS', INFO - 270 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'DST' ) .OR. LSAMEN( 3, C3, 'SEP' ) - $ .OR. 
LSAMEN( 3, C3, 'SE2' ) ) THEN -* -* ---------------------------------- -* SEP: Symmetric Eigenvalue Problem -* ---------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NX = crossover point -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 1, 1 ) - CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL DERRST( 'DST', NOUT ) - DO 290 I = 1, NPARMS - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 3, NXVAL( I ) ) -* - IF( NEWSD.EQ.0 ) THEN - DO 280 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 280 CONTINUE - END IF - WRITE( NOUT, FMT = 9997 )C3, NBVAL( I ), NBMIN( I ), - $ NXVAL( I ) - IF( TSTCHK ) THEN - IF( LSAMEN( 3, C3, 'SE2' ) ) THEN - CALL DCHKST2STG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), D( 1, 1 ), - $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), - $ D( 1, 6 ), D( 1, 7 ), D( 1, 8 ), D( 1, 9 ), - $ D( 1, 10 ), D( 1, 11 ), A( 1, 3 ), NMAX, - $ A( 1, 4 ), A( 1, 5 ), D( 1, 12 ), A( 1, 6 ), - $ WORK, LWORK, IWORK, LIWORK, RESULT, INFO ) - ELSE - CALL DCHKST( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), D( 1, 1 ), - $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), - $ D( 1, 6 ), D( 1, 7 ), D( 1, 8 ), D( 1, 9 ), - $ D( 1, 10 ), D( 1, 11 ), A( 1, 3 ), NMAX, - $ A( 1, 4 ), A( 1, 5 ), D( 1, 12 ), A( 1, 6 ), - $ WORK, LWORK, IWORK, LIWORK, RESULT, INFO ) - ENDIF - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DCHKST', INFO - END IF - IF( TSTDRV ) THEN - IF( LSAMEN( 3, C3, 'SE2' ) ) THEN - CALL DDRVST2STG( NN, NVAL, 18, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, D( 1, 3 ), D( 1, 4 ), - $ D( 1, 5 ), D( 1, 6 ), D( 1, 8 ), D( 1, 9 ), - $ D( 1, 10 ), D( 1, 11 ), A( 1, 2 ), NMAX, - $ A( 1, 3 ), D( 1, 12 ), A( 1, 4 ), WORK, - $ LWORK, IWORK, LIWORK, RESULT, INFO ) - ELSE - CALL DDRVST( NN, NVAL, 18, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, D( 1, 3 ), D( 1, 4 ), - $ D( 1, 5 ), D( 1, 6 ), D( 1, 8 ), D( 1, 9 ), - $ D( 1, 10 ), D( 1, 11 ), A( 1, 2 ), NMAX, - $ A( 1, 3 ), D( 1, 12 ), A( 1, 4 ), WORK, - $ LWORK, IWORK, LIWORK, RESULT, INFO ) - ENDIF - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DDRVST', INFO - END IF - 290 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'DSG' ) ) THEN -* -* ---------------------------------------------- -* DSG: Symmetric Generalized Eigenvalue Problem -* ---------------------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NX = crossover point -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 9, 25 ) - DO 310 I = 1, NPARMS - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 3, NXVAL( I ) ) -* - IF( NEWSD.EQ.0 ) THEN - DO 300 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 300 CONTINUE - END IF - WRITE( NOUT, FMT = 9997 )C3, NBVAL( I ), NBMIN( I ), - $ NXVAL( I ) - IF( TSTCHK ) THEN -* CALL DDRVSG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, -* $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, -* $ D( 1, 3 ), A( 1, 3 ), NMAX, A( 1, 4 ), -* $ A( 1, 5 ), A( 1, 6 ), A( 1, 7 ), WORK, -* $ LWORK, IWORK, LIWORK, RESULT, INFO ) - CALL DDRVSG2STG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, - $ D( 1, 3 ), D( 1, 3 ), A( 1, 3 ), NMAX, - $ A( 1, 4 ), A( 1, 5 ), A( 1, 6 ), - $ A( 1, 7 ), WORK, LWORK, IWORK, LIWORK, - $ RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DDRVSG', INFO - END IF - 310 CONTINUE 
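*     Each branch above follows the same pattern: ALAREQ reads the
*     requested matrix types, XLAENV plants one set of blocking
*     parameters, and the check or driver routine is called.  The
*     XLAENV/ILAENV pairing works because the TESTING build links an
*     ILAENV that reads back the values XLAENV stores in the common
*     block CLAENV (IPARMS above).  A minimal sketch of that round
*     trip, assuming the TESTING versions of both routines are linked:
*
*         INTEGER NB, ILAENV
*         EXTERNAL ILAENV, XLAENV
*         CALL XLAENV( 1, 16 )
*         NB = ILAENV( 1, 'DGETRF', ' ', 100, 100, -1, -1 )
*
*     NB now comes back as 16 rather than the library default.  This
*     is also why the loops reset ISEED from IOLDSD when NEWSD = 0:
*     every parameter combination is meant to see the same random
*     matrices.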
-* - ELSE IF( LSAMEN( 3, C3, 'DBD' ) .OR. LSAMEN( 3, C3, 'SVD' ) ) THEN -* -* ---------------------------------- -* SVD: Singular Value Decomposition -* ---------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NX = crossover point -* NRHS = number of right hand sides -* - MAXTYP = 16 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 1, 1 ) - CALL XLAENV( 9, 25 ) -* -* Test the error exits -* - IF( TSTERR .AND. TSTCHK ) - $ CALL DERRBD( 'DBD', NOUT ) - IF( TSTERR .AND. TSTDRV ) - $ CALL DERRED( 'DBD', NOUT ) -* - DO 330 I = 1, NPARMS - NRHS = NSVAL( I ) - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 3, NXVAL( I ) ) - IF( NEWSD.EQ.0 ) THEN - DO 320 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 320 CONTINUE - END IF - WRITE( NOUT, FMT = 9995 )C3, NBVAL( I ), NBMIN( I ), - $ NXVAL( I ), NRHS - IF( TSTCHK ) THEN - CALL DCHKBD( NN, MVAL, NVAL, MAXTYP, DOTYPE, NRHS, ISEED, - $ THRESH, A( 1, 1 ), NMAX, D( 1, 1 ), - $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), A( 1, 2 ), - $ NMAX, A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), NMAX, - $ A( 1, 6 ), NMAX, A( 1, 7 ), A( 1, 8 ), WORK, - $ LWORK, IWORK, NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DCHKBD', INFO - END IF - IF( TSTDRV ) - $ CALL DDRVBD( NN, MVAL, NVAL, MAXTYP, DOTYPE, ISEED, - $ THRESH, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, - $ A( 1, 3 ), NMAX, A( 1, 4 ), A( 1, 5 ), - $ A( 1, 6 ), D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), - $ WORK, LWORK, IWORK, NOUT, INFO ) - 330 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'DEV' ) ) THEN -* -* -------------------------------------------- -* DEV: Nonsymmetric Eigenvalue Problem Driver -* DGEEV (eigenvalues and eigenvectors) -* -------------------------------------------- -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL DERRED( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL DDRVEV( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), D( 1, 1 ), - $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), A( 1, 3 ), - $ NMAX, A( 1, 4 ), NMAX, A( 1, 5 ), NMAX, RESULT, - $ WORK, LWORK, IWORK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DGEEV', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'DES' ) ) THEN -* -* -------------------------------------------- -* DES: Nonsymmetric Eigenvalue Problem Driver -* DGEES (Schur form) -* -------------------------------------------- -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL DERRED( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL DDRVES( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), - $ A( 1, 4 ), NMAX, RESULT, WORK, LWORK, IWORK, - $ LOGWRK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DGEES', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'DVX' ) ) THEN -* -* -------------------------------------------------------------- -* DVX: Nonsymmetric Eigenvalue Problem Expert Driver -* DGEEVX (eigenvalues, eigenvectors and condition numbers) -* -------------------------------------------------------------- -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LT.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL DERRED( C3, NOUT ) - CALL ALAREQ( 
C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL DDRVVX( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NIN, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), D( 1, 1 ), - $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), A( 1, 3 ), - $ NMAX, A( 1, 4 ), NMAX, A( 1, 5 ), NMAX, - $ D( 1, 5 ), D( 1, 6 ), D( 1, 7 ), D( 1, 8 ), - $ D( 1, 9 ), D( 1, 10 ), D( 1, 11 ), D( 1, 12 ), - $ RESULT, WORK, LWORK, IWORK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DGEEVX', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'DSX' ) ) THEN -* -* --------------------------------------------------- -* DSX: Nonsymmetric Eigenvalue Problem Expert Driver -* DGEESX (Schur form and condition numbers) -* --------------------------------------------------- -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LT.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL DERRED( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL DDRVSX( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NIN, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), - $ D( 1, 5 ), D( 1, 6 ), A( 1, 4 ), NMAX, - $ A( 1, 5 ), RESULT, WORK, LWORK, IWORK, LOGWRK, - $ INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DGEESX', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'DGG' ) ) THEN -* -* ------------------------------------------------- -* DGG: Generalized Nonsymmetric Eigenvalue Problem -* ------------------------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NS = number of shifts -* MAXB = minimum submatrix size -* IACC22: structured matrix multiply -* NBCOL = minimum column dimension for blocks -* - MAXTYP = 26 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV(1,1) - IF( TSTCHK .AND. TSTERR ) - $ CALL DERRGG( C3, NOUT ) - DO 350 I = 1, NPARMS - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 4, NSVAL( I ) ) - CALL XLAENV( 8, MXBVAL( I ) ) - CALL XLAENV( 16, IACC22( I ) ) - CALL XLAENV( 5, NBCOL( I ) ) -* - IF( NEWSD.EQ.0 ) THEN - DO 340 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 340 CONTINUE - END IF - WRITE( NOUT, FMT = 9996 )C3, NBVAL( I ), NBMIN( I ), - $ NSVAL( I ), MXBVAL( I ), IACC22( I ), NBCOL( I ) - TSTDIF = .FALSE. 
- THRSHN = 10.D0 - IF( TSTCHK ) THEN - CALL DCHKGG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, - $ TSTDIF, THRSHN, NOUT, A( 1, 1 ), NMAX, - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ A( 1, 6 ), A( 1, 7 ), A( 1, 8 ), A( 1, 9 ), - $ NMAX, A( 1, 10 ), A( 1, 11 ), A( 1, 12 ), - $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), - $ D( 1, 5 ), D( 1, 6 ), A( 1, 13 ), - $ A( 1, 14 ), WORK, LWORK, LOGWRK, RESULT, - $ INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DCHKGG', INFO - END IF - 350 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'DGS' ) ) THEN -* -* ------------------------------------------------- -* DGS: Generalized Nonsymmetric Eigenvalue Problem -* DGGES (Schur form) -* ------------------------------------------------- -* - MAXTYP = 26 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL DERRGG( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL DDRGES( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), - $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), WORK, LWORK, - $ RESULT, LOGWRK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DDRGES', INFO -* -* Blocked version -* - CALL XLAENV(16, 2) - CALL DDRGES3( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), - $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), WORK, LWORK, - $ RESULT, LOGWRK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DDRGES3', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( DGX ) THEN -* -* ------------------------------------------------- -* DGX: Generalized Nonsymmetric Eigenvalue Problem -* DGGESX (Schur form and condition numbers) -* ------------------------------------------------- -* - MAXTYP = 5 - NTYPES = MAXTYP - IF( NN.LT.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL DERRGG( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 5, 2 ) - CALL DDRGSX( NN, NCMAX, THRESH, NIN, NOUT, A( 1, 1 ), NMAX, - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ A( 1, 6 ), D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), - $ C( 1, 1 ), NCMAX*NCMAX, A( 1, 12 ), WORK, - $ LWORK, IWORK, LIWORK, LOGWRK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DDRGSX', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'DGV' ) ) THEN -* -* ------------------------------------------------- -* DGV: Generalized Nonsymmetric Eigenvalue Problem -* DGGEV (Eigenvalue/vector form) -* ------------------------------------------------- -* - MAXTYP = 26 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL DERRGG( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL DDRGEV( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), - $ A( 1, 9 ), NMAX, D( 1, 1 ), D( 1, 2 ), - $ D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), D( 1, 6 ), - $ WORK, LWORK, RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DDRGEV', INFO -* -* Blocked version -* - CALL DDRGEV3( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), - $ A( 1, 9 ), NMAX, D( 1, 1 ), D( 1, 2 ), - $ D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), D( 1, 6 ), - $ WORK, LWORK, RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DDRGEV3', INFO - 
END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( DXV ) THEN -* -* ------------------------------------------------- -* DXV: Generalized Nonsymmetric Eigenvalue Problem -* DGGEVX (eigenvalue/vector with condition numbers) -* ------------------------------------------------- -* - MAXTYP = 2 - NTYPES = MAXTYP - IF( NN.LT.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL DERRGG( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL DDRGVX( NN, THRESH, NIN, NOUT, A( 1, 1 ), NMAX, - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), D( 1, 1 ), - $ D( 1, 2 ), D( 1, 3 ), A( 1, 5 ), A( 1, 6 ), - $ IWORK( 1 ), IWORK( 2 ), D( 1, 4 ), D( 1, 5 ), - $ D( 1, 6 ), D( 1, 7 ), D( 1, 8 ), D( 1, 9 ), - $ WORK, LWORK, IWORK( 3 ), LIWORK-2, RESULT, - $ LOGWRK, INFO ) -* - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DDRGVX', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'DSB' ) ) THEN -* -* ------------------------------ -* DSB: Symmetric Band Reduction -* ------------------------------ -* - MAXTYP = 15 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - IF( TSTERR ) - $ CALL DERRST( 'DSB', NOUT ) -* CALL DCHKSB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, -* $ NOUT, A( 1, 1 ), NMAX, D( 1, 1 ), D( 1, 2 ), -* $ A( 1, 2 ), NMAX, WORK, LWORK, RESULT, INFO ) - CALL DCHKSB2STG( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, - $ THRESH, NOUT, A( 1, 1 ), NMAX, D( 1, 1 ), - $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), - $ A( 1, 2 ), NMAX, WORK, LWORK, RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DCHKSB', INFO -* - ELSE IF( LSAMEN( 3, C3, 'DBB' ) ) THEN -* -* ------------------------------ -* DBB: General Band Reduction -* ------------------------------ -* - MAXTYP = 15 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - DO 370 I = 1, NPARMS - NRHS = NSVAL( I ) -* - IF( NEWSD.EQ.0 ) THEN - DO 360 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 360 CONTINUE - END IF - WRITE( NOUT, FMT = 9966 )C3, NRHS - CALL DCHKBB( NN, MVAL, NVAL, NK, KVAL, MAXTYP, DOTYPE, NRHS, - $ ISEED, THRESH, NOUT, A( 1, 1 ), NMAX, - $ A( 1, 2 ), 2*NMAX, D( 1, 1 ), D( 1, 2 ), - $ A( 1, 4 ), NMAX, A( 1, 5 ), NMAX, A( 1, 6 ), - $ NMAX, A( 1, 7 ), WORK, LWORK, RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DCHKBB', INFO - 370 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'GLM' ) ) THEN -* -* ----------------------------------------- -* GLM: Generalized Linear Regression Model -* ----------------------------------------- -* - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL DERRGG( 'GLM', NOUT ) - CALL DCKGLM( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), X, - $ WORK, D( 1, 1 ), NIN, NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DCKGLM', INFO -* - ELSE IF( LSAMEN( 3, C3, 'GQR' ) ) THEN -* -* ------------------------------------------ -* GQR: Generalized QR and RQ factorizations -* ------------------------------------------ -* - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL DERRGG( 'GQR', NOUT ) - CALL DCKGQR( NN, MVAL, NN, PVAL, NN, NVAL, NTYPES, ISEED, - $ THRESH, NMAX, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), TAUA, B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ B( 1, 4 ), B( 1, 5 ), TAUB, WORK, D( 1, 1 ), NIN, - $ NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DCKGQR', INFO -* - ELSE IF( LSAMEN( 3, C3, 'GSV' ) ) THEN -* -* ---------------------------------------------- -* GSV: Generalized Singular Value 
Decomposition -* ---------------------------------------------- -* - CALL XLAENV(1,1) - IF( TSTERR ) - $ CALL DERRGG( 'GSV', NOUT ) - CALL DCKGSV( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ A( 1, 3 ), B( 1, 3 ), A( 1, 4 ), TAUA, TAUB, - $ B( 1, 4 ), IWORK, WORK, D( 1, 1 ), NIN, NOUT, - $ INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DCKGSV', INFO -* - ELSE IF( LSAMEN( 3, C3, 'CSD' ) ) THEN -* -* ---------------------------------------------- -* CSD: CS Decomposition -* ---------------------------------------------- -* - CALL XLAENV(1,1) - IF( TSTERR ) - $ CALL DERRGG( 'CSD', NOUT ) - CALL DCKCSD( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), - $ A( 1, 5 ), A( 1, 6 ), A( 1, 7 ), IWORK, WORK, - $ D( 1, 1 ), NIN, NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DCKCSD', INFO -* - ELSE IF( LSAMEN( 3, C3, 'LSE' ) ) THEN -* -* -------------------------------------- -* LSE: Constrained Linear Least Squares -* -------------------------------------- -* - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL DERRGG( 'LSE', NOUT ) - CALL DCKLSE( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), X, - $ WORK, D( 1, 1 ), NIN, NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'DCKLSE', INFO -* - ELSE - WRITE( NOUT, FMT = * ) - WRITE( NOUT, FMT = * ) - WRITE( NOUT, FMT = 9992 )C3 - END IF - IF( .NOT.( DGX .OR. DXV ) ) - $ GO TO 190 - 380 CONTINUE - WRITE( NOUT, FMT = 9994 ) - S2 = DSECND( ) - WRITE( NOUT, FMT = 9993 )S2 - S1 -* - 9999 FORMAT( / ' Execution not attempted due to input errors' ) - 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) - 9996 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NS =', I4, - $ ', MAXB =', I4, ', IACC22 =', I4, ', NBCOL =', I4 ) - 9995 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4, - $ ', NRHS =', I4 ) - 9994 FORMAT( / / ' End of tests' ) - 9993 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) - 9992 FORMAT( 1X, A3, ': Unrecognized path name' ) - 9991 FORMAT( / / ' *** Invalid integer value in column ', I2, - $ ' of input', ' line:', / A79 ) - 9990 FORMAT( / / 1X, A3, ' routines were not tested' ) - 9989 FORMAT( ' Invalid input value: ', A, '=', I6, '; must be >=', - $ I6 ) - 9988 FORMAT( ' Invalid input value: ', A, '=', I6, '; must be <=', - $ I6 ) - 9987 FORMAT( ' Tests of the Nonsymmetric Eigenvalue Problem routines' ) - 9986 FORMAT( ' Tests of the Symmetric Eigenvalue Problem routines' ) - 9985 FORMAT( ' Tests of the Singular Value Decomposition routines' ) - 9984 FORMAT( / ' The following parameter values will be used:' ) - 9983 FORMAT( 4X, A, 10I6, / 10X, 10I6 ) - 9982 FORMAT( / ' Routines pass computational tests if test ratio is ', - $ 'less than', F8.2, / ) - 9981 FORMAT( ' Relative machine ', A, ' is taken to be', D16.6 ) - 9980 FORMAT( ' *** Error code from ', A, ' = ', I4 ) - 9979 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Driver', - $ / ' DGEEV (eigenvalues and eigenvectors)' ) - 9978 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Driver', - $ / ' DGEES (Schur form)' ) - 9977 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Expert', - $ ' Driver', / ' DGEEVX (eigenvalues, eigenvectors and', - $ ' condition numbers)' ) - 9976 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Expert', - $ ' Driver', / ' DGEESX (Schur form and condition', - $ ' numbers)' ) - 9975 FORMAT( / ' Tests of the
Generalized Nonsymmetric Eigenvalue ', - $ 'Problem routines' ) - 9974 FORMAT( ' Tests of DSBTRD', / ' (reduction of a symmetric band ', - $ 'matrix to tridiagonal form)' ) - 9973 FORMAT( / 1X, 71( '-' ) ) - 9972 FORMAT( / ' LAPACK VERSION ', I1, '.', I1, '.', I1 ) - 9971 FORMAT( / ' Tests of the Generalized Linear Regression Model ', - $ 'routines' ) - 9970 FORMAT( / ' Tests of the Generalized QR and RQ routines' ) - 9969 FORMAT( / ' Tests of the Generalized Singular Value', - $ ' Decomposition routines' ) - 9968 FORMAT( / ' Tests of the Linear Least Squares routines' ) - 9967 FORMAT( ' Tests of DGBBRD', / ' (reduction of a general band ', - $ 'matrix to real bidiagonal form)' ) - 9966 FORMAT( / / 1X, A3, ': NRHS =', I4 ) - 9965 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', - $ 'Problem Expert Driver DGGESX' ) - 9964 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', - $ 'Problem Driver DGGES' ) - 9963 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', - $ 'Problem Driver DGGEV' ) - 9962 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', - $ 'Problem Expert Driver DGGEVX' ) - 9961 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4, - $ ', INMIN=', I4, - $ ', INWIN =', I4, ', INIBL =', I4, ', ISHFTS =', I4, - $ ', IACC22 =', I4) - 9960 FORMAT( / ' Tests of the CS Decomposition routines' ) -* -* End of DCHKEE -* - END From 0e96c378fde1e9587dcfec35af221ee8cc3c90cb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Feb 2021 18:46:52 +0100 Subject: [PATCH 124/681] Delete schkee.f --- lapack-netlib/TESTING/EIG/schkee.f | 2510 ---------------------------- 1 file changed, 2510 deletions(-) delete mode 100644 lapack-netlib/TESTING/EIG/schkee.f diff --git a/lapack-netlib/TESTING/EIG/schkee.f b/lapack-netlib/TESTING/EIG/schkee.f deleted file mode 100644 index 3757e0655..000000000 --- a/lapack-netlib/TESTING/EIG/schkee.f +++ /dev/null @@ -1,2510 +0,0 @@ -*> \brief \b SCHKEE -* -* =========== DOCUMENTATION =========== -* -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -* Definition: -* =========== -* -* PROGRAM SCHKEE -* -* -*> \par Purpose: -* ============= -*> -*> \verbatim -*> -*> SCHKEE tests the REAL LAPACK subroutines for the matrix -*> eigenvalue problem. 
The test paths in this version are -*> -*> NEP (Nonsymmetric Eigenvalue Problem): -*> Test SGEHRD, SORGHR, SHSEQR, STREVC, SHSEIN, and SORMHR -*> -*> SEP (Symmetric Eigenvalue Problem): -*> Test SSYTRD, SORGTR, SSTEQR, SSTERF, SSTEIN, SSTEDC, -*> and drivers SSYEV(X), SSBEV(X), SSPEV(X), SSTEV(X), -*> SSYEVD, SSBEVD, SSPEVD, SSTEVD -*> -*> SVD (Singular Value Decomposition): -*> Test SGEBRD, SORGBR, SBDSQR, SBDSDC -*> and the drivers SGESVD, SGESDD -*> -*> SEV (Nonsymmetric Eigenvalue/eigenvector Driver): -*> Test SGEEV -*> -*> SES (Nonsymmetric Schur form Driver): -*> Test SGEES -*> -*> SVX (Nonsymmetric Eigenvalue/eigenvector Expert Driver): -*> Test SGEEVX -*> -*> SSX (Nonsymmetric Schur form Expert Driver): -*> Test SGEESX -*> -*> SGG (Generalized Nonsymmetric Eigenvalue Problem): -*> Test SGGHD3, SGGBAL, SGGBAK, SHGEQZ, and STGEVC -*> -*> SGS (Generalized Nonsymmetric Schur form Driver): -*> Test SGGES -*> -*> SGV (Generalized Nonsymmetric Eigenvalue/eigenvector Driver): -*> Test SGGEV -*> -*> SGX (Generalized Nonsymmetric Schur form Expert Driver): -*> Test SGGESX -*> -*> SXV (Generalized Nonsymmetric Eigenvalue/eigenvector Expert Driver): -*> Test SGGEVX -*> -*> SSG (Symmetric Generalized Eigenvalue Problem): -*> Test SSYGST, SSYGV, SSYGVD, SSYGVX, SSPGST, SSPGV, SSPGVD, -*> SSPGVX, SSBGST, SSBGV, SSBGVD, and SSBGVX -*> -*> SSB (Symmetric Band Eigenvalue Problem): -*> Test SSBTRD -*> -*> SBB (Band Singular Value Decomposition): -*> Test SGBBRD -*> -*> SEC (Eigencondition estimation): -*> Test SLALN2, SLASY2, SLANV2, SLAEXC, STRSYL, STREXC, STRSNA, -*> STRSEN, and SLAQTR -*> -*> SBL (Balancing a general matrix) -*> Test SGEBAL -*> -*> SBK (Back transformation on a balanced matrix) -*> Test SGEBAK -*> -*> SGL (Balancing a matrix pair) -*> Test SGGBAL -*> -*> SGK (Back transformation on a matrix pair) -*> Test SGGBAK -*> -*> GLM (Generalized Linear Regression Model): -*> Tests SGGGLM -*> -*> GQR (Generalized QR and RQ factorizations): -*> Tests SGGQRF and SGGRQF -*> -*> GSV (Generalized Singular Value Decomposition): -*> Tests SGGSVD, SGGSVP, STGSJA, SLAGS2, SLAPLL, and SLAPMT -*> -*> CSD (CS decomposition): -*> Tests SORCSD -*> -*> LSE (Constrained Linear Least Squares): -*> Tests SGGLSE -*> -*> Each test path has a different set of inputs, but the data sets for -*> the driver routines xEV, xES, xVX, and xSX can be concatenated in a -*> single input file. The first line of input should contain one of the -*> 3-character path names in columns 1-3. The number of remaining lines -*> depends on what is found on the first line. -*> -*> The number of matrix types used in testing is often controllable from -*> the input file. The number of matrix types for each path, and the -*> test routine that describes them, is as follows: -*> -*> Path name(s) Types Test routine -*> -*> SHS or NEP 21 SCHKHS -*> SST or SEP 21 SCHKST (routines) -*> 18 SDRVST (drivers) -*> SBD or SVD 16 SCHKBD (routines) -*> 5 SDRVBD (drivers) -*> SEV 21 SDRVEV -*> SES 21 SDRVES -*> SVX 21 SDRVVX -*> SSX 21 SDRVSX -*> SGG 26 SCHKGG (routines) -*> SGS 26 SDRGES -*> SGX 5 SDRGSX -*> SGV 26 SDRGEV -*> SXV 2 SDRGVX -*> SSG 21 SDRVSG -*> SSB 15 SCHKSB -*> SBB 15 SCHKBB -*> SEC - SCHKEC -*> SBL - SCHKBL -*> SBK - SCHKBK -*> SGL - SCHKGL -*> SGK - SCHKGK -*> GLM 8 SCKGLM -*> GQR 8 SCKGQR -*> GSV 8 SCKGSV -*> CSD 3 SCKCSD -*> LSE 8 SCKLSE -*> -*>----------------------------------------------------------------------- -*> -*> NEP input file: -*> -*> line 2: NN, INTEGER -*> Number of values of N.
-*> -*> line 3: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix dimension N. -*> -*> line 4: NPARMS, INTEGER -*> Number of values of the parameters NB, NBMIN, NX, NS, and -*> MAXB. -*> -*> line 5: NBVAL, INTEGER array, dimension (NPARMS) -*> The values for the blocksize NB. -*> -*> line 6: NBMIN, INTEGER array, dimension (NPARMS) -*> The values for the minimum blocksize NBMIN. -*> -*> line 7: NXVAL, INTEGER array, dimension (NPARMS) -*> The values for the crossover point NX. -*> -*> line 8: INMIN, INTEGER array, dimension (NPARMS) -*> LAHQR vs TTQRE crossover point, >= 11 -*> -*> line 9: INWIN, INTEGER array, dimension (NPARMS) -*> recommended deflation window size -*> -*> line 10: INIBL, INTEGER array, dimension (NPARMS) -*> nibble crossover point -*> -*> line 11: ISHFTS, INTEGER array, dimension (NPARMS) -*> number of simultaneous shifts -*> -*> line 12: IACC22, INTEGER array, dimension (NPARMS) -*> select structured matrix multiply: 0, 1 or 2 -*> -*> line 13: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. To have all of the test -*> ratios printed, use THRESH = 0.0 . -*> -*> line 14: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 14 was 2: -*> -*> line 15: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 15-EOF: The remaining lines occur in sets of 1 or 2 and allow -*> the user to specify the matrix types. Each line contains -*> a 3-character path name in columns 1-3, and the number -*> of matrix types must be the first nonblank item in columns -*> 4-80. If the number of matrix types is at least 1 but is -*> less than the maximum number of possible types, a second -*> line will be read to get the numbers of the matrix types to -*> be used. For example, -*> NEP 21 -*> requests all of the matrix types for the nonsymmetric -*> eigenvalue problem, while -*> NEP 4 -*> 9 10 11 12 -*> requests only matrices of type 9, 10, 11, and 12. -*> -*> The valid 3-character path names are 'NEP' or 'SHS' for the -*> nonsymmetric eigenvalue routines. -*> -*>----------------------------------------------------------------------- -*> -*> SEP or SSG input file: -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix dimension N. -*> -*> line 4: NPARMS, INTEGER -*> Number of values of the parameters NB, NBMIN, and NX. -*> -*> line 5: NBVAL, INTEGER array, dimension (NPARMS) -*> The values for the blocksize NB. -*> -*> line 6: NBMIN, INTEGER array, dimension (NPARMS) -*> The values for the minimum blocksize NBMIN. -*> -*> line 7: NXVAL, INTEGER array, dimension (NPARMS) -*> The values for the crossover point NX. -*> -*> line 8: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 9: TSTCHK, LOGICAL -*> Flag indicating whether or not to test the LAPACK routines. -*> -*> line 10: TSTDRV, LOGICAL -*> Flag indicating whether or not to test the driver routines.
-*> -*> line 11: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 12: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 12 was 2: -*> -*> line 13: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 13-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path names are 'SEP' or 'SST' for the -*> symmetric eigenvalue routines and driver routines, and -*> 'SSG' for the routines for the symmetric generalized -*> eigenvalue problem. -*> -*>----------------------------------------------------------------------- -*> -*> SVD input file: -*> -*> line 2: NN, INTEGER -*> Number of values of M and N. -*> -*> line 3: MVAL, INTEGER array, dimension (NN) -*> The values for the matrix row dimension M. -*> -*> line 4: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix column dimension N. -*> -*> line 5: NPARMS, INTEGER -*> Number of values of the parameter NB, NBMIN, NX, and NRHS. -*> -*> line 6: NBVAL, INTEGER array, dimension (NPARMS) -*> The values for the blocksize NB. -*> -*> line 7: NBMIN, INTEGER array, dimension (NPARMS) -*> The values for the minimum blocksize NBMIN. -*> -*> line 8: NXVAL, INTEGER array, dimension (NPARMS) -*> The values for the crossover point NX. -*> -*> line 9: NSVAL, INTEGER array, dimension (NPARMS) -*> The values for the number of right hand sides NRHS. -*> -*> line 10: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 11: TSTCHK, LOGICAL -*> Flag indicating whether or not to test the LAPACK routines. -*> -*> line 12: TSTDRV, LOGICAL -*> Flag indicating whether or not to test the driver routines. -*> -*> line 13: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 14: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 14 was 2: -*> -*> line 15: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 15-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path names are 'SVD' or 'SBD' for both the -*> SVD routines and the SVD driver routines. -*> -*>----------------------------------------------------------------------- -*> -*> SEV and SES data files: -*> -*> line 1: 'SEV' or 'SES' in columns 1 to 3. -*> -*> line 2: NSIZES, INTEGER -*> Number of sizes of matrices to use. Should be at least 0 -*> and at most 20. If NSIZES = 0, no testing is done -*> (although the remaining 3 lines are still read). -*> -*> line 3: NN, INTEGER array, dimension(NSIZES) -*> Dimensions of matrices to be tested. 
-*> -*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> These integer parameters determine how blocking is done -*> (see ILAENV for details) -*> NB : block size -*> NBMIN : minimum block size -*> NX : minimum dimension for blocking -*> NS : number of shifts in xHSEQR -*> NBCOL : minimum column dimension for blocking -*> -*> line 5: THRESH, REAL -*> The test threshold against which computed residuals are -*> compared. Should generally be in the range from 10. to 20. -*> If it is 0., all test case data will be printed. -*> -*> line 6: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits. -*> -*> line 7: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 7 was 2: -*> -*> line 8: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9 and following: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'SEV' to test SGEEV, or -*> 'SES' to test SGEES. -*> -*>----------------------------------------------------------------------- -*> -*> The SVX data has two parts. The first part is identical to SEV, -*> and the second part consists of test matrices with precomputed -*> solutions. -*> -*> line 1: 'SVX' in columns 1-3. -*> -*> line 2: NSIZES, INTEGER -*> If NSIZES = 0, no testing of randomly generated examples -*> is done, but any precomputed examples are tested. -*> -*> line 3: NN, INTEGER array, dimension(NSIZES) -*> -*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> -*> line 5: THRESH, REAL -*> -*> line 6: TSTERR, LOGICAL -*> -*> line 7: NEWSD, INTEGER -*> -*> If line 7 was 2: -*> -*> line 8: INTEGER array, dimension (4) -*> -*> lines 9 and following: The first line contains 'SVX' in columns 1-3 -*> followed by the number of matrix types, possibly with -*> a second line to specify certain matrix types. -*> If the number of matrix types = 0, no testing of randomly -*> generated examples is done, but any precomputed examples -*> are tested. -*> -*> remaining lines : Each matrix is stored on 1+2*N lines, where N is -*> its dimension. The first line contains the dimension (a -*> single integer). The next N lines contain the matrix, one -*> row per line. The last N lines correspond to each -*> eigenvalue. Each of these last N lines contains 4 real -*> values: the real part of the eigenvalue, the imaginary -*> part of the eigenvalue, the reciprocal condition number of -*> the eigenvalues, and the reciprocal condition number of the -*> eigenvector. The end of data is indicated by dimension N=0. -*> Even if no data is to be tested, there must be at least one -*> line containing N=0. -*> -*>----------------------------------------------------------------------- -*> -*> The SSX data is like SVX. The first part is identical to SEV, and the -*> second part consists of test matrices with precomputed solutions. -*> -*> line 1: 'SSX' in columns 1-3. -*> -*> line 2: NSIZES, INTEGER -*> If NSIZES = 0, no testing of randomly generated examples -*> is done, but any precomputed examples are tested. 
-*> -*> line 3: NN, INTEGER array, dimension(NSIZES) -*> -*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> -*> line 5: THRESH, REAL -*> -*> line 6: TSTERR, LOGICAL -*> -*> line 7: NEWSD, INTEGER -*> -*> If line 7 was 2: -*> -*> line 8: INTEGER array, dimension (4) -*> -*> lines 9 and following: The first line contains 'SSX' in columns 1-3 -*> followed by the number of matrix types, possibly with -*> a second line to specify certain matrix types. -*> If the number of matrix types = 0, no testing of randomly -*> generated examples is done, but any precomputed examples -*> are tested. -*> -*> remaining lines : Each matrix is stored on 3+N lines, where N is its -*> dimension. The first line contains the dimension N and the -*> dimension M of an invariant subspace. The second line -*> contains M integers, identifying the eigenvalues in the -*> invariant subspace (by their position in a list of -*> eigenvalues ordered by increasing real part). The next N -*> lines contain the matrix. The last line contains the -*> reciprocal condition number for the average of the selected -*> eigenvalues, and the reciprocal condition number for the -*> corresponding right invariant subspace. The end of data is -*> indicated by a line containing N=0 and M=0. Even if no data -*> is to be tested, there must be at least one line containing -*> N=0 and M=0. -*> -*>----------------------------------------------------------------------- -*> -*> SGG input file: -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix dimension N. -*> -*> line 4: NPARMS, INTEGER -*> Number of values of the parameters NB, NBMIN, NS, MAXB, and -*> NBCOL. -*> -*> line 5: NBVAL, INTEGER array, dimension (NPARMS) -*> The values for the blocksize NB. -*> -*> line 6: NBMIN, INTEGER array, dimension (NPARMS) -*> The values for NBMIN, the minimum row dimension for blocks. -*> -*> line 7: NSVAL, INTEGER array, dimension (NPARMS) -*> The values for the number of shifts. -*> -*> line 8: MXBVAL, INTEGER array, dimension (NPARMS) -*> The values for MAXB, used in determining minimum blocksize. -*> -*> line 9: IACC22, INTEGER array, dimension (NPARMS) -*> select structured matrix multiply: 1 or 2) -*> -*> line 10: NBCOL, INTEGER array, dimension (NPARMS) -*> The values for NBCOL, the minimum column dimension for -*> blocks. -*> -*> line 11: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 12: TSTCHK, LOGICAL -*> Flag indicating whether or not to test the LAPACK routines. -*> -*> line 13: TSTDRV, LOGICAL -*> Flag indicating whether or not to test the driver routines. -*> -*> line 14: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 15: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 15 was 2: -*> -*> line 16: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 17-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'SGG' for the generalized -*> eigenvalue problem routines and driver routines. 
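To make the SGG field list above concrete, here is a sketch of a complete deck in the style of LAPACK's .in files. Every value is an illustrative placeholder (this is not a shipped data file), and the trailing annotations are harmless because each list-directed READ stops after consuming its items. With NEWSD = 0 no seed line is present, so the matrix-type request follows the parameter lines directly:

SGG:  illustrative input deck (hypothetical values)
2                      NN, number of values of N
16 32                  NVAL, the matrix dimensions
1                      NPARMS, number of parameter sets
1                      NBVAL, the blocksize NB
2                      NBMIN, minimum row dimension for blocks
2                      NSVAL, number of shifts
1                      MXBVAL, the MAXB values
1                      IACC22, structured matrix multiply
1                      NBCOL, minimum column dimension for blocks
20.0                   THRESH
T                      TSTCHK, test the LAPACK routines
T                      TSTDRV, test the driver routines
T                      TSTERR, test the error exits
0                      NEWSD, reset the seed before each run
SGG 26                 request all 26 matrix types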
-*> -*>----------------------------------------------------------------------- -*> -*> SGS and SGV input files: -*> -*> line 1: 'SGS' or 'SGV' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension(NN) -*> Dimensions of matrices to be tested. -*> -*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> These integer parameters determine how blocking is done -*> (see ILAENV for details) -*> NB : block size -*> NBMIN : minimum block size -*> NX : minimum dimension for blocking -*> NS : number of shifts in xHGEQR -*> NBCOL : minimum column dimension for blocking -*> -*> line 5: THRESH, REAL -*> The test threshold against which computed residuals are -*> compared. Should generally be in the range from 10. to 20. -*> If it is 0., all test case data will be printed. -*> -*> line 6: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits. -*> -*> line 7: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 17 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 7-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'SGS' for the generalized -*> eigenvalue problem routines and driver routines. -*> -*>----------------------------------------------------------------------- -*> -*> SXV input files: -*> -*> line 1: 'SXV' in columns 1 to 3. -*> -*> line 2: N, INTEGER -*> Value of N. -*> -*> line 3: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> These integer parameters determine how blocking is done -*> (see ILAENV for details) -*> NB : block size -*> NBMIN : minimum block size -*> NX : minimum dimension for blocking -*> NS : number of shifts in xHGEQR -*> NBCOL : minimum column dimension for blocking -*> -*> line 4: THRESH, REAL -*> The test threshold against which computed residuals are -*> compared. Should generally be in the range from 10. to 20. -*> Information will be printed about each test for which the -*> test ratio is greater than or equal to the threshold. -*> -*> line 5: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 6: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 6 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> If line 2 was 0: -*> -*> line 7-EOF: Precomputed examples are tested. -*> -*> remaining lines : Each example is stored on 3+2*N lines, where N is -*> its dimension. The first line contains the dimension (a -*> single integer). The next N lines contain the matrix A, one -*> row per line. The next N lines contain the matrix B. The -*> next line contains the reciprocals of the eigenvalue -*> condition numbers. The last line contains the reciprocals of -*> the eigenvector condition numbers. The end of data is -*> indicated by dimension N=0. Even if no data is to be tested, -*> there must be at least one line containing N=0. 
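Restating the SXV storage rule just described with a concrete, made-up example: for N = 2 each precomputed entry occupies 3 + 2*N = 7 lines, and the data section must end with the N = 0 terminator even when no examples are supplied:

2                      N, the dimension
1.0  2.0               A, row 1
0.0  3.0               A, row 2
1.0  0.0               B, row 1
0.0  1.0               B, row 2
0.5  0.5               reciprocals of the eigenvalue condition numbers
0.5  0.5               reciprocals of the eigenvector condition numbers
0                      N = 0 marks the end of data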
-*> -*>----------------------------------------------------------------------- -*> -*> SGX input files: -*> -*> line 1: 'SGX' in columns 1 to 3. -*> -*> line 2: N, INTEGER -*> Value of N. -*> -*> line 3: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> These integer parameters determine how blocking is done -*> (see ILAENV for details) -*> NB : block size -*> NBMIN : minimum block size -*> NX : minimum dimension for blocking -*> NS : number of shifts in xHGEQR -*> NBCOL : minimum column dimension for blocking -*> -*> line 4: THRESH, REAL -*> The test threshold against which computed residuals are -*> compared. Should generally be in the range from 10. to 20. -*> Information will be printed about each test for which the -*> test ratio is greater than or equal to the threshold. -*> -*> line 5: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 6: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 6 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> If line 2 was 0: -*> -*> line 7-EOF: Precomputed examples are tested. -*> -*> remaining lines : Each example is stored on 3+2*N lines, where N is -*> its dimension. The first line contains the dimension (a -*> single integer). The next line contains an integer k such -*> that only the last k eigenvalues will be selected and appear -*> in the leading diagonal blocks of $A$ and $B$. The next N -*> lines contain the matrix A, one row per line. The next N -*> lines contain the matrix B. The last line contains the -*> reciprocal of the eigenvalue cluster condition number and the -*> reciprocal of the deflating subspace (associated with the -*> selected eigencluster) condition number. The end of data is -*> indicated by dimension N=0. Even if no data is to be tested, -*> there must be at least one line containing N=0. -*> -*>----------------------------------------------------------------------- -*> -*> SSB input file: -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix dimension N. -*> -*> line 4: NK, INTEGER -*> Number of values of K. -*> -*> line 5: KVAL, INTEGER array, dimension (NK) -*> The values for the matrix dimension K. -*> -*> line 6: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 7 was 2: -*> -*> line 8: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 8-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'SSB'. -*> -*>----------------------------------------------------------------------- -*> -*> SBB input file: -*> -*> line 2: NN, INTEGER -*> Number of values of M and N. -*> -*> line 3: MVAL, INTEGER array, dimension (NN) -*> The values for the matrix row dimension M. 
-*> -*> line 4: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix column dimension N. -*> -*> line 4: NK, INTEGER -*> Number of values of K. -*> -*> line 5: KVAL, INTEGER array, dimension (NK) -*> The values for the matrix bandwidth K. -*> -*> line 6: NPARMS, INTEGER -*> Number of values of the parameter NRHS -*> -*> line 7: NSVAL, INTEGER array, dimension (NPARMS) -*> The values for the number of right hand sides NRHS. -*> -*> line 8: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 9: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 9 was 2: -*> -*> line 10: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 10-EOF: Lines specifying matrix types, as for SVD. -*> The 3-character path name is 'SBB'. -*> -*>----------------------------------------------------------------------- -*> -*> SEC input file: -*> -*> line 2: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> lines 3-EOF: -*> -*> Input for testing the eigencondition routines consists of a set of -*> specially constructed test cases and their solutions. The data -*> format is not intended to be modified by the user. -*> -*>----------------------------------------------------------------------- -*> -*> SBL and SBK input files: -*> -*> line 1: 'SBL' in columns 1-3 to test SGEBAL, or 'SBK' in -*> columns 1-3 to test SGEBAK. -*> -*> The remaining lines consist of specially constructed test cases. -*> -*>----------------------------------------------------------------------- -*> -*> SGL and SGK input files: -*> -*> line 1: 'SGL' in columns 1-3 to test SGGBAL, or 'SGK' in -*> columns 1-3 to test SGGBAK. -*> -*> The remaining lines consist of specially constructed test cases. -*> -*>----------------------------------------------------------------------- -*> -*> GLM data file: -*> -*> line 1: 'GLM' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NN) -*> Values of M (row dimension). -*> -*> line 4: PVAL, INTEGER array, dimension(NN) -*> Values of P (row dimension). -*> -*> line 5: NVAL, INTEGER array, dimension(NN) -*> Values of N (column dimension), note M <= N <= M+P. -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. 
-*> The 3-character path name is 'GLM' for the generalized -*> linear regression model routines. -*> -*>----------------------------------------------------------------------- -*> -*> GQR data file: -*> -*> line 1: 'GQR' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NN) -*> Values of M. -*> -*> line 4: PVAL, INTEGER array, dimension(NN) -*> Values of P. -*> -*> line 5: NVAL, INTEGER array, dimension(NN) -*> Values of N. -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'GQR' for the generalized -*> QR and RQ routines. -*> -*>----------------------------------------------------------------------- -*> -*> GSV data file: -*> -*> line 1: 'GSV' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NN) -*> Values of M (row dimension). -*> -*> line 4: PVAL, INTEGER array, dimension(NN) -*> Values of P (row dimension). -*> -*> line 5: NVAL, INTEGER array, dimension(NN) -*> Values of N (column dimension). -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'GSV' for the generalized -*> SVD routines. -*> -*>----------------------------------------------------------------------- -*> -*> CSD data file: -*> -*> line 1: 'CSD' in columns 1 to 3. -*> -*> line 2: NM, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NM) -*> Values of M (row and column dimension of orthogonal matrix). -*> -*> line 4: PVAL, INTEGER array, dimension(NM) -*> Values of P (row dimension of top-left block). -*> -*> line 5: NVAL, INTEGER array, dimension(NM) -*> Values of N (column dimension of top-left block). -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. 
-*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'CSD' for the CSD routine. -*> -*>----------------------------------------------------------------------- -*> -*> LSE data file: -*> -*> line 1: 'LSE' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NN) -*> Values of M. -*> -*> line 4: PVAL, INTEGER array, dimension(NN) -*> Values of P. -*> -*> line 5: NVAL, INTEGER array, dimension(NN) -*> Values of N, note P <= N <= P+M. -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'GSV' for the generalized -*> SVD routines. -*> -*>----------------------------------------------------------------------- -*> -*> NMAX is currently set to 132 and must be at least 12 for some of the -*> precomputed examples, and LWORK = NMAX*(5*NMAX+5)+1 in the parameter -*> statements below. For SVD, we assume NRHS may be as big as N. The -*> parameter NEED is set to 14 to allow for 14 N-by-N matrices for SGG. -*> \endverbatim -* -* Arguments: -* ========== -* -* -* Authors: -* ======== -* -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. -* -*> \date June 2016 -* -*> \ingroup single_eig -* -* ===================================================================== - PROGRAM SCHKEE -* -* -- LAPACK test routine (version 3.7.0) -- -* -- LAPACK is a software package provided by Univ. of Tennessee, -- -* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* June 2016 -* -* ===================================================================== -* -* .. Parameters .. - INTEGER NMAX - PARAMETER ( NMAX = 132 ) - INTEGER NCMAX - PARAMETER ( NCMAX = 20 ) - INTEGER NEED - PARAMETER ( NEED = 14 ) - INTEGER LWORK - PARAMETER ( LWORK = NMAX*( 5*NMAX+5 )+1 ) - INTEGER LIWORK - PARAMETER ( LIWORK = NMAX*( 5*NMAX+20 ) ) - INTEGER MAXIN - PARAMETER ( MAXIN = 20 ) - INTEGER MAXT - PARAMETER ( MAXT = 30 ) - INTEGER NIN, NOUT - PARAMETER ( NIN = 5, NOUT = 6 ) -* .. -* .. Local Scalars .. 
- LOGICAL CSD, FATAL, GLM, GQR, GSV, LSE, NEP, SBB, SBK, - $ SBL, SEP, SES, SEV, SGG, SGK, SGL, SGS, SGV, - $ SGX, SSB, SSX, SVD, SVX, SXV, TSTCHK, TSTDIF, - $ TSTDRV, TSTERR - CHARACTER C1 - CHARACTER*3 C3, PATH - CHARACTER*32 VNAME - CHARACTER*10 INTSTR - CHARACTER*80 LINE - INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, - $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - REAL EPS, S1, S2, THRESH, THRSHN -* .. -* .. Local Arrays .. - LOGICAL DOTYPE( MAXT ), LOGWRK( NMAX ) - INTEGER IOLDSD( 4 ), ISEED( 4 ), IWORK( LIWORK ), - $ KVAL( MAXIN ), MVAL( MAXIN ), MXBVAL( MAXIN ), - $ NBCOL( MAXIN ), NBMIN( MAXIN ), NBVAL( MAXIN ), - $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), - $ PVAL( MAXIN ) - INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), - $ ISHFTS( MAXIN ), IACC22( MAXIN ) - REAL A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ), - $ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ), - $ WORK( LWORK ), X( 5*NMAX ) -* .. -* .. External Functions .. - LOGICAL LSAMEN - REAL SECOND, SLAMCH - EXTERNAL LSAMEN, SECOND, SLAMCH -* .. -* .. External Subroutines .. - EXTERNAL ALAREQ, SCHKBB, SCHKBD, SCHKBK, SCHKBL, SCHKEC, - $ SCHKGG, SCHKGK, SCHKGL, SCHKHS, SCHKSB, SCHKST, - $ SCKCSD, SCKGLM, SCKGQR, SCKGSV, SCKLSE, SDRGES, - $ SDRGEV, SDRGSX, SDRGVX, SDRVBD, SDRVES, SDRVEV, - $ SDRVSG, SDRVST, SDRVSX, SDRVVX, SERRBD, - $ SERRED, SERRGG, SERRHS, SERRST, ILAVER, XLAENV, - $ SDRGES3, SDRGEV3, - $ SCHKST2STG, SDRVST2STG, SCHKSB2STG, SDRVSG2STG -* .. -* .. Intrinsic Functions .. - INTRINSIC LEN, MIN -* .. -* .. Scalars in Common .. - LOGICAL LERR, OK - CHARACTER*32 SRNAMT - INTEGER INFOT, MAXB, NPROC, NSHIFT, NUNIT, SELDIM, - $ SELOPT -* .. -* .. Arrays in Common .. - LOGICAL SELVAL( 20 ) - INTEGER IPARMS( 100 ) - REAL SELWI( 20 ), SELWR( 20 ) -* .. -* .. Common blocks .. - COMMON / CENVIR / NPROC, NSHIFT, MAXB - COMMON / CLAENV / IPARMS - COMMON / INFOC / INFOT, NUNIT, OK, LERR - COMMON / SRNAMC / SRNAMT - COMMON / SSLCT / SELOPT, SELDIM, SELVAL, SELWR, SELWI -* .. -* .. Data statements .. - DATA INTSTR / '0123456789' / - DATA IOLDSD / 0, 0, 0, 1 / -* .. -* .. Executable Statements .. -* - A = 0.0 - B = 0.0 - C = 0.0 - D = 0.0 - S1 = SECOND( ) - FATAL = .FALSE. - NUNIT = NOUT -* -* Return to here to read multiple sets of data -* - 10 CONTINUE -* -* Read the first line and set the 3-character test path -* - READ( NIN, FMT = '(A80)', END = 380 )LINE - PATH = LINE( 1: 3 ) - NEP = LSAMEN( 3, PATH, 'NEP' ) .OR. LSAMEN( 3, PATH, 'SHS' ) - SEP = LSAMEN( 3, PATH, 'SEP' ) .OR. LSAMEN( 3, PATH, 'SST' ) .OR. - $ LSAMEN( 3, PATH, 'SSG' ) .OR. LSAMEN( 3, PATH, 'SE2' ) - SVD = LSAMEN( 3, PATH, 'SVD' ) .OR. LSAMEN( 3, PATH, 'DBD' ) - SVD = LSAMEN( 3, PATH, 'SVD' ) .OR. LSAMEN( 3, PATH, 'SBD' ) - SEV = LSAMEN( 3, PATH, 'SEV' ) - SES = LSAMEN( 3, PATH, 'SES' ) - SVX = LSAMEN( 3, PATH, 'SVX' ) - SSX = LSAMEN( 3, PATH, 'SSX' ) - SGG = LSAMEN( 3, PATH, 'SGG' ) - SGS = LSAMEN( 3, PATH, 'SGS' ) - SGX = LSAMEN( 3, PATH, 'SGX' ) - SGV = LSAMEN( 3, PATH, 'SGV' ) - SXV = LSAMEN( 3, PATH, 'SXV' ) - SSB = LSAMEN( 3, PATH, 'SSB' ) - SBB = LSAMEN( 3, PATH, 'SBB' ) - GLM = LSAMEN( 3, PATH, 'GLM' ) - GQR = LSAMEN( 3, PATH, 'GQR' ) .OR. LSAMEN( 3, PATH, 'GRQ' ) - GSV = LSAMEN( 3, PATH, 'GSV' ) - CSD = LSAMEN( 3, PATH, 'CSD' ) - LSE = LSAMEN( 3, PATH, 'LSE' ) - SBL = LSAMEN( 3, PATH, 'SBL' ) - SBK = LSAMEN( 3, PATH, 'SBK' ) - SGL = LSAMEN( 3, PATH, 'SGL' ) - SGK = LSAMEN( 3, PATH, 'SGK' ) -* -* Report values of parameters. -* - IF( PATH.EQ.' 
' ) THEN - GO TO 10 - ELSE IF( NEP ) THEN - WRITE( NOUT, FMT = 9987 ) - ELSE IF( SEP ) THEN - WRITE( NOUT, FMT = 9986 ) - ELSE IF( SVD ) THEN - WRITE( NOUT, FMT = 9985 ) - ELSE IF( SEV ) THEN - WRITE( NOUT, FMT = 9979 ) - ELSE IF( SES ) THEN - WRITE( NOUT, FMT = 9978 ) - ELSE IF( SVX ) THEN - WRITE( NOUT, FMT = 9977 ) - ELSE IF( SSX ) THEN - WRITE( NOUT, FMT = 9976 ) - ELSE IF( SGG ) THEN - WRITE( NOUT, FMT = 9975 ) - ELSE IF( SGS ) THEN - WRITE( NOUT, FMT = 9964 ) - ELSE IF( SGX ) THEN - WRITE( NOUT, FMT = 9965 ) - ELSE IF( SGV ) THEN - WRITE( NOUT, FMT = 9963 ) - ELSE IF( SXV ) THEN - WRITE( NOUT, FMT = 9962 ) - ELSE IF( SSB ) THEN - WRITE( NOUT, FMT = 9974 ) - ELSE IF( SBB ) THEN - WRITE( NOUT, FMT = 9967 ) - ELSE IF( GLM ) THEN - WRITE( NOUT, FMT = 9971 ) - ELSE IF( GQR ) THEN - WRITE( NOUT, FMT = 9970 ) - ELSE IF( GSV ) THEN - WRITE( NOUT, FMT = 9969 ) - ELSE IF( CSD ) THEN - WRITE( NOUT, FMT = 9960 ) - ELSE IF( LSE ) THEN - WRITE( NOUT, FMT = 9968 ) - ELSE IF( SBL ) THEN -* -* SGEBAL: Balancing -* - CALL SCHKBL( NIN, NOUT ) - GO TO 10 - ELSE IF( SBK ) THEN -* -* SGEBAK: Back transformation -* - CALL SCHKBK( NIN, NOUT ) - GO TO 10 - ELSE IF( SGL ) THEN -* -* SGGBAL: Balancing -* - CALL SCHKGL( NIN, NOUT ) - GO TO 10 - ELSE IF( SGK ) THEN -* -* SGGBAK: Back transformation -* - CALL SCHKGK( NIN, NOUT ) - GO TO 10 - ELSE IF( LSAMEN( 3, PATH, 'SEC' ) ) THEN -* -* SEC: Eigencondition estimation -* - READ( NIN, FMT = * )THRESH - CALL XLAENV( 1, 1 ) - CALL XLAENV( 12, 11 ) - CALL XLAENV( 13, 2 ) - CALL XLAENV( 14, 0 ) - CALL XLAENV( 15, 2 ) - CALL XLAENV( 16, 2 ) - TSTERR = .TRUE. - CALL SCHKEC( THRESH, TSTERR, NIN, NOUT ) - GO TO 10 - ELSE - WRITE( NOUT, FMT = 9992 )PATH - GO TO 10 - END IF - CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) - WRITE( NOUT, FMT = 9972 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH - WRITE( NOUT, FMT = 9984 ) -* -* Read the number of values of M, P, and N. -* - READ( NIN, FMT = * )NN - IF( NN.LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' NN ', NN, 1 - NN = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9988 )' NN ', NN, MAXIN - NN = 0 - FATAL = .TRUE. - END IF -* -* Read the values of M -* - IF( .NOT.( SGX .OR. SXV ) ) THEN - READ( NIN, FMT = * )( MVAL( I ), I = 1, NN ) - IF( SVD ) THEN - VNAME = ' M ' - ELSE - VNAME = ' N ' - END IF - DO 20 I = 1, NN - IF( MVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )VNAME, MVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( MVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )VNAME, MVAL( I ), NMAX - FATAL = .TRUE. - END IF - 20 CONTINUE - WRITE( NOUT, FMT = 9983 )'M: ', ( MVAL( I ), I = 1, NN ) - END IF -* -* Read the values of P -* - IF( GLM .OR. GQR .OR. GSV .OR. CSD .OR. LSE ) THEN - READ( NIN, FMT = * )( PVAL( I ), I = 1, NN ) - DO 30 I = 1, NN - IF( PVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' P ', PVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( PVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' P ', PVAL( I ), NMAX - FATAL = .TRUE. - END IF - 30 CONTINUE - WRITE( NOUT, FMT = 9983 )'P: ', ( PVAL( I ), I = 1, NN ) - END IF -* -* Read the values of N -* - IF( SVD .OR. SBB .OR. GLM .OR. GQR .OR. GSV .OR. CSD .OR. - $ LSE ) THEN - READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) - DO 40 I = 1, NN - IF( NVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' N ', NVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' N ', NVAL( I ), NMAX - FATAL = .TRUE. - END IF - 40 CONTINUE - ELSE - DO 50 I = 1, NN - NVAL( I ) = MVAL( I ) - 50 CONTINUE - END IF - IF( .NOT.( SGX .OR. 
SXV ) ) THEN - WRITE( NOUT, FMT = 9983 )'N: ', ( NVAL( I ), I = 1, NN ) - ELSE - WRITE( NOUT, FMT = 9983 )'N: ', NN - END IF -* -* Read the number of values of K, followed by the values of K -* - IF( SSB .OR. SBB ) THEN - READ( NIN, FMT = * )NK - READ( NIN, FMT = * )( KVAL( I ), I = 1, NK ) - DO 60 I = 1, NK - IF( KVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' K ', KVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( KVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' K ', KVAL( I ), NMAX - FATAL = .TRUE. - END IF - 60 CONTINUE - WRITE( NOUT, FMT = 9983 )'K: ', ( KVAL( I ), I = 1, NK ) - END IF -* - IF( SEV .OR. SES .OR. SVX .OR. SSX ) THEN -* -* For the nonsymmetric QR driver routines, only one set of -* parameters is allowed. -* - READ( NIN, FMT = * )NBVAL( 1 ), NBMIN( 1 ), NXVAL( 1 ), - $ INMIN( 1 ), INWIN( 1 ), INIBL(1), ISHFTS(1), IACC22(1) - IF( NBVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NBMIN( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NXVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( INMIN( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' INMIN ', INMIN( 1 ), 1 - FATAL = .TRUE. - ELSE IF( INWIN( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' INWIN ', INWIN( 1 ), 1 - FATAL = .TRUE. - ELSE IF( INIBL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' INIBL ', INIBL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( ISHFTS( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' ISHFTS ', ISHFTS( 1 ), 1 - FATAL = .TRUE. - ELSE IF( IACC22( 1 ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' IACC22 ', IACC22( 1 ), 0 - FATAL = .TRUE. - END IF - CALL XLAENV( 1, NBVAL( 1 ) ) - CALL XLAENV( 2, NBMIN( 1 ) ) - CALL XLAENV( 3, NXVAL( 1 ) ) - CALL XLAENV(12, MAX( 11, INMIN( 1 ) ) ) - CALL XLAENV(13, INWIN( 1 ) ) - CALL XLAENV(14, INIBL( 1 ) ) - CALL XLAENV(15, ISHFTS( 1 ) ) - CALL XLAENV(16, IACC22( 1 ) ) - WRITE( NOUT, FMT = 9983 )'NB: ', NBVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'NBMIN:', NBMIN( 1 ) - WRITE( NOUT, FMT = 9983 )'NX: ', NXVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'INMIN: ', INMIN( 1 ) - WRITE( NOUT, FMT = 9983 )'INWIN: ', INWIN( 1 ) - WRITE( NOUT, FMT = 9983 )'INIBL: ', INIBL( 1 ) - WRITE( NOUT, FMT = 9983 )'ISHFTS: ', ISHFTS( 1 ) - WRITE( NOUT, FMT = 9983 )'IACC22: ', IACC22( 1 ) -* - ELSE IF( SGS .OR. SGX .OR. SGV .OR. SXV ) THEN -* -* For the nonsymmetric generalized driver routines, only one set -* of parameters is allowed. -* - READ( NIN, FMT = * )NBVAL( 1 ), NBMIN( 1 ), NXVAL( 1 ), - $ NSVAL( 1 ), MXBVAL( 1 ) - IF( NBVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NBMIN( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NXVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NSVAL( 1 ).LT.2 ) THEN - WRITE( NOUT, FMT = 9989 )' NS ', NSVAL( 1 ), 2 - FATAL = .TRUE. - ELSE IF( MXBVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' MAXB ', MXBVAL( 1 ), 1 - FATAL = .TRUE. - END IF - CALL XLAENV( 1, NBVAL( 1 ) ) - CALL XLAENV( 2, NBMIN( 1 ) ) - CALL XLAENV( 3, NXVAL( 1 ) ) - CALL XLAENV( 4, NSVAL( 1 ) ) - CALL XLAENV( 8, MXBVAL( 1 ) ) - WRITE( NOUT, FMT = 9983 )'NB: ', NBVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'NBMIN:', NBMIN( 1 ) - WRITE( NOUT, FMT = 9983 )'NX: ', NXVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'NS: ', NSVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'MAXB: ', MXBVAL( 1 ) -* - ELSE IF( .NOT.SSB .AND. .NOT.GLM .AND. .NOT.GQR .AND. .NOT. - $ GSV .AND. 
.NOT.CSD .AND. .NOT.LSE ) THEN -* -* For the other paths, the number of parameters can be varied -* from the input file. Read the number of parameter values. -* - READ( NIN, FMT = * )NPARMS - IF( NPARMS.LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )'NPARMS', NPARMS, 1 - NPARMS = 0 - FATAL = .TRUE. - ELSE IF( NPARMS.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9988 )'NPARMS', NPARMS, MAXIN - NPARMS = 0 - FATAL = .TRUE. - END IF -* -* Read the values of NB -* - IF( .NOT.SBB ) THEN - READ( NIN, FMT = * )( NBVAL( I ), I = 1, NPARMS ) - DO 70 I = 1, NPARMS - IF( NBVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NBVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' NB ', NBVAL( I ), NMAX - FATAL = .TRUE. - END IF - 70 CONTINUE - WRITE( NOUT, FMT = 9983 )'NB: ', - $ ( NBVAL( I ), I = 1, NPARMS ) - END IF -* -* Read the values of NBMIN -* - IF( NEP .OR. SEP .OR. SVD .OR. SGG ) THEN - READ( NIN, FMT = * )( NBMIN( I ), I = 1, NPARMS ) - DO 80 I = 1, NPARMS - IF( NBMIN( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( I ), 0 - FATAL = .TRUE. - ELSE IF( NBMIN( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )'NBMIN ', NBMIN( I ), NMAX - FATAL = .TRUE. - END IF - 80 CONTINUE - WRITE( NOUT, FMT = 9983 )'NBMIN:', - $ ( NBMIN( I ), I = 1, NPARMS ) - ELSE - DO 90 I = 1, NPARMS - NBMIN( I ) = 1 - 90 CONTINUE - END IF -* -* Read the values of NX -* - IF( NEP .OR. SEP .OR. SVD ) THEN - READ( NIN, FMT = * )( NXVAL( I ), I = 1, NPARMS ) - DO 100 I = 1, NPARMS - IF( NXVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NXVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' NX ', NXVAL( I ), NMAX - FATAL = .TRUE. - END IF - 100 CONTINUE - WRITE( NOUT, FMT = 9983 )'NX: ', - $ ( NXVAL( I ), I = 1, NPARMS ) - ELSE - DO 110 I = 1, NPARMS - NXVAL( I ) = 1 - 110 CONTINUE - END IF -* -* Read the values of NSHIFT (if SGG) or NRHS (if SVD -* or SBB). -* - IF( SVD .OR. SBB .OR. SGG ) THEN - READ( NIN, FMT = * )( NSVAL( I ), I = 1, NPARMS ) - DO 120 I = 1, NPARMS - IF( NSVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' NS ', NSVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NSVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' NS ', NSVAL( I ), NMAX - FATAL = .TRUE. - END IF - 120 CONTINUE - WRITE( NOUT, FMT = 9983 )'NS: ', - $ ( NSVAL( I ), I = 1, NPARMS ) - ELSE - DO 130 I = 1, NPARMS - NSVAL( I ) = 1 - 130 CONTINUE - END IF -* -* Read the values for MAXB. -* - IF( SGG ) THEN - READ( NIN, FMT = * )( MXBVAL( I ), I = 1, NPARMS ) - DO 140 I = 1, NPARMS - IF( MXBVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' MAXB ', MXBVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( MXBVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' MAXB ', MXBVAL( I ), NMAX - FATAL = .TRUE. - END IF - 140 CONTINUE - WRITE( NOUT, FMT = 9983 )'MAXB: ', - $ ( MXBVAL( I ), I = 1, NPARMS ) - ELSE - DO 150 I = 1, NPARMS - MXBVAL( I ) = 1 - 150 CONTINUE - END IF -* -* Read the values for INMIN. -* - IF( NEP ) THEN - READ( NIN, FMT = * )( INMIN( I ), I = 1, NPARMS ) - DO 540 I = 1, NPARMS - IF( INMIN( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' INMIN ', INMIN( I ), 0 - FATAL = .TRUE. - END IF - 540 CONTINUE - WRITE( NOUT, FMT = 9983 )'INMIN: ', - $ ( INMIN( I ), I = 1, NPARMS ) - ELSE - DO 550 I = 1, NPARMS - INMIN( I ) = 1 - 550 CONTINUE - END IF -* -* Read the values for INWIN. -* - IF( NEP ) THEN - READ( NIN, FMT = * )( INWIN( I ), I = 1, NPARMS ) - DO 560 I = 1, NPARMS - IF( INWIN( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' INWIN ', INWIN( I ), 0 - FATAL = .TRUE. 
- END IF - 560 CONTINUE - WRITE( NOUT, FMT = 9983 )'INWIN: ', - $ ( INWIN( I ), I = 1, NPARMS ) - ELSE - DO 570 I = 1, NPARMS - INWIN( I ) = 1 - 570 CONTINUE - END IF -* -* Read the values for INIBL. -* - IF( NEP ) THEN - READ( NIN, FMT = * )( INIBL( I ), I = 1, NPARMS ) - DO 580 I = 1, NPARMS - IF( INIBL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' INIBL ', INIBL( I ), 0 - FATAL = .TRUE. - END IF - 580 CONTINUE - WRITE( NOUT, FMT = 9983 )'INIBL: ', - $ ( INIBL( I ), I = 1, NPARMS ) - ELSE - DO 590 I = 1, NPARMS - INIBL( I ) = 1 - 590 CONTINUE - END IF -* -* Read the values for ISHFTS. -* - IF( NEP ) THEN - READ( NIN, FMT = * )( ISHFTS( I ), I = 1, NPARMS ) - DO 600 I = 1, NPARMS - IF( ISHFTS( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' ISHFTS ', ISHFTS( I ), 0 - FATAL = .TRUE. - END IF - 600 CONTINUE - WRITE( NOUT, FMT = 9983 )'ISHFTS: ', - $ ( ISHFTS( I ), I = 1, NPARMS ) - ELSE - DO 610 I = 1, NPARMS - ISHFTS( I ) = 1 - 610 CONTINUE - END IF -* -* Read the values for IACC22. -* - IF( NEP .OR. SGG ) THEN - READ( NIN, FMT = * )( IACC22( I ), I = 1, NPARMS ) - DO 620 I = 1, NPARMS - IF( IACC22( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' IACC22 ', IACC22( I ), 0 - FATAL = .TRUE. - END IF - 620 CONTINUE - WRITE( NOUT, FMT = 9983 )'IACC22: ', - $ ( IACC22( I ), I = 1, NPARMS ) - ELSE - DO 630 I = 1, NPARMS - IACC22( I ) = 1 - 630 CONTINUE - END IF -* -* Read the values for NBCOL. -* - IF( SGG ) THEN - READ( NIN, FMT = * )( NBCOL( I ), I = 1, NPARMS ) - DO 160 I = 1, NPARMS - IF( NBCOL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )'NBCOL ', NBCOL( I ), 0 - FATAL = .TRUE. - ELSE IF( NBCOL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )'NBCOL ', NBCOL( I ), NMAX - FATAL = .TRUE. - END IF - 160 CONTINUE - WRITE( NOUT, FMT = 9983 )'NBCOL:', - $ ( NBCOL( I ), I = 1, NPARMS ) - ELSE - DO 170 I = 1, NPARMS - NBCOL( I ) = 1 - 170 CONTINUE - END IF - END IF -* -* Calculate and print the machine dependent constants. -* - WRITE( NOUT, FMT = * ) - EPS = SLAMCH( 'Underflow threshold' ) - WRITE( NOUT, FMT = 9981 )'underflow', EPS - EPS = SLAMCH( 'Overflow threshold' ) - WRITE( NOUT, FMT = 9981 )'overflow ', EPS - EPS = SLAMCH( 'Epsilon' ) - WRITE( NOUT, FMT = 9981 )'precision', EPS -* -* Read the threshold value for the test ratios. -* - READ( NIN, FMT = * )THRESH - WRITE( NOUT, FMT = 9982 )THRESH - IF( SEP .OR. SVD .OR. SGG ) THEN -* -* Read the flag that indicates whether to test LAPACK routines. -* - READ( NIN, FMT = * )TSTCHK -* -* Read the flag that indicates whether to test driver routines. -* - READ( NIN, FMT = * )TSTDRV - END IF -* -* Read the flag that indicates whether to test the error exits. -* - READ( NIN, FMT = * )TSTERR -* -* Read the code describing how to set the random number seed. -* - READ( NIN, FMT = * )NEWSD -* -* If NEWSD = 2, read another line with 4 integers for the seed. -* - IF( NEWSD.EQ.2 ) - $ READ( NIN, FMT = * )( IOLDSD( I ), I = 1, 4 ) -* - DO 180 I = 1, 4 - ISEED( I ) = IOLDSD( I ) - 180 CONTINUE -* - IF( FATAL ) THEN - WRITE( NOUT, FMT = 9999 ) - STOP - END IF -* -* Read the input lines indicating the test path and its parameters. -* The first three characters indicate the test path, and the number -* of test matrix types must be the first nonblank item in columns -* 4-80. -* - 190 CONTINUE -* - IF( .NOT.( SGX .OR. 
SXV ) ) THEN -* - 200 CONTINUE - READ( NIN, FMT = '(A80)', END = 380 )LINE - C3 = LINE( 1: 3 ) - LENP = LEN( LINE ) - I = 3 - ITMP = 0 - I1 = 0 - 210 CONTINUE - I = I + 1 - IF( I.GT.LENP ) THEN - IF( I1.GT.0 ) THEN - GO TO 240 - ELSE - NTYPES = MAXT - GO TO 240 - END IF - END IF - IF( LINE( I: I ).NE.' ' .AND. LINE( I: I ).NE.',' ) THEN - I1 = I - C1 = LINE( I1: I1 ) -* -* Check that a valid integer was read -* - DO 220 K = 1, 10 - IF( C1.EQ.INTSTR( K: K ) ) THEN - IC = K - 1 - GO TO 230 - END IF - 220 CONTINUE - WRITE( NOUT, FMT = 9991 )I, LINE - GO TO 200 - 230 CONTINUE - ITMP = 10*ITMP + IC - GO TO 210 - ELSE IF( I1.GT.0 ) THEN - GO TO 240 - ELSE - GO TO 210 - END IF - 240 CONTINUE - NTYPES = ITMP -* -* Skip the tests if NTYPES is <= 0. -* - IF( .NOT.( SEV .OR. SES .OR. SVX .OR. SSX .OR. SGV .OR. - $ SGS ) .AND. NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - GO TO 200 - END IF -* - ELSE - IF( SXV ) - $ C3 = 'SXV' - IF( SGX ) - $ C3 = 'SGX' - END IF -* -* Reset the random number seed. -* - IF( NEWSD.EQ.0 ) THEN - DO 250 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 250 CONTINUE - END IF -* - IF( LSAMEN( 3, C3, 'SHS' ) .OR. LSAMEN( 3, C3, 'NEP' ) ) THEN -* -* ------------------------------------- -* NEP: Nonsymmetric Eigenvalue Problem -* ------------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NX = crossover point -* NS = number of shifts -* MAXB = minimum submatrix size -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL SERRHS( 'SHSEQR', NOUT ) - DO 270 I = 1, NPARMS - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 3, NXVAL( I ) ) - CALL XLAENV(12, MAX( 11, INMIN( I ) ) ) - CALL XLAENV(13, INWIN( I ) ) - CALL XLAENV(14, INIBL( I ) ) - CALL XLAENV(15, ISHFTS( I ) ) - CALL XLAENV(16, IACC22( I ) ) -* - IF( NEWSD.EQ.0 ) THEN - DO 260 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 260 CONTINUE - END IF - WRITE( NOUT, FMT = 9961 )C3, NBVAL( I ), NBMIN( I ), - $ NXVAL( I ), MAX( 11, INMIN(I)), - $ INWIN( I ), INIBL( I ), ISHFTS( I ), IACC22( I ) - CALL SCHKHS( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 5 ), NMAX, A( 1, 6 ), - $ A( 1, 7 ), D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), - $ D( 1, 4 ), D( 1, 5 ), D( 1, 6 ), A( 1, 8 ), - $ A( 1, 9 ), A( 1, 10 ), A( 1, 11 ), A( 1, 12 ), - $ D( 1, 7 ), WORK, LWORK, IWORK, LOGWRK, RESULT, - $ INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SCHKHS', INFO - 270 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'SST' ) .OR. LSAMEN( 3, C3, 'SEP' ) - $ .OR. 
LSAMEN( 3, C3, 'SE2' ) ) THEN -* -* ---------------------------------- -* SEP: Symmetric Eigenvalue Problem -* ---------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NX = crossover point -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 1, 1 ) - CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL SERRST( 'SST', NOUT ) - DO 290 I = 1, NPARMS - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 3, NXVAL( I ) ) -* - IF( NEWSD.EQ.0 ) THEN - DO 280 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 280 CONTINUE - END IF - WRITE( NOUT, FMT = 9997 )C3, NBVAL( I ), NBMIN( I ), - $ NXVAL( I ) - IF( TSTCHK ) THEN - IF( LSAMEN( 3, C3, 'SE2' ) ) THEN - CALL SCHKST2STG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), D( 1, 1 ), - $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), - $ D( 1, 6 ), D( 1, 7 ), D( 1, 8 ), D( 1, 9 ), - $ D( 1, 10 ), D( 1, 11 ), A( 1, 3 ), NMAX, - $ A( 1, 4 ), A( 1, 5 ), D( 1, 12 ), A( 1, 6 ), - $ WORK, LWORK, IWORK, LIWORK, RESULT, INFO ) - ELSE - CALL SCHKST( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), D( 1, 1 ), - $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), - $ D( 1, 6 ), D( 1, 7 ), D( 1, 8 ), D( 1, 9 ), - $ D( 1, 10 ), D( 1, 11 ), A( 1, 3 ), NMAX, - $ A( 1, 4 ), A( 1, 5 ), D( 1, 12 ), A( 1, 6 ), - $ WORK, LWORK, IWORK, LIWORK, RESULT, INFO ) - ENDIF - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SCHKST', INFO - END IF - IF( TSTDRV ) THEN - IF( LSAMEN( 3, C3, 'SE2' ) ) THEN - CALL SDRVST2STG( NN, NVAL, 18, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, D( 1, 3 ), D( 1, 4 ), - $ D( 1, 5 ), D( 1, 6 ), D( 1, 8 ), D( 1, 9 ), - $ D( 1, 10 ), D( 1, 11), A( 1, 2 ), NMAX, - $ A( 1, 3 ), D( 1, 12 ), A( 1, 4 ), WORK, - $ LWORK, IWORK, LIWORK, RESULT, INFO ) - ELSE - CALL SDRVST( NN, NVAL, 18, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, D( 1, 3 ), D( 1, 4 ), - $ D( 1, 5 ), D( 1, 6 ), D( 1, 8 ), D( 1, 9 ), - $ D( 1, 10 ), D( 1, 11), A( 1, 2 ), NMAX, - $ A( 1, 3 ), D( 1, 12 ), A( 1, 4 ), WORK, - $ LWORK, IWORK, LIWORK, RESULT, INFO ) - ENDIF - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SDRVST', INFO - END IF - 290 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'SSG' ) ) THEN -* -* ---------------------------------------------- -* SSG: Symmetric Generalized Eigenvalue Problem -* ---------------------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NX = crossover point -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 9, 25 ) - DO 310 I = 1, NPARMS - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 3, NXVAL( I ) ) -* - IF( NEWSD.EQ.0 ) THEN - DO 300 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 300 CONTINUE - END IF - WRITE( NOUT, FMT = 9997 )C3, NBVAL( I ), NBMIN( I ), - $ NXVAL( I ) - IF( TSTCHK ) THEN -* CALL SDRVSG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, -* $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, -* $ D( 1, 3 ), A( 1, 3 ), NMAX, A( 1, 4 ), -* $ A( 1, 5 ), A( 1, 6 ), A( 1, 7 ), WORK, -* $ LWORK, IWORK, LIWORK, RESULT, INFO ) - CALL SDRVSG2STG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, - $ D( 1, 3 ), D( 1, 3 ), A( 1, 3 ), NMAX, - $ A( 1, 4 ), A( 1, 5 ), A( 1, 6 ), - $ A( 1, 7 ), WORK, LWORK, IWORK, LIWORK, - $ RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SDRVSG', INFO - END IF - 310 CONTINUE 
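Each parameter-set loop above, like the SVD loop that follows, repeats the same pattern: load the candidate blocking parameters into the environment with XLAENV, reset ISEED from IOLDSD when NEWSD = 0, then invoke the check or driver routine and report any nonzero INFO. Below is a minimal standalone sketch of the first step; the program name is made up, and it assumes linkage against the TESTING stub of ILAENV (which hands back the value XLAENV stored), as SCHKEE is, rather than the library version:

*     Hypothetical demo, not part of SCHKEE.  Assumes the TESTING
*     build of ILAENV, which returns values stored by XLAENV.
      PROGRAM XLADEM
      INTEGER ILAENV, NB
      EXTERNAL ILAENV
*     Request block size NB = 64 (environment parameter 1).
      CALL XLAENV( 1, 64 )
*     The testing ILAENV returns the stored value instead of its
*     usual machine-dependent choice.
      NB = ILAENV( 1, 'SGETRF', ' ', 100, 100, -1, -1 )
      WRITE( *, * ) 'NB = ', NB
      END

The stored values live in the common block /CLAENV/ and persist until overwritten, which is why every pass of the loops above re-issues the XLAENV calls before running the check routine.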
-* - ELSE IF( LSAMEN( 3, C3, 'SBD' ) .OR. LSAMEN( 3, C3, 'SVD' ) ) THEN -* -* ---------------------------------- -* SVD: Singular Value Decomposition -* ---------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NX = crossover point -* NRHS = number of right hand sides -* - MAXTYP = 16 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 1, 1 ) - CALL XLAENV( 9, 25 ) -* -* Test the error exits -* - IF( TSTERR .AND. TSTCHK ) - $ CALL SERRBD( 'SBD', NOUT ) - IF( TSTERR .AND. TSTDRV ) - $ CALL SERRED( 'SBD', NOUT ) -* - DO 330 I = 1, NPARMS - NRHS = NSVAL( I ) - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 3, NXVAL( I ) ) - IF( NEWSD.EQ.0 ) THEN - DO 320 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 320 CONTINUE - END IF - WRITE( NOUT, FMT = 9995 )C3, NBVAL( I ), NBMIN( I ), - $ NXVAL( I ), NRHS - IF( TSTCHK ) THEN - CALL SCHKBD( NN, MVAL, NVAL, MAXTYP, DOTYPE, NRHS, ISEED, - $ THRESH, A( 1, 1 ), NMAX, D( 1, 1 ), - $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), A( 1, 2 ), - $ NMAX, A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), NMAX, - $ A( 1, 6 ), NMAX, A( 1, 7 ), A( 1, 8 ), WORK, - $ LWORK, IWORK, NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SCHKBD', INFO - END IF - IF( TSTDRV ) - $ CALL SDRVBD( NN, MVAL, NVAL, MAXTYP, DOTYPE, ISEED, - $ THRESH, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, - $ A( 1, 3 ), NMAX, A( 1, 4 ), A( 1, 5 ), - $ A( 1, 6 ), D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), - $ WORK, LWORK, IWORK, NOUT, INFO ) - 330 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'SEV' ) ) THEN -* -* -------------------------------------------- -* SEV: Nonsymmetric Eigenvalue Problem Driver -* SGEEV (eigenvalues and eigenvectors) -* -------------------------------------------- -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL SERRED( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL SDRVEV( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), D( 1, 1 ), - $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), A( 1, 3 ), - $ NMAX, A( 1, 4 ), NMAX, A( 1, 5 ), NMAX, RESULT, - $ WORK, LWORK, IWORK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SGEEV', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'SES' ) ) THEN -* -* -------------------------------------------- -* SES: Nonsymmetric Eigenvalue Problem Driver -* SGEES (Schur form) -* -------------------------------------------- -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL SERRED( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL SDRVES( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), - $ A( 1, 4 ), NMAX, RESULT, WORK, LWORK, IWORK, - $ LOGWRK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SGEES', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'SVX' ) ) THEN -* -* -------------------------------------------------------------- -* SVX: Nonsymmetric Eigenvalue Problem Expert Driver -* SGEEVX (eigenvalues, eigenvectors and condition numbers) -* -------------------------------------------------------------- -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LT.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL SERRED( C3, NOUT ) - CALL ALAREQ( 
C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL SDRVVX( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NIN, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), D( 1, 1 ), - $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), A( 1, 3 ), - $ NMAX, A( 1, 4 ), NMAX, A( 1, 5 ), NMAX, - $ D( 1, 5 ), D( 1, 6 ), D( 1, 7 ), D( 1, 8 ), - $ D( 1, 9 ), D( 1, 10 ), D( 1, 11 ), D( 1, 12 ), - $ RESULT, WORK, LWORK, IWORK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SGEEVX', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'SSX' ) ) THEN -* -* --------------------------------------------------- -* SSX: Nonsymmetric Eigenvalue Problem Expert Driver -* SGEESX (Schur form and condition numbers) -* --------------------------------------------------- -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LT.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL SERRED( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL SDRVSX( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NIN, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), - $ D( 1, 5 ), D( 1, 6 ), A( 1, 4 ), NMAX, - $ A( 1, 5 ), RESULT, WORK, LWORK, IWORK, LOGWRK, - $ INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SGEESX', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'SGG' ) ) THEN -* -* ------------------------------------------------- -* SGG: Generalized Nonsymmetric Eigenvalue Problem -* ------------------------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NS = number of shifts -* MAXB = minimum submatrix size -* IACC22: structured matrix multiply -* NBCOL = minimum column dimension for blocks -* - MAXTYP = 26 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV(1,1) - IF( TSTCHK .AND. TSTERR ) - & CALL SERRGG( C3, NOUT ) - DO 350 I = 1, NPARMS - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 4, NSVAL( I ) ) - CALL XLAENV( 8, MXBVAL( I ) ) - CALL XLAENV( 16, IACC22( I ) ) - CALL XLAENV( 5, NBCOL( I ) ) -* - IF( NEWSD.EQ.0 ) THEN - DO 340 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 340 CONTINUE - END IF - WRITE( NOUT, FMT = 9996 )C3, NBVAL( I ), NBMIN( I ), - $ NSVAL( I ), MXBVAL( I ), IACC22( I ), NBCOL( I ) - TSTDIF = .FALSE. - THRSHN = 10. 
- IF( TSTCHK ) THEN - CALL SCHKGG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, - $ TSTDIF, THRSHN, NOUT, A( 1, 1 ), NMAX, - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ A( 1, 6 ), A( 1, 7 ), A( 1, 8 ), A( 1, 9 ), - $ NMAX, A( 1, 10 ), A( 1, 11 ), A( 1, 12 ), - $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), - $ D( 1, 5 ), D( 1, 6 ), A( 1, 13 ), - $ A( 1, 14 ), WORK, LWORK, LOGWRK, RESULT, - $ INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SCHKGG', INFO - END IF - 350 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'SGS' ) ) THEN -* -* ------------------------------------------------- -* SGS: Generalized Nonsymmetric Eigenvalue Problem -* SGGES (Schur form) -* ------------------------------------------------- -* - MAXTYP = 26 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL SERRGG( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL SDRGES( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), - $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), WORK, LWORK, - $ RESULT, LOGWRK, INFO ) -* - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SDRGES', INFO -* -* Blocked version -* - CALL XLAENV(16,1) - CALL SDRGES3( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), - $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), WORK, LWORK, - $ RESULT, LOGWRK, INFO ) -* - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SDRGES3', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( SGX ) THEN -* -* ------------------------------------------------- -* SGX: Generalized Nonsymmetric Eigenvalue Problem -* SGGESX (Schur form and condition numbers) -* ------------------------------------------------- -* - MAXTYP = 5 - NTYPES = MAXTYP - IF( NN.LT.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL SERRGG( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 5, 2 ) - CALL SDRGSX( NN, NCMAX, THRESH, NIN, NOUT, A( 1, 1 ), NMAX, - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ A( 1, 6 ), D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), - $ C( 1, 1 ), NCMAX*NCMAX, A( 1, 12 ), WORK, - $ LWORK, IWORK, LIWORK, LOGWRK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SDRGSX', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'SGV' ) ) THEN -* -* ------------------------------------------------- -* SGV: Generalized Nonsymmetric Eigenvalue Problem -* SGGEV (Eigenvalue/vector form) -* ------------------------------------------------- -* - MAXTYP = 26 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL SERRGG( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL SDRGEV( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), - $ A( 1, 9 ), NMAX, D( 1, 1 ), D( 1, 2 ), - $ D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), D( 1, 6 ), - $ WORK, LWORK, RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SDRGEV', INFO -* -* Blocked version -* - CALL SDRGEV3( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), - $ A( 1, 9 ), NMAX, D( 1, 1 ), D( 1, 2 ), - $ D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), D( 1, 6 ), - $ WORK, LWORK, RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SDRGEV3', INFO - END IF - 
WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( SXV ) THEN -* -* ------------------------------------------------- -* SXV: Generalized Nonsymmetric Eigenvalue Problem -* SGGEVX (eigenvalue/vector with condition numbers) -* ------------------------------------------------- -* - MAXTYP = 2 - NTYPES = MAXTYP - IF( NN.LT.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL SERRGG( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL SDRGVX( NN, THRESH, NIN, NOUT, A( 1, 1 ), NMAX, - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), D( 1, 1 ), - $ D( 1, 2 ), D( 1, 3 ), A( 1, 5 ), A( 1, 6 ), - $ IWORK( 1 ), IWORK( 2 ), D( 1, 4 ), D( 1, 5 ), - $ D( 1, 6 ), D( 1, 7 ), D( 1, 8 ), D( 1, 9 ), - $ WORK, LWORK, IWORK( 3 ), LIWORK-2, RESULT, - $ LOGWRK, INFO ) -* - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SDRGVX', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'SSB' ) ) THEN -* -* ------------------------------ -* SSB: Symmetric Band Reduction -* ------------------------------ -* - MAXTYP = 15 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - IF( TSTERR ) - $ CALL SERRST( 'SSB', NOUT ) -* CALL SCHKSB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, -* $ NOUT, A( 1, 1 ), NMAX, D( 1, 1 ), D( 1, 2 ), -* $ A( 1, 2 ), NMAX, WORK, LWORK, RESULT, INFO ) - CALL SCHKSB2STG( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, - $ THRESH, NOUT, A( 1, 1 ), NMAX, D( 1, 1 ), - $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), - $ A( 1, 2 ), NMAX, WORK, LWORK, RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SCHKSB', INFO -* - ELSE IF( LSAMEN( 3, C3, 'SBB' ) ) THEN -* -* ------------------------------ -* SBB: General Band Reduction -* ------------------------------ -* - MAXTYP = 15 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - DO 370 I = 1, NPARMS - NRHS = NSVAL( I ) -* - IF( NEWSD.EQ.0 ) THEN - DO 360 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 360 CONTINUE - END IF - WRITE( NOUT, FMT = 9966 )C3, NRHS - CALL SCHKBB( NN, MVAL, NVAL, NK, KVAL, MAXTYP, DOTYPE, NRHS, - $ ISEED, THRESH, NOUT, A( 1, 1 ), NMAX, - $ A( 1, 2 ), 2*NMAX, D( 1, 1 ), D( 1, 2 ), - $ A( 1, 4 ), NMAX, A( 1, 5 ), NMAX, A( 1, 6 ), - $ NMAX, A( 1, 7 ), WORK, LWORK, RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SCHKBB', INFO - 370 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'GLM' ) ) THEN -* -* ----------------------------------------- -* GLM: Generalized Linear Regression Model -* ----------------------------------------- -* - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL SERRGG( 'GLM', NOUT ) - CALL SCKGLM( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), X, - $ WORK, D( 1, 1 ), NIN, NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SCKGLM', INFO -* - ELSE IF( LSAMEN( 3, C3, 'GQR' ) ) THEN -* -* ------------------------------------------ -* GQR: Generalized QR and RQ factorizations -* ------------------------------------------ -* - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL SERRGG( 'GQR', NOUT ) - CALL SCKGQR( NN, MVAL, NN, PVAL, NN, NVAL, NTYPES, ISEED, - $ THRESH, NMAX, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), TAUA, B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ B( 1, 4 ), B( 1, 5 ), TAUB, WORK, D( 1, 1 ), NIN, - $ NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SCKGQR', INFO -* - ELSE IF( LSAMEN( 3, C3, 'GSV' ) ) THEN -* -* ---------------------------------------------- -* GSV: Generalized Singular Value Decomposition -* 
---------------------------------------------- -* - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL SERRGG( 'GSV', NOUT ) - CALL SCKGSV( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ A( 1, 3 ), B( 1, 3 ), A( 1, 4 ), TAUA, TAUB, - $ B( 1, 4 ), IWORK, WORK, D( 1, 1 ), NIN, NOUT, - $ INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SCKGSV', INFO -* - ELSE IF( LSAMEN( 3, C3, 'CSD' ) ) THEN -* -* ---------------------------------------------- -* CSD: CS Decomposition -* ---------------------------------------------- -* - CALL XLAENV(1,1) - IF( TSTERR ) - $ CALL SERRGG( 'CSD', NOUT ) - CALL SCKCSD( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), - $ A( 1, 5 ), A( 1, 6 ), A( 1, 7 ), IWORK, WORK, - $ D( 1, 1 ), NIN, NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SCKCSD', INFO -* - ELSE IF( LSAMEN( 3, C3, 'LSE' ) ) THEN -* -* -------------------------------------- -* LSE: Constrained Linear Least Squares -* -------------------------------------- -* - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL SERRGG( 'LSE', NOUT ) - CALL SCKLSE( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), X, - $ WORK, D( 1, 1 ), NIN, NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'SCKLSE', INFO -* - ELSE - WRITE( NOUT, FMT = * ) - WRITE( NOUT, FMT = * ) - WRITE( NOUT, FMT = 9992 )C3 - END IF - IF( .NOT.( SGX .OR. SXV ) ) - $ GO TO 190 - 380 CONTINUE - WRITE( NOUT, FMT = 9994 ) - S2 = SECOND( ) - WRITE( NOUT, FMT = 9993 )S2 - S1 -* - 9999 FORMAT( / ' Execution not attempted due to input errors' ) - 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) - 9996 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NS =', I4, - $ ', MAXB =', I4, ', IACC22 =', I4, ', NBCOL =', I4 ) - 9995 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4, - $ ', NRHS =', I4 ) - 9994 FORMAT( / / ' End of tests' ) - 9993 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) - 9992 FORMAT( 1X, A3, ': Unrecognized path name' ) - 9991 FORMAT( / / ' *** Invalid integer value in column ', I2, - $ ' of input', ' line:', / A79 ) - 9990 FORMAT( / / 1X, A3, ' routines were not tested' ) - 9989 FORMAT( ' Invalid input value: ', A, '=', I6, '; must be >=', - $ I6 ) - 9988 FORMAT( ' Invalid input value: ', A, '=', I6, '; must be <=', - $ I6 ) - 9987 FORMAT( ' Tests of the Nonsymmetric Eigenvalue Problem routines' ) - 9986 FORMAT( ' Tests of the Symmetric Eigenvalue Problem routines' ) - 9985 FORMAT( ' Tests of the Singular Value Decomposition routines' ) - 9984 FORMAT( / ' The following parameter values will be used:' ) - 9983 FORMAT( 4X, A, 10I6, / 10X, 10I6 ) - 9982 FORMAT( / ' Routines pass computational tests if test ratio is ', - $ 'less than', F8.2, / ) - 9981 FORMAT( ' Relative machine ', A, ' is taken to be', E16.6 ) - 9980 FORMAT( ' *** Error code from ', A, ' = ', I4 ) - 9979 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Driver', - $ / ' SGEEV (eigenvalues and eigenvectors)' ) - 9978 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Driver', - $ / ' SGEES (Schur form)' ) - 9977 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Expert', - $ ' Driver', / ' SGEEVX (eigenvalues, eigenvectors and', - $ ' condition numbers)' ) - 9976 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Expert', - $ ' Driver', / ' SGEESX (Schur form and condition', - $ ' numbers)' ) - 9975 FORMAT( / ' Tests of the Generalized
Nonsymmetric Eigenvalue ', - $ 'Problem routines' ) - 9974 FORMAT( ' Tests of SSBTRD', / ' (reduction of a symmetric band ', - $ 'matrix to tridiagonal form)' ) - 9973 FORMAT( / 1X, 71( '-' ) ) - 9972 FORMAT( / ' LAPACK VERSION ', I1, '.', I1, '.', I1 ) - 9971 FORMAT( / ' Tests of the Generalized Linear Regression Model ', - $ 'routines' ) - 9970 FORMAT( / ' Tests of the Generalized QR and RQ routines' ) - 9969 FORMAT( / ' Tests of the Generalized Singular Value', - $ ' Decomposition routines' ) - 9968 FORMAT( / ' Tests of the Linear Least Squares routines' ) - 9967 FORMAT( ' Tests of SGBBRD', / ' (reduction of a general band ', - $ 'matrix to real bidiagonal form)' ) - 9966 FORMAT( / / 1X, A3, ': NRHS =', I4 ) - 9965 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', - $ 'Problem Expert Driver SGGESX' ) - 9964 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', - $ 'Problem Driver SGGES' ) - 9963 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', - $ 'Problem Driver SGGEV' ) - 9962 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', - $ 'Problem Expert Driver SGGEVX' ) - 9961 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4, - $ ', INMIN=', I4, - $ ', INWIN =', I4, ', INIBL =', I4, ', ISHFTS =', I4, - $ ', IACC22 =', I4) - 9960 FORMAT( / ' Tests of the CS Decomposition routines' ) -* -* End of SCHKEE -* - END From 23a0d1bc1fb11a48a97c5d292730c752823f41de Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Feb 2021 18:47:06 +0100 Subject: [PATCH 125/681] Delete zchkee.f --- lapack-netlib/TESTING/EIG/zchkee.f | 2505 ---------------------------- 1 file changed, 2505 deletions(-) delete mode 100644 lapack-netlib/TESTING/EIG/zchkee.f diff --git a/lapack-netlib/TESTING/EIG/zchkee.f b/lapack-netlib/TESTING/EIG/zchkee.f deleted file mode 100644 index 6807ef7e4..000000000 --- a/lapack-netlib/TESTING/EIG/zchkee.f +++ /dev/null @@ -1,2505 +0,0 @@ -*> \brief \b ZCHKEE -* -* =========== DOCUMENTATION =========== -* -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -* Definition: -* =========== -* -* PROGRAM ZCHKEE -* -* -*> \par Purpose: -* ============= -*> -*> \verbatim -*> -*> ZCHKEE tests the COMPLEX*16 LAPACK subroutines for the matrix -*> eigenvalue problem. 
The test paths in this version are -*> -*> NEP (Nonsymmetric Eigenvalue Problem): -*> Test ZGEHRD, ZUNGHR, ZHSEQR, ZTREVC, ZHSEIN, and ZUNMHR -*> -*> SEP (Hermitian Eigenvalue Problem): -*> Test ZHETRD, ZUNGTR, ZSTEQR, ZSTERF, ZSTEIN, ZSTEDC, -*> and drivers ZHEEV(X), ZHBEV(X), ZHPEV(X), -*> ZHEEVD, ZHBEVD, ZHPEVD -*> -*> SVD (Singular Value Decomposition): -*> Test ZGEBRD, ZUNGBR, and ZBDSQR -*> and the drivers ZGESVD, ZGESDD -*> -*> ZEV (Nonsymmetric Eigenvalue/eigenvector Driver): -*> Test ZGEEV -*> -*> ZES (Nonsymmetric Schur form Driver): -*> Test ZGEES -*> -*> ZVX (Nonsymmetric Eigenvalue/eigenvector Expert Driver): -*> Test ZGEEVX -*> -*> ZSX (Nonsymmetric Schur form Expert Driver): -*> Test ZGEESX -*> -*> ZGG (Generalized Nonsymmetric Eigenvalue Problem): -*> Test ZGGHD3, ZGGBAL, ZGGBAK, ZHGEQZ, and ZTGEVC -*> -*> ZGS (Generalized Nonsymmetric Schur form Driver): -*> Test ZGGES -*> -*> ZGV (Generalized Nonsymmetric Eigenvalue/eigenvector Driver): -*> Test ZGGEV -*> -*> ZGX (Generalized Nonsymmetric Schur form Expert Driver): -*> Test ZGGESX -*> -*> ZXV (Generalized Nonsymmetric Eigenvalue/eigenvector Expert Driver): -*> Test ZGGEVX -*> -*> ZSG (Hermitian Generalized Eigenvalue Problem): -*> Test ZHEGST, ZHEGV, ZHEGVD, ZHEGVX, ZHPGST, ZHPGV, ZHPGVD, -*> ZHPGVX, ZHBGST, ZHBGV, ZHBGVD, and ZHBGVX -*> -*> ZHB (Hermitian Band Eigenvalue Problem): -*> Test ZHBTRD -*> -*> ZBB (Band Singular Value Decomposition): -*> Test ZGBBRD -*> -*> ZEC (Eigencondition estimation): -*> Test ZTRSYL, ZTREXC, ZTRSNA, and ZTRSEN -*> -*> ZBL (Balancing a general matrix) -*> Test ZGEBAL -*> -*> ZBK (Back transformation on a balanced matrix) -*> Test ZGEBAK -*> -*> ZGL (Balancing a matrix pair) -*> Test ZGGBAL -*> -*> ZGK (Back transformation on a matrix pair) -*> Test ZGGBAK -*> -*> GLM (Generalized Linear Regression Model): -*> Tests ZGGGLM -*> -*> GQR (Generalized QR and RQ factorizations): -*> Tests ZGGQRF and ZGGRQF -*> -*> GSV (Generalized Singular Value Decomposition): -*> Tests ZGGSVD, ZGGSVP, ZTGSJA, ZLAGS2, ZLAPLL, and ZLAPMT -*> -*> CSD (CS decomposition): -*> Tests ZUNCSD -*> -*> LSE (Constrained Linear Least Squares): -*> Tests ZGGLSE -*> -*> Each test path has a different set of inputs, but the data sets for -*> the driver routines xEV, xES, xVX, and xSX can be concatenated in a -*> single input file. The first line of input should contain one of the -*> 3-character path names in columns 1-3. The number of remaining lines -*> depends on what is found on the first line. -*> -*> The number of matrix types used in testing is often controllable from -*> the input file. The number of matrix types for each path, and the -*> test routine that describes them, is as follows: -*> -*> Path name(s) Types Test routine -*> -*> ZHS or NEP 21 ZCHKHS -*> ZST or SEP 21 ZCHKST (routines) -*> 18 ZDRVST (drivers) -*> ZBD or SVD 16 ZCHKBD (routines) -*> 5 ZDRVBD (drivers) -*> ZEV 21 ZDRVEV -*> ZES 21 ZDRVES -*> ZVX 21 ZDRVVX -*> ZSX 21 ZDRVSX -*> ZGG 26 ZCHKGG (routines) -*> ZGS 26 ZDRGES -*> ZGX 5 ZDRGSX -*> ZGV 26 ZDRGEV -*> ZXV 2 ZDRGVX -*> ZSG 21 ZDRVSG -*> ZHB 15 ZCHKHB -*> ZBB 15 ZCHKBB -*> ZEC - ZCHKEC -*> ZBL - ZCHKBL -*> ZBK - ZCHKBK -*> ZGL - ZCHKGL -*> ZGK - ZCHKGK -*> GLM 8 ZCKGLM -*> GQR 8 ZCKGQR -*> GSV 8 ZCKGSV -*> CSD 3 ZCKCSD -*> LSE 8 ZCKLSE -*> -*>----------------------------------------------------------------------- -*> -*> NEP input file: -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix dimension N. 
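Concretely, the path selection described above ("one of the 3-character path names in columns 1-3") is a fixed-column substring extraction. A self-contained sketch, with illustrative names that are not part of ZCHKEE:

      PROGRAM PATHRD
*     Read one input line and take columns 1-3 as the test path,
*     as ZCHKEE does with its first input line.
      CHARACTER*80       LINE
      CHARACTER*3        PATH
      READ( *, FMT = '(A80)' ) LINE
      PATH = LINE( 1: 3 )
      WRITE( *, FMT = '(A, A3)' ) ' Path requested: ', PATH
      END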
-*> -*> line 4: NPARMS, INTEGER -*> Number of values of the parameters NB, NBMIN, NX, NS, and -*> MAXB. -*> -*> line 5: NBVAL, INTEGER array, dimension (NPARMS) -*> The values for the blocksize NB. -*> -*> line 6: NBMIN, INTEGER array, dimension (NPARMS) -*> The values for the minimum blocksize NBMIN. -*> -*> line 7: NXVAL, INTEGER array, dimension (NPARMS) -*> The values for the crossover point NX. -*> -*> line 8: INMIN, INTEGER array, dimension (NPARMS) -*> LAHQR vs TTQRE crossover point, >= 11 -*> -*> line 9: INWIN, INTEGER array, dimension (NPARMS) -*> recommended deflation window size -*> -*> line 10: INIBL, INTEGER array, dimension (NPARMS) -*> nibble crossover point -*> -*> line 11: ISHFTS, INTEGER array, dimension (NPARMS) -*> number of simultaneous shifts) -*> -*> line 12: IACC22, INTEGER array, dimension (NPARMS) -*> select structured matrix multiply: 0, 1 or 2) -*> -*> line 13: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. To have all of the test -*> ratios printed, use THRESH = 0.0 . -*> -*> line 14: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 14 was 2: -*> -*> line 15: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 15-EOF: The remaining lines occur in sets of 1 or 2 and allow -*> the user to specify the matrix types. Each line contains -*> a 3-character path name in columns 1-3, and the number -*> of matrix types must be the first nonblank item in columns -*> 4-80. If the number of matrix types is at least 1 but is -*> less than the maximum number of possible types, a second -*> line will be read to get the numbers of the matrix types to -*> be used. For example, -*> NEP 21 -*> requests all of the matrix types for the nonsymmetric -*> eigenvalue problem, while -*> NEP 4 -*> 9 10 11 12 -*> requests only matrices of type 9, 10, 11, and 12. -*> -*> The valid 3-character path names are 'NEP' or 'ZHS' for the -*> nonsymmetric eigenvalue routines. -*> -*>----------------------------------------------------------------------- -*> -*> SEP or ZSG input file: -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix dimension N. -*> -*> line 4: NPARMS, INTEGER -*> Number of values of the parameters NB, NBMIN, and NX. -*> -*> line 5: NBVAL, INTEGER array, dimension (NPARMS) -*> The values for the blocksize NB. -*> -*> line 6: NBMIN, INTEGER array, dimension (NPARMS) -*> The values for the minimum blocksize NBMIN. -*> -*> line 7: NXVAL, INTEGER array, dimension (NPARMS) -*> The values for the crossover point NX. -*> -*> line 8: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 9: TSTCHK, LOGICAL -*> Flag indicating whether or not to test the LAPACK routines. -*> -*> line 10: TSTDRV, LOGICAL -*> Flag indicating whether or not to test the driver routines. -*> -*> line 11: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 12: NEWSD, INTEGER -*> A code indicating how to set the random number seed. 
-*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 12 was 2: -*> -*> line 13: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 13-EOF: Lines specifying matrix types, as for NEP. -*> The valid 3-character path names are 'SEP' or 'ZST' for the -*> Hermitian eigenvalue routines and driver routines, and -*> 'ZSG' for the routines for the Hermitian generalized -*> eigenvalue problem. -*> -*>----------------------------------------------------------------------- -*> -*> SVD input file: -*> -*> line 2: NN, INTEGER -*> Number of values of M and N. -*> -*> line 3: MVAL, INTEGER array, dimension (NN) -*> The values for the matrix row dimension M. -*> -*> line 4: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix column dimension N. -*> -*> line 5: NPARMS, INTEGER -*> Number of values of the parameter NB, NBMIN, NX, and NRHS. -*> -*> line 6: NBVAL, INTEGER array, dimension (NPARMS) -*> The values for the blocksize NB. -*> -*> line 7: NBMIN, INTEGER array, dimension (NPARMS) -*> The values for the minimum blocksize NBMIN. -*> -*> line 8: NXVAL, INTEGER array, dimension (NPARMS) -*> The values for the crossover point NX. -*> -*> line 9: NSVAL, INTEGER array, dimension (NPARMS) -*> The values for the number of right hand sides NRHS. -*> -*> line 10: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 11: TSTCHK, LOGICAL -*> Flag indicating whether or not to test the LAPACK routines. -*> -*> line 12: TSTDRV, LOGICAL -*> Flag indicating whether or not to test the driver routines. -*> -*> line 13: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 14: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 14 was 2: -*> -*> line 15: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 15-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path names are 'SVD' or 'ZBD' for both the -*> SVD routines and the SVD driver routines. -*> -*>----------------------------------------------------------------------- -*> -*> ZEV and ZES data files: -*> -*> line 1: 'ZEV' or 'ZES' in columns 1 to 3. -*> -*> line 2: NSIZES, INTEGER -*> Number of sizes of matrices to use. Should be at least 0 -*> and at most 20. If NSIZES = 0, no testing is done -*> (although the remaining 3 lines are still read). -*> -*> line 3: NN, INTEGER array, dimension(NSIZES) -*> Dimensions of matrices to be tested. -*> -*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> These integer parameters determine how blocking is done -*> (see ILAENV for details) -*> NB : block size -*> NBMIN : minimum block size -*> NX : minimum dimension for blocking -*> NS : number of shifts in xHSEQR -*> NBCOL : minimum column dimension for blocking -*> -*> line 5: THRESH, REAL -*> The test threshold against which computed residuals are -*> compared. Should generally be in the range from 10. to 20. -*> If it is 0., all test case data will be printed. 
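The THRESH convention used throughout these input formats is a one-sided comparison: a ratio is reported (and the test counted as suspect) exactly when it is greater than or equal to THRESH, which is why THRESH = 0.0 prints every ratio. A standalone sketch with invented RESULT values:

      PROGRAM THRDMO
*     Report only the test ratios at or above THRESH; the
*     RESULT values are made up for illustration.
      DOUBLE PRECISION   RESULT( 3 ), THRESH
      INTEGER            J
      DATA               RESULT / 0.5D0, 30.0D0, 11.2D0 /
      THRESH = 20.0D0
      DO 10 J = 1, 3
         IF( RESULT( J ).GE.THRESH )
     $      WRITE( *, FMT = '(A, I2, A, F8.2)' )
     $      ' Ratio', J, ' =', RESULT( J )
   10 CONTINUE
      END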
-*> -*> line 6: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 6 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 8 and following: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'ZEV' to test ZGEEV, or -*> 'ZES' to test ZGEES. -*> -*>----------------------------------------------------------------------- -*> -*> The ZVX data has two parts. The first part is identical to ZEV, -*> and the second part consists of test matrices with precomputed -*> solutions. -*> -*> line 1: 'ZVX' in columns 1-3. -*> -*> line 2: NSIZES, INTEGER -*> If NSIZES = 0, no testing of randomly generated examples -*> is done, but any precomputed examples are tested. -*> -*> line 3: NN, INTEGER array, dimension(NSIZES) -*> -*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> -*> line 5: THRESH, REAL -*> -*> line 6: NEWSD, INTEGER -*> -*> If line 6 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> -*> lines 8 and following: The first line contains 'ZVX' in columns 1-3 -*> followed by the number of matrix types, possibly with -*> a second line to specify certain matrix types. -*> If the number of matrix types = 0, no testing of randomly -*> generated examples is done, but any precomputed examples -*> are tested. -*> -*> remaining lines : Each matrix is stored on 1+N+N**2 lines, where N is -*> its dimension. The first line contains the dimension N and -*> ISRT (two integers). ISRT indicates whether the last N lines -*> are sorted by increasing real part of the eigenvalue -*> (ISRT=0) or by increasing imaginary part (ISRT=1). The next -*> N**2 lines contain the matrix rowwise, one entry per line. -*> The last N lines correspond to each eigenvalue. Each of -*> these last N lines contains 4 real values: the real part of -*> the eigenvalue, the imaginary part of the eigenvalue, the -*> reciprocal condition number of the eigenvalue, and the -*> reciprocal condition number of the eigenvector. The -*> end of data is indicated by dimension N=0. Even if no data -*> is to be tested, there must be at least one line containing -*> N=0. -*> -*>----------------------------------------------------------------------- -*> -*> The ZSX data is like ZVX. The first part is identical to ZEV, and the -*> second part consists of test matrices with precomputed solutions. -*> -*> line 1: 'ZSX' in columns 1-3. -*> -*> line 2: NSIZES, INTEGER -*> If NSIZES = 0, no testing of randomly generated examples -*> is done, but any precomputed examples are tested. -*> -*> line 3: NN, INTEGER array, dimension(NSIZES) -*> -*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> -*> line 5: THRESH, REAL -*> -*> line 6: NEWSD, INTEGER -*> -*> If line 6 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> -*> lines 8 and following: The first line contains 'ZSX' in columns 1-3 -*> followed by the number of matrix types, possibly with -*> a second line to specify certain matrix types. -*> If the number of matrix types = 0, no testing of randomly -*> generated examples is done, but any precomputed examples -*> are tested. -*> -*> remaining lines : Each matrix is stored on 3+N**2 lines, where N is -*> its dimension. The first line contains the dimension N, the -*> dimension M of an invariant subspace, and ISRT.
The second -*> line contains M integers, identifying the eigenvalues in the -*> invariant subspace (by their position in a list of -*> eigenvalues ordered by increasing real part (if ISRT=0) or -*> by increasing imaginary part (if ISRT=1)). The next N**2 -*> lines contain the matrix rowwise. The last line contains the -*> reciprocal condition number for the average of the selected -*> eigenvalues, and the reciprocal condition number for the -*> corresponding right invariant subspace. The end of data is -*> indicated by a line containing N=0, M=0, and ISRT = 0. Even -*> if no data is to be tested, there must be at least one line -*> containing N=0, M=0 and ISRT=0. -*> -*>----------------------------------------------------------------------- -*> -*> ZGG input file: -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix dimension N. -*> -*> line 4: NPARMS, INTEGER -*> Number of values of the parameters NB, NBMIN, NBCOL, NS, and -*> MAXB. -*> -*> line 5: NBVAL, INTEGER array, dimension (NPARMS) -*> The values for the blocksize NB. -*> -*> line 6: NBMIN, INTEGER array, dimension (NPARMS) -*> The values for NBMIN, the minimum row dimension for blocks. -*> -*> line 7: NSVAL, INTEGER array, dimension (NPARMS) -*> The values for the number of shifts. -*> -*> line 8: MXBVAL, INTEGER array, dimension (NPARMS) -*> The values for MAXB, used in determining minimum blocksize. -*> -*> line 9: IACC22, INTEGER array, dimension (NPARMS) -*> select structured matrix multiply: 1 or 2 -*> -*> line 10: NBCOL, INTEGER array, dimension (NPARMS) -*> The values for NBCOL, the minimum column dimension for -*> blocks. -*> -*> line 11: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 12: TSTCHK, LOGICAL -*> Flag indicating whether or not to test the LAPACK routines. -*> -*> line 13: TSTDRV, LOGICAL -*> Flag indicating whether or not to test the driver routines. -*> -*> line 14: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 15: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 15 was 2: -*> -*> line 16: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 17-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'ZGG' for the generalized -*> eigenvalue problem routines and driver routines. -*> -*>----------------------------------------------------------------------- -*> -*> ZGS and ZGV input files: -*> -*> line 1: 'ZGS' or 'ZGV' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension(NN) -*> Dimensions of matrices to be tested. -*> -*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> These integer parameters determine how blocking is done -*> (see ILAENV for details) -*> NB : block size -*> NBMIN : minimum block size -*> NX : minimum dimension for blocking -*> NS : number of shifts in xHGEQR -*> NBCOL : minimum column dimension for blocking -*> -*> line 5: THRESH, REAL -*> The test threshold against which computed residuals are -*> compared.
Should generally be in the range from 10. to 20. -*> If it is 0., all test case data will be printed. -*> -*> line 6: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits. -*> -*> line 7: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 7 was 2: -*> -*> line 8: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 8-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'ZGS' for the generalized -*> eigenvalue problem routines and driver routines. -*> -*>----------------------------------------------------------------------- -*> -*> ZGX input file: -*> line 1: 'ZGX' in columns 1 to 3. -*> -*> line 2: N, INTEGER -*> Value of N. -*> -*> line 3: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> These integer parameters determine how blocking is done -*> (see ILAENV for details) -*> NB : block size -*> NBMIN : minimum block size -*> NX : minimum dimension for blocking -*> NS : number of shifts in xHGEQR -*> NBCOL : minimum column dimension for blocking -*> -*> line 4: THRESH, REAL -*> The test threshold against which computed residuals are -*> compared. Should generally be in the range from 10. to 20. -*> Information will be printed about each test for which the -*> test ratio is greater than or equal to the threshold. -*> -*> line 5: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 6: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 6 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> If line 2 was 0: -*> -*> line 7-EOF: Precomputed examples are tested. -*> -*> remaining lines : Each example is stored on 3+2*N*N lines, where N is -*> its dimension. The first line contains the dimension (a -*> single integer). The next line contains an integer k such -*> that only the last k eigenvalues will be selected and appear -*> in the leading diagonal blocks of $A$ and $B$. The next N*N -*> lines contain the matrix A, one element per line. The next N*N -*> lines contain the matrix B. The last line contains the -*> reciprocal of the eigenvalue cluster condition number and the -*> reciprocal of the deflating subspace (associated with the -*> selected eigencluster) condition number. The end of data is -*> indicated by dimension N=0. Even if no data is to be tested, -*> there must be at least one line containing N=0. -*> -*>----------------------------------------------------------------------- -*> -*> ZXV input files: -*> line 1: 'ZXV' in columns 1 to 3. -*> -*> line 2: N, INTEGER -*> Value of N. -*> -*> line 3: NB, NBMIN, NX, NS, NBCOL, INTEGERs -*> These integer parameters determine how blocking is done -*> (see ILAENV for details) -*> NB : block size -*> NBMIN : minimum block size -*> NX : minimum dimension for blocking -*> NS : number of shifts in xHGEQR -*> NBCOL : minimum column dimension for blocking -*> -*> line 4: THRESH, REAL -*> The test threshold against which computed residuals are -*> compared.
Should generally be in the range from 10. to 20. -*> Information will be printed about each test for which the -*> test ratio is greater than or equal to the threshold. -*> -*> line 5: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 6: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 6 was 2: -*> -*> line 7: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> If line 2 was 0: -*> -*> line 7-EOF: Precomputed examples are tested. -*> -*> remaining lines : Each example is stored on 3+2*N*N lines, where N is -*> its dimension. The first line contains the dimension (a -*> single integer). The next N*N lines contain the matrix A, one -*> element per line. The next N*N lines contain the matrix B. -*> The next line contains the reciprocals of the eigenvalue -*> condition numbers. The last line contains the reciprocals of -*> the eigenvector condition numbers. The end of data is -*> indicated by dimension N=0. Even if no data is to be tested, -*> there must be at least one line containing N=0. -*> -*>----------------------------------------------------------------------- -*> -*> ZHB input file: -*> -*> line 2: NN, INTEGER -*> Number of values of N. -*> -*> line 3: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix dimension N. -*> -*> line 4: NK, INTEGER -*> Number of values of K. -*> -*> line 5: KVAL, INTEGER array, dimension (NK) -*> The values for the matrix dimension K. -*> -*> line 6: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 7 was 2: -*> -*> line 8: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 8-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'ZHB'. -*> -*>----------------------------------------------------------------------- -*> -*> ZBB input file: -*> -*> line 2: NN, INTEGER -*> Number of values of M and N. -*> -*> line 3: MVAL, INTEGER array, dimension (NN) -*> The values for the matrix row dimension M. -*> -*> line 4: NVAL, INTEGER array, dimension (NN) -*> The values for the matrix column dimension N. -*> -*> line 4: NK, INTEGER -*> Number of values of K. -*> -*> line 5: KVAL, INTEGER array, dimension (NK) -*> The values for the matrix bandwidth K. -*> -*> line 6: NPARMS, INTEGER -*> Number of values of the parameter NRHS -*> -*> line 7: NSVAL, INTEGER array, dimension (NPARMS) -*> The values for the number of right hand sides NRHS. -*> -*> line 8: THRESH -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 9: NEWSD, INTEGER -*> A code indicating how to set the random number seed. 
-*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 9 was 2: -*> -*> line 10: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 10-EOF: Lines specifying matrix types, as for SVD. -*> The 3-character path name is 'ZBB'. -*> -*>----------------------------------------------------------------------- -*> -*> ZEC input file: -*> -*> line 2: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> lines 3-EOF: -*> -*> Input for testing the eigencondition routines consists of a set of -*> specially constructed test cases and their solutions. The data -*> format is not intended to be modified by the user. -*> -*>----------------------------------------------------------------------- -*> -*> ZBL and ZBK input files: -*> -*> line 1: 'ZBL' in columns 1-3 to test ZGEBAL, or 'ZBK' in -*> columns 1-3 to test ZGEBAK. -*> -*> The remaining lines consist of specially constructed test cases. -*> -*>----------------------------------------------------------------------- -*> -*> ZGL and ZGK input files: -*> -*> line 1: 'ZGL' in columns 1-3 to test ZGGBAL, or 'ZGK' in -*> columns 1-3 to test ZGGBAK. -*> -*> The remaining lines consist of specially constructed test cases. -*> -*>----------------------------------------------------------------------- -*> -*> GLM data file: -*> -*> line 1: 'GLM' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NN) -*> Values of M (row dimension). -*> -*> line 4: PVAL, INTEGER array, dimension(NN) -*> Values of P (row dimension). -*> -*> line 5: NVAL, INTEGER array, dimension(NN) -*> Values of N (column dimension), note M <= N <= M+P. -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'GLM' for the generalized -*> linear regression model routines. -*> -*>----------------------------------------------------------------------- -*> -*> GQR data file: -*> -*> line 1: 'GQR' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NN) -*> Values of M. -*> -*> line 4: PVAL, INTEGER array, dimension(NN) -*> Values of P. -*> -*> line 5: NVAL, INTEGER array, dimension(NN) -*> Values of N. -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold.
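Every input format above closes with the same NEWSD convention, and it reduces to a few lines of logic: only NEWSD = 2 consumes an extra input line, and the working seed ISEED is loaded from the default IOLDSD (reloaded before each run when NEWSD = 0). A standalone sketch, with the default seed taken from the driver's DATA statement:

      PROGRAM SEEDRD
*     NEWSD = 2 means a user-supplied seed follows on the next
*     input line; otherwise the default seed IOLDSD is kept.
      INTEGER            I, NEWSD
      INTEGER            IOLDSD( 4 ), ISEED( 4 )
      DATA               IOLDSD / 0, 0, 0, 1 /
      READ( *, FMT = * ) NEWSD
      IF( NEWSD.EQ.2 )
     $   READ( *, FMT = * )( IOLDSD( I ), I = 1, 4 )
      DO 10 I = 1, 4
         ISEED( I ) = IOLDSD( I )
   10 CONTINUE
      WRITE( *, FMT = * ) 'Seed in use:', ( ISEED( I ), I = 1, 4 )
      END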
-*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'GQR' for the generalized -*> QR and RQ routines. -*> -*>----------------------------------------------------------------------- -*> -*> GSV data file: -*> -*> line 1: 'GSV' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NN) -*> Values of M (row dimension). -*> -*> line 4: PVAL, INTEGER array, dimension(NN) -*> Values of P (row dimension). -*> -*> line 5: NVAL, INTEGER array, dimension(NN) -*> Values of N (column dimension). -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'GSV' for the generalized -*> SVD routines. -*> -*>----------------------------------------------------------------------- -*> -*> CSD data file: -*> -*> line 1: 'CSD' in columns 1 to 3. -*> -*> line 2: NM, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NM) -*> Values of M (row and column dimension of orthogonal matrix). -*> -*> line 4: PVAL, INTEGER array, dimension(NM) -*> Values of P (row dimension of top-left block). -*> -*> line 5: NVAL, INTEGER array, dimension(NM) -*> Values of N (column dimension of top-left block). -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'CSD' for the CSD routine. 
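One detail the path descriptions above rely on implicitly: matching is case-insensitive, and some paths accept two spellings (GQR also answers to GRQ). The driver does this with LAPACK's LSAMEN( N, CA, CB ), which compares the first N characters of two strings ignoring case. A sketch of that dispatch; it assumes linking against the LAPACK utility library that provides LSAMEN:

      PROGRAM PDSPCH
*     Case-insensitive path dispatch in the style of ZCHKEE.
      LOGICAL            LSAMEN
      EXTERNAL           LSAMEN
      LOGICAL            GQR, LSE
      CHARACTER*3        PATH
      PATH = 'grq'
      GQR = LSAMEN( 3, PATH, 'GQR' ) .OR. LSAMEN( 3, PATH, 'GRQ' )
      LSE = LSAMEN( 3, PATH, 'LSE' )
      WRITE( *, FMT = * ) 'GQR path: ', GQR, '  LSE path: ', LSE
      END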
-*> -*>----------------------------------------------------------------------- -*> -*> LSE data file: -*> -*> line 1: 'LSE' in columns 1 to 3. -*> -*> line 2: NN, INTEGER -*> Number of values of M, P, and N. -*> -*> line 3: MVAL, INTEGER array, dimension(NN) -*> Values of M. -*> -*> line 4: PVAL, INTEGER array, dimension(NN) -*> Values of P. -*> -*> line 5: NVAL, INTEGER array, dimension(NN) -*> Values of N, note P <= N <= P+M. -*> -*> line 6: THRESH, REAL -*> Threshold value for the test ratios. Information will be -*> printed about each test for which the test ratio is greater -*> than or equal to the threshold. -*> -*> line 7: TSTERR, LOGICAL -*> Flag indicating whether or not to test the error exits for -*> the LAPACK routines and driver routines. -*> -*> line 8: NEWSD, INTEGER -*> A code indicating how to set the random number seed. -*> = 0: Set the seed to a default value before each run -*> = 1: Initialize the seed to a default value only before the -*> first run -*> = 2: Like 1, but use the seed values on the next line -*> -*> If line 8 was 2: -*> -*> line 9: INTEGER array, dimension (4) -*> Four integer values for the random number seed. -*> -*> lines 9-EOF: Lines specifying matrix types, as for NEP. -*> The 3-character path name is 'LSE' for the constrained -*> linear least squares routines. -*> -*>----------------------------------------------------------------------- -*> -*> NMAX is currently set to 132 and must be at least 12 for some of the -*> precomputed examples, and LWORK = NMAX*(5*NMAX+20) in the parameter -*> statements below. For SVD, we assume NRHS may be as big as N. The -*> parameter NEED is set to 14 to allow for 14 N-by-N matrices for ZGG. -*> \endverbatim -* -* Arguments: -* ========== -* -* -* Authors: -* ======== -* -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. -* -*> \date June 2016 -* -*> \ingroup complex16_eig -* -* ===================================================================== - PROGRAM ZCHKEE -* -* -- LAPACK test routine (version 3.7.0) -- -* -- LAPACK is a software package provided by Univ. of Tennessee, -- -* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* June 2016 -* -* ===================================================================== -* -* .. Parameters .. - INTEGER NMAX - PARAMETER ( NMAX = 132 ) - INTEGER NCMAX - PARAMETER ( NCMAX = 20 ) - INTEGER NEED - PARAMETER ( NEED = 14 ) - INTEGER LWORK - PARAMETER ( LWORK = NMAX*( 5*NMAX+20 ) ) - INTEGER LIWORK - PARAMETER ( LIWORK = NMAX*( NMAX+20 ) ) - INTEGER MAXIN - PARAMETER ( MAXIN = 20 ) - INTEGER MAXT - PARAMETER ( MAXT = 30 ) - INTEGER NIN, NOUT - PARAMETER ( NIN = 5, NOUT = 6 ) -* .. -* .. Local Scalars .. - LOGICAL ZBK, ZBL, ZES, ZEV, ZGK, ZGL, ZGS, ZGV, ZGX, - $ ZSX, ZVX, ZXV, CSD, FATAL, GLM, GQR, GSV, LSE, - $ NEP, SEP, SVD, TSTCHK, TSTDIF, TSTDRV, TSTERR, - $ ZBB, ZGG, ZHB - CHARACTER C1 - CHARACTER*3 C3, PATH - CHARACTER*32 VNAME - CHARACTER*10 INTSTR - CHARACTER*80 LINE - INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, - $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN -* .. -* .. Local Arrays ..
- LOGICAL DOTYPE( MAXT ), LOGWRK( NMAX ) - INTEGER IOLDSD( 4 ), ISEED( 4 ), IWORK( LIWORK ), - $ KVAL( MAXIN ), MVAL( MAXIN ), MXBVAL( MAXIN ), - $ NBCOL( MAXIN ), NBMIN( MAXIN ), NBVAL( MAXIN ), - $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), - $ PVAL( MAXIN ) - INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), - $ ISHFTS( MAXIN ), IACC22( MAXIN ) - DOUBLE PRECISION ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), - $ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX ) - COMPLEX*16 A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ), - $ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ), - $ X( 5*NMAX ) -* .. -* .. External Functions .. - LOGICAL LSAMEN - DOUBLE PRECISION DLAMCH, DSECND - EXTERNAL LSAMEN, DLAMCH, DSECND -* .. -* .. External Subroutines .. - EXTERNAL ALAREQ, XLAENV, ZCHKBB, ZCHKBD, ZCHKBK, ZCHKBL, - $ ZCHKEC, ZCHKGG, ZCHKGK, ZCHKGL, ZCHKHB, ZCHKHS, - $ ZCHKST, ZCKCSD, ZCKGLM, ZCKGQR, ZCKGSV, ZCKLSE, - $ ZDRGES, ZDRGEV, ZDRGSX, ZDRGVX, ZDRVBD, ZDRVES, - $ ZDRVEV, ZDRVSG, ZDRVST, ZDRVSX, ZDRVVX, - $ ZERRBD, ZERRED, ZERRGG, ZERRHS, ZERRST, ILAVER, - $ ZDRGES3, ZDRGEV3, - $ ZCHKST2STG, ZDRVST2STG, ZCHKHB2STG -* .. -* .. Intrinsic Functions .. - INTRINSIC LEN, MIN -* .. -* .. Scalars in Common .. - LOGICAL LERR, OK - CHARACTER*32 SRNAMT - INTEGER INFOT, MAXB, NPROC, NSHIFT, NUNIT, SELDIM, - $ SELOPT -* .. -* .. Arrays in Common .. - LOGICAL SELVAL( 20 ) - INTEGER IPARMS( 100 ) - DOUBLE PRECISION SELWI( 20 ), SELWR( 20 ) -* .. -* .. Common blocks .. - COMMON / CENVIR / NPROC, NSHIFT, MAXB - COMMON / INFOC / INFOT, NUNIT, OK, LERR - COMMON / SRNAMC / SRNAMT - COMMON / SSLCT / SELOPT, SELDIM, SELVAL, SELWR, SELWI - COMMON / CLAENV / IPARMS -* .. -* .. Data statements .. - DATA INTSTR / '0123456789' / - DATA IOLDSD / 0, 0, 0, 1 / -* .. -* .. Executable Statements .. -* - A = 0.0 - B = 0.0 - C = 0.0 - DC = 0.0 - S1 = DSECND( ) - FATAL = .FALSE. - NUNIT = NOUT -* -* Return to here to read multiple sets of data -* - 10 CONTINUE -* -* Read the first line and set the 3-character test path -* - READ( NIN, FMT = '(A80)', END = 380 )LINE - PATH = LINE( 1: 3 ) - NEP = LSAMEN( 3, PATH, 'NEP' ) .OR. LSAMEN( 3, PATH, 'ZHS' ) - SEP = LSAMEN( 3, PATH, 'SEP' ) .OR. LSAMEN( 3, PATH, 'ZST' ) .OR. - $ LSAMEN( 3, PATH, 'ZSG' ) .OR. LSAMEN( 3, PATH, 'SE2' ) - SVD = LSAMEN( 3, PATH, 'SVD' ) .OR. LSAMEN( 3, PATH, 'ZBD' ) - ZEV = LSAMEN( 3, PATH, 'ZEV' ) - ZES = LSAMEN( 3, PATH, 'ZES' ) - ZVX = LSAMEN( 3, PATH, 'ZVX' ) - ZSX = LSAMEN( 3, PATH, 'ZSX' ) - ZGG = LSAMEN( 3, PATH, 'ZGG' ) - ZGS = LSAMEN( 3, PATH, 'ZGS' ) - ZGX = LSAMEN( 3, PATH, 'ZGX' ) - ZGV = LSAMEN( 3, PATH, 'ZGV' ) - ZXV = LSAMEN( 3, PATH, 'ZXV' ) - ZHB = LSAMEN( 3, PATH, 'ZHB' ) - ZBB = LSAMEN( 3, PATH, 'ZBB' ) - GLM = LSAMEN( 3, PATH, 'GLM' ) - GQR = LSAMEN( 3, PATH, 'GQR' ) .OR. LSAMEN( 3, PATH, 'GRQ' ) - GSV = LSAMEN( 3, PATH, 'GSV' ) - CSD = LSAMEN( 3, PATH, 'CSD' ) - LSE = LSAMEN( 3, PATH, 'LSE' ) - ZBL = LSAMEN( 3, PATH, 'ZBL' ) - ZBK = LSAMEN( 3, PATH, 'ZBK' ) - ZGL = LSAMEN( 3, PATH, 'ZGL' ) - ZGK = LSAMEN( 3, PATH, 'ZGK' ) -* -* Report values of parameters. -* - IF( PATH.EQ.' 
' ) THEN - GO TO 10 - ELSE IF( NEP ) THEN - WRITE( NOUT, FMT = 9987 ) - ELSE IF( SEP ) THEN - WRITE( NOUT, FMT = 9986 ) - ELSE IF( SVD ) THEN - WRITE( NOUT, FMT = 9985 ) - ELSE IF( ZEV ) THEN - WRITE( NOUT, FMT = 9979 ) - ELSE IF( ZES ) THEN - WRITE( NOUT, FMT = 9978 ) - ELSE IF( ZVX ) THEN - WRITE( NOUT, FMT = 9977 ) - ELSE IF( ZSX ) THEN - WRITE( NOUT, FMT = 9976 ) - ELSE IF( ZGG ) THEN - WRITE( NOUT, FMT = 9975 ) - ELSE IF( ZGS ) THEN - WRITE( NOUT, FMT = 9964 ) - ELSE IF( ZGX ) THEN - WRITE( NOUT, FMT = 9965 ) - ELSE IF( ZGV ) THEN - WRITE( NOUT, FMT = 9963 ) - ELSE IF( ZXV ) THEN - WRITE( NOUT, FMT = 9962 ) - ELSE IF( ZHB ) THEN - WRITE( NOUT, FMT = 9974 ) - ELSE IF( ZBB ) THEN - WRITE( NOUT, FMT = 9967 ) - ELSE IF( GLM ) THEN - WRITE( NOUT, FMT = 9971 ) - ELSE IF( GQR ) THEN - WRITE( NOUT, FMT = 9970 ) - ELSE IF( GSV ) THEN - WRITE( NOUT, FMT = 9969 ) - ELSE IF( CSD ) THEN - WRITE( NOUT, FMT = 9960 ) - ELSE IF( LSE ) THEN - WRITE( NOUT, FMT = 9968 ) - ELSE IF( ZBL ) THEN -* -* ZGEBAL: Balancing -* - CALL ZCHKBL( NIN, NOUT ) - GO TO 380 - ELSE IF( ZBK ) THEN -* -* ZGEBAK: Back transformation -* - CALL ZCHKBK( NIN, NOUT ) - GO TO 380 - ELSE IF( ZGL ) THEN -* -* ZGGBAL: Balancing -* - CALL ZCHKGL( NIN, NOUT ) - GO TO 380 - ELSE IF( ZGK ) THEN -* -* ZGGBAK: Back transformation -* - CALL ZCHKGK( NIN, NOUT ) - GO TO 380 - ELSE IF( LSAMEN( 3, PATH, 'ZEC' ) ) THEN -* -* ZEC: Eigencondition estimation -* - READ( NIN, FMT = * )THRESH - CALL XLAENV( 1, 1 ) - CALL XLAENV( 12, 1 ) - TSTERR = .TRUE. - CALL ZCHKEC( THRESH, TSTERR, NIN, NOUT ) - GO TO 380 - ELSE - WRITE( NOUT, FMT = 9992 )PATH - GO TO 380 - END IF - CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) - WRITE( NOUT, FMT = 9972 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH - WRITE( NOUT, FMT = 9984 ) -* -* Read the number of values of M, P, and N. -* - READ( NIN, FMT = * )NN - IF( NN.LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' NN ', NN, 1 - NN = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9988 )' NN ', NN, MAXIN - NN = 0 - FATAL = .TRUE. - END IF -* -* Read the values of M -* - IF( .NOT.( ZGX .OR. ZXV ) ) THEN - READ( NIN, FMT = * )( MVAL( I ), I = 1, NN ) - IF( SVD ) THEN - VNAME = ' M ' - ELSE - VNAME = ' N ' - END IF - DO 20 I = 1, NN - IF( MVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )VNAME, MVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( MVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )VNAME, MVAL( I ), NMAX - FATAL = .TRUE. - END IF - 20 CONTINUE - WRITE( NOUT, FMT = 9983 )'M: ', ( MVAL( I ), I = 1, NN ) - END IF -* -* Read the values of P -* - IF( GLM .OR. GQR .OR. GSV .OR. CSD .OR. LSE ) THEN - READ( NIN, FMT = * )( PVAL( I ), I = 1, NN ) - DO 30 I = 1, NN - IF( PVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' P ', PVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( PVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' P ', PVAL( I ), NMAX - FATAL = .TRUE. - END IF - 30 CONTINUE - WRITE( NOUT, FMT = 9983 )'P: ', ( PVAL( I ), I = 1, NN ) - END IF -* -* Read the values of N -* - IF( SVD .OR. ZBB .OR. GLM .OR. GQR .OR. GSV .OR. CSD .OR. - $ LSE ) THEN - READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) - DO 40 I = 1, NN - IF( NVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' N ', NVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' N ', NVAL( I ), NMAX - FATAL = .TRUE. - END IF - 40 CONTINUE - ELSE - DO 50 I = 1, NN - NVAL( I ) = MVAL( I ) - 50 CONTINUE - END IF - IF( .NOT.( ZGX .OR. 
ZXV ) ) THEN - WRITE( NOUT, FMT = 9983 )'N: ', ( NVAL( I ), I = 1, NN ) - ELSE - WRITE( NOUT, FMT = 9983 )'N: ', NN - END IF -* -* Read the number of values of K, followed by the values of K -* - IF( ZHB .OR. ZBB ) THEN - READ( NIN, FMT = * )NK - READ( NIN, FMT = * )( KVAL( I ), I = 1, NK ) - DO 60 I = 1, NK - IF( KVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' K ', KVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( KVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' K ', KVAL( I ), NMAX - FATAL = .TRUE. - END IF - 60 CONTINUE - WRITE( NOUT, FMT = 9983 )'K: ', ( KVAL( I ), I = 1, NK ) - END IF -* - IF( ZEV .OR. ZES .OR. ZVX .OR. ZSX ) THEN -* -* For the nonsymmetric QR driver routines, only one set of -* parameters is allowed. -* - READ( NIN, FMT = * )NBVAL( 1 ), NBMIN( 1 ), NXVAL( 1 ), - $ INMIN( 1 ), INWIN( 1 ), INIBL(1), ISHFTS(1), IACC22(1) - IF( NBVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NBMIN( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NXVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( INMIN( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' INMIN ', INMIN( 1 ), 1 - FATAL = .TRUE. - ELSE IF( INWIN( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' INWIN ', INWIN( 1 ), 1 - FATAL = .TRUE. - ELSE IF( INIBL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' INIBL ', INIBL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( ISHFTS( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' ISHFTS ', ISHFTS( 1 ), 1 - FATAL = .TRUE. - ELSE IF( IACC22( 1 ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' IACC22 ', IACC22( 1 ), 0 - FATAL = .TRUE. - END IF - CALL XLAENV( 1, NBVAL( 1 ) ) - CALL XLAENV( 2, NBMIN( 1 ) ) - CALL XLAENV( 3, NXVAL( 1 ) ) - CALL XLAENV(12, MAX( 11, INMIN( 1 ) ) ) - CALL XLAENV(13, INWIN( 1 ) ) - CALL XLAENV(14, INIBL( 1 ) ) - CALL XLAENV(15, ISHFTS( 1 ) ) - CALL XLAENV(16, IACC22( 1 ) ) - WRITE( NOUT, FMT = 9983 )'NB: ', NBVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'NBMIN:', NBMIN( 1 ) - WRITE( NOUT, FMT = 9983 )'NX: ', NXVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'INMIN: ', INMIN( 1 ) - WRITE( NOUT, FMT = 9983 )'INWIN: ', INWIN( 1 ) - WRITE( NOUT, FMT = 9983 )'INIBL: ', INIBL( 1 ) - WRITE( NOUT, FMT = 9983 )'ISHFTS: ', ISHFTS( 1 ) - WRITE( NOUT, FMT = 9983 )'IACC22: ', IACC22( 1 ) -* - ELSE IF( ZGS .OR. ZGX .OR. ZGV .OR. ZXV ) THEN -* -* For the nonsymmetric generalized driver routines, only one set of -* parameters is allowed. -* - READ( NIN, FMT = * )NBVAL( 1 ), NBMIN( 1 ), NXVAL( 1 ), - $ NSVAL( 1 ), MXBVAL( 1 ) - IF( NBVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NBMIN( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NXVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( 1 ), 1 - FATAL = .TRUE. - ELSE IF( NSVAL( 1 ).LT.2 ) THEN - WRITE( NOUT, FMT = 9989 )' NS ', NSVAL( 1 ), 2 - FATAL = .TRUE. - ELSE IF( MXBVAL( 1 ).LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )' MAXB ', MXBVAL( 1 ), 1 - FATAL = .TRUE. - END IF - CALL XLAENV( 1, NBVAL( 1 ) ) - CALL XLAENV( 2, NBMIN( 1 ) ) - CALL XLAENV( 3, NXVAL( 1 ) ) - CALL XLAENV( 4, NSVAL( 1 ) ) - CALL XLAENV( 8, MXBVAL( 1 ) ) - WRITE( NOUT, FMT = 9983 )'NB: ', NBVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'NBMIN:', NBMIN( 1 ) - WRITE( NOUT, FMT = 9983 )'NX: ', NXVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'NS: ', NSVAL( 1 ) - WRITE( NOUT, FMT = 9983 )'MAXB: ', MXBVAL( 1 ) - ELSE IF( .NOT.ZHB .AND. .NOT.GLM .AND. .NOT.GQR .AND. .NOT. - $ GSV .AND. 
.NOT.CSD .AND. .NOT.LSE ) THEN -* -* For the other paths, the number of parameters can be varied -* from the input file. Read the number of parameter values. -* - READ( NIN, FMT = * )NPARMS - IF( NPARMS.LT.1 ) THEN - WRITE( NOUT, FMT = 9989 )'NPARMS', NPARMS, 1 - NPARMS = 0 - FATAL = .TRUE. - ELSE IF( NPARMS.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9988 )'NPARMS', NPARMS, MAXIN - NPARMS = 0 - FATAL = .TRUE. - END IF -* -* Read the values of NB -* - IF( .NOT.ZBB ) THEN - READ( NIN, FMT = * )( NBVAL( I ), I = 1, NPARMS ) - DO 70 I = 1, NPARMS - IF( NBVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NBVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' NB ', NBVAL( I ), NMAX - FATAL = .TRUE. - END IF - 70 CONTINUE - WRITE( NOUT, FMT = 9983 )'NB: ', - $ ( NBVAL( I ), I = 1, NPARMS ) - END IF -* -* Read the values of NBMIN -* - IF( NEP .OR. SEP .OR. SVD .OR. ZGG ) THEN - READ( NIN, FMT = * )( NBMIN( I ), I = 1, NPARMS ) - DO 80 I = 1, NPARMS - IF( NBMIN( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( I ), 0 - FATAL = .TRUE. - ELSE IF( NBMIN( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )'NBMIN ', NBMIN( I ), NMAX - FATAL = .TRUE. - END IF - 80 CONTINUE - WRITE( NOUT, FMT = 9983 )'NBMIN:', - $ ( NBMIN( I ), I = 1, NPARMS ) - ELSE - DO 90 I = 1, NPARMS - NBMIN( I ) = 1 - 90 CONTINUE - END IF -* -* Read the values of NX -* - IF( NEP .OR. SEP .OR. SVD ) THEN - READ( NIN, FMT = * )( NXVAL( I ), I = 1, NPARMS ) - DO 100 I = 1, NPARMS - IF( NXVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NXVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' NX ', NXVAL( I ), NMAX - FATAL = .TRUE. - END IF - 100 CONTINUE - WRITE( NOUT, FMT = 9983 )'NX: ', - $ ( NXVAL( I ), I = 1, NPARMS ) - ELSE - DO 110 I = 1, NPARMS - NXVAL( I ) = 1 - 110 CONTINUE - END IF -* -* Read the values of NSHIFT (if ZGG) or NRHS (if SVD -* or ZBB). -* - IF( SVD .OR. ZBB .OR. ZGG ) THEN - READ( NIN, FMT = * )( NSVAL( I ), I = 1, NPARMS ) - DO 120 I = 1, NPARMS - IF( NSVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' NS ', NSVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NSVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' NS ', NSVAL( I ), NMAX - FATAL = .TRUE. - END IF - 120 CONTINUE - WRITE( NOUT, FMT = 9983 )'NS: ', - $ ( NSVAL( I ), I = 1, NPARMS ) - ELSE - DO 130 I = 1, NPARMS - NSVAL( I ) = 1 - 130 CONTINUE - END IF -* -* Read the values for MAXB. -* - IF( ZGG ) THEN - READ( NIN, FMT = * )( MXBVAL( I ), I = 1, NPARMS ) - DO 140 I = 1, NPARMS - IF( MXBVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' MAXB ', MXBVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( MXBVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )' MAXB ', MXBVAL( I ), NMAX - FATAL = .TRUE. - END IF - 140 CONTINUE - WRITE( NOUT, FMT = 9983 )'MAXB: ', - $ ( MXBVAL( I ), I = 1, NPARMS ) - ELSE - DO 150 I = 1, NPARMS - MXBVAL( I ) = 1 - 150 CONTINUE - END IF -* -* Read the values for INMIN. -* - IF( NEP ) THEN - READ( NIN, FMT = * )( INMIN( I ), I = 1, NPARMS ) - DO 540 I = 1, NPARMS - IF( INMIN( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' INMIN ', INMIN( I ), 0 - FATAL = .TRUE. - END IF - 540 CONTINUE - WRITE( NOUT, FMT = 9983 )'INMIN: ', - $ ( INMIN( I ), I = 1, NPARMS ) - ELSE - DO 550 I = 1, NPARMS - INMIN( I ) = 1 - 550 CONTINUE - END IF -* -* Read the values for INWIN. -* - IF( NEP ) THEN - READ( NIN, FMT = * )( INWIN( I ), I = 1, NPARMS ) - DO 560 I = 1, NPARMS - IF( INWIN( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' INWIN ', INWIN( I ), 0 - FATAL = .TRUE. 
- END IF - 560 CONTINUE - WRITE( NOUT, FMT = 9983 )'INWIN: ', - $ ( INWIN( I ), I = 1, NPARMS ) - ELSE - DO 570 I = 1, NPARMS - INWIN( I ) = 1 - 570 CONTINUE - END IF -* -* Read the values for INIBL. -* - IF( NEP ) THEN - READ( NIN, FMT = * )( INIBL( I ), I = 1, NPARMS ) - DO 580 I = 1, NPARMS - IF( INIBL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' INIBL ', INIBL( I ), 0 - FATAL = .TRUE. - END IF - 580 CONTINUE - WRITE( NOUT, FMT = 9983 )'INIBL: ', - $ ( INIBL( I ), I = 1, NPARMS ) - ELSE - DO 590 I = 1, NPARMS - INIBL( I ) = 1 - 590 CONTINUE - END IF -* -* Read the values for ISHFTS. -* - IF( NEP ) THEN - READ( NIN, FMT = * )( ISHFTS( I ), I = 1, NPARMS ) - DO 600 I = 1, NPARMS - IF( ISHFTS( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' ISHFTS ', ISHFTS( I ), 0 - FATAL = .TRUE. - END IF - 600 CONTINUE - WRITE( NOUT, FMT = 9983 )'ISHFTS: ', - $ ( ISHFTS( I ), I = 1, NPARMS ) - ELSE - DO 610 I = 1, NPARMS - ISHFTS( I ) = 1 - 610 CONTINUE - END IF -* -* Read the values for IACC22. -* - IF( NEP .OR. ZGG ) THEN - READ( NIN, FMT = * )( IACC22( I ), I = 1, NPARMS ) - DO 620 I = 1, NPARMS - IF( IACC22( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )' IACC22 ', IACC22( I ), 0 - FATAL = .TRUE. - END IF - 620 CONTINUE - WRITE( NOUT, FMT = 9983 )'IACC22: ', - $ ( IACC22( I ), I = 1, NPARMS ) - ELSE - DO 630 I = 1, NPARMS - IACC22( I ) = 1 - 630 CONTINUE - END IF -* -* Read the values for NBCOL. -* - IF( ZGG ) THEN - READ( NIN, FMT = * )( NBCOL( I ), I = 1, NPARMS ) - DO 160 I = 1, NPARMS - IF( NBCOL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9989 )'NBCOL ', NBCOL( I ), 0 - FATAL = .TRUE. - ELSE IF( NBCOL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9988 )'NBCOL ', NBCOL( I ), NMAX - FATAL = .TRUE. - END IF - 160 CONTINUE - WRITE( NOUT, FMT = 9983 )'NBCOL:', - $ ( NBCOL( I ), I = 1, NPARMS ) - ELSE - DO 170 I = 1, NPARMS - NBCOL( I ) = 1 - 170 CONTINUE - END IF - END IF -* -* Calculate and print the machine dependent constants. -* - WRITE( NOUT, FMT = * ) - EPS = DLAMCH( 'Underflow threshold' ) - WRITE( NOUT, FMT = 9981 )'underflow', EPS - EPS = DLAMCH( 'Overflow threshold' ) - WRITE( NOUT, FMT = 9981 )'overflow ', EPS - EPS = DLAMCH( 'Epsilon' ) - WRITE( NOUT, FMT = 9981 )'precision', EPS -* -* Read the threshold value for the test ratios. -* - READ( NIN, FMT = * )THRESH - WRITE( NOUT, FMT = 9982 )THRESH - IF( SEP .OR. SVD .OR. ZGG ) THEN -* -* Read the flag that indicates whether to test LAPACK routines. -* - READ( NIN, FMT = * )TSTCHK -* -* Read the flag that indicates whether to test driver routines. -* - READ( NIN, FMT = * )TSTDRV - END IF -* -* Read the flag that indicates whether to test the error exits. -* - READ( NIN, FMT = * )TSTERR -* -* Read the code describing how to set the random number seed. -* - READ( NIN, FMT = * )NEWSD -* -* If NEWSD = 2, read another line with 4 integers for the seed. -* - IF( NEWSD.EQ.2 ) - $ READ( NIN, FMT = * )( IOLDSD( I ), I = 1, 4 ) -* - DO 180 I = 1, 4 - ISEED( I ) = IOLDSD( I ) - 180 CONTINUE -* - IF( FATAL ) THEN - WRITE( NOUT, FMT = 9999 ) - STOP - END IF -* -* Read the input lines indicating the test path and its parameters. -* The first three characters indicate the test path, and the number -* of test matrix types must be the first nonblank item in columns -* 4-80. -* - 190 CONTINUE -* - IF( .NOT.( ZGX .OR. 
ZXV ) ) THEN -* - 200 CONTINUE - READ( NIN, FMT = '(A80)', END = 380 )LINE - C3 = LINE( 1: 3 ) - LENP = LEN( LINE ) - I = 3 - ITMP = 0 - I1 = 0 - 210 CONTINUE - I = I + 1 - IF( I.GT.LENP ) THEN - IF( I1.GT.0 ) THEN - GO TO 240 - ELSE - NTYPES = MAXT - GO TO 240 - END IF - END IF - IF( LINE( I: I ).NE.' ' .AND. LINE( I: I ).NE.',' ) THEN - I1 = I - C1 = LINE( I1: I1 ) -* -* Check that a valid integer was read -* - DO 220 K = 1, 10 - IF( C1.EQ.INTSTR( K: K ) ) THEN - IC = K - 1 - GO TO 230 - END IF - 220 CONTINUE - WRITE( NOUT, FMT = 9991 )I, LINE - GO TO 200 - 230 CONTINUE - ITMP = 10*ITMP + IC - GO TO 210 - ELSE IF( I1.GT.0 ) THEN - GO TO 240 - ELSE - GO TO 210 - END IF - 240 CONTINUE - NTYPES = ITMP -* -* Skip the tests if NTYPES is <= 0. -* - IF( .NOT.( ZEV .OR. ZES .OR. ZVX .OR. ZSX .OR. ZGV .OR. - $ ZGS ) .AND. NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - GO TO 200 - END IF -* - ELSE - IF( ZGX ) - $ C3 = 'ZGX' - IF( ZXV ) - $ C3 = 'ZXV' - END IF -* -* Reset the random number seed. -* - IF( NEWSD.EQ.0 ) THEN - DO 250 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 250 CONTINUE - END IF -* - IF( LSAMEN( 3, C3, 'ZHS' ) .OR. LSAMEN( 3, C3, 'NEP' ) ) THEN -* -* ------------------------------------- -* NEP: Nonsymmetric Eigenvalue Problem -* ------------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NX = crossover point -* NS = number of shifts -* MAXB = minimum submatrix size -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL ZERRHS( 'ZHSEQR', NOUT ) - DO 270 I = 1, NPARMS - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 3, NXVAL( I ) ) - CALL XLAENV(12, MAX( 11, INMIN( I ) ) ) - CALL XLAENV(13, INWIN( I ) ) - CALL XLAENV(14, INIBL( I ) ) - CALL XLAENV(15, ISHFTS( I ) ) - CALL XLAENV(16, IACC22( I ) ) -* - IF( NEWSD.EQ.0 ) THEN - DO 260 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 260 CONTINUE - END IF - WRITE( NOUT, FMT = 9961 )C3, NBVAL( I ), NBMIN( I ), - $ NXVAL( I ), MAX( 11, INMIN(I)), - $ INWIN( I ), INIBL( I ), ISHFTS( I ), IACC22( I ) - CALL ZCHKHS( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 5 ), NMAX, A( 1, 6 ), - $ A( 1, 7 ), DC( 1, 1 ), DC( 1, 2 ), A( 1, 8 ), - $ A( 1, 9 ), A( 1, 10 ), A( 1, 11 ), A( 1, 12 ), - $ DC( 1, 3 ), WORK, LWORK, RWORK, IWORK, LOGWRK, - $ RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZCHKHS', INFO - 270 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'ZST' ) .OR. LSAMEN( 3, C3, 'SEP' ) - $ .OR. 
LSAMEN( 3, C3, 'SE2' ) ) THEN -* -* ---------------------------------- -* SEP: Symmetric Eigenvalue Problem -* ---------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NX = crossover point -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 1, 1 ) - CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL ZERRST( 'ZST', NOUT ) - DO 290 I = 1, NPARMS - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 3, NXVAL( I ) ) -* - IF( NEWSD.EQ.0 ) THEN - DO 280 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 280 CONTINUE - END IF - WRITE( NOUT, FMT = 9997 )C3, NBVAL( I ), NBMIN( I ), - $ NXVAL( I ) - IF( TSTCHK ) THEN - IF( LSAMEN( 3, C3, 'SE2' ) ) THEN - CALL ZCHKST2STG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), - $ DR( 1, 1 ), DR( 1, 2 ), DR( 1, 3 ), - $ DR( 1, 4 ), DR( 1, 5 ), DR( 1, 6 ), - $ DR( 1, 7 ), DR( 1, 8 ), DR( 1, 9 ), - $ DR( 1, 10 ), DR( 1, 11 ), A( 1, 3 ), NMAX, - $ A( 1, 4 ), A( 1, 5 ), DC( 1, 1 ), A( 1, 6 ), - $ WORK, LWORK, RWORK, LWORK, IWORK, LIWORK, - $ RESULT, INFO ) - ELSE - CALL ZCHKST( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), - $ DR( 1, 1 ), DR( 1, 2 ), DR( 1, 3 ), - $ DR( 1, 4 ), DR( 1, 5 ), DR( 1, 6 ), - $ DR( 1, 7 ), DR( 1, 8 ), DR( 1, 9 ), - $ DR( 1, 10 ), DR( 1, 11 ), A( 1, 3 ), NMAX, - $ A( 1, 4 ), A( 1, 5 ), DC( 1, 1 ), A( 1, 6 ), - $ WORK, LWORK, RWORK, LWORK, IWORK, LIWORK, - $ RESULT, INFO ) - ENDIF - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZCHKST', INFO - END IF - IF( TSTDRV ) THEN - IF( LSAMEN( 3, C3, 'SE2' ) ) THEN - CALL ZDRVST2STG( NN, NVAL, 18, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, DR( 1, 3 ), DR( 1, 4 ), - $ DR( 1, 5 ), DR( 1, 8 ), DR( 1, 9 ), - $ DR( 1, 10 ), A( 1, 2 ), NMAX, A( 1, 3 ), - $ DC( 1, 1 ), A( 1, 4 ), WORK, LWORK, RWORK, - $ LWORK, IWORK, LIWORK, RESULT, INFO ) - ELSE - CALL ZDRVST( NN, NVAL, 18, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, DR( 1, 3 ), DR( 1, 4 ), - $ DR( 1, 5 ), DR( 1, 8 ), DR( 1, 9 ), - $ DR( 1, 10 ), A( 1, 2 ), NMAX, A( 1, 3 ), - $ DC( 1, 1 ), A( 1, 4 ), WORK, LWORK, RWORK, - $ LWORK, IWORK, LIWORK, RESULT, INFO ) - ENDIF - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZDRVST', INFO - END IF - 290 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'ZSG' ) ) THEN -* -* ---------------------------------------------- -* ZSG: Hermitian Generalized Eigenvalue Problem -* ---------------------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NX = crossover point -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 9, 25 ) - DO 310 I = 1, NPARMS - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 3, NXVAL( I ) ) -* - IF( NEWSD.EQ.0 ) THEN - DO 300 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 300 CONTINUE - END IF - WRITE( NOUT, FMT = 9997 )C3, NBVAL( I ), NBMIN( I ), - $ NXVAL( I ) - IF( TSTCHK ) THEN -* CALL ZDRVSG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, -* $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, -* $ DR( 1, 3 ), A( 1, 3 ), NMAX, A( 1, 4 ), -* $ A( 1, 5 ), A( 1, 6 ), A( 1, 7 ), WORK, -* $ LWORK, RWORK, LWORK, IWORK, LIWORK, RESULT, -* $ INFO ) - CALL ZDRVSG2STG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, - $ DR( 1, 3 ), DR( 1, 4 ), A( 1, 3 ), NMAX, - $ A( 1, 4 ), A( 1, 5 ), A( 1, 6 ), - $ A( 1, 7 ), WORK, LWORK, RWORK, LWORK, - $ IWORK, LIWORK, 
RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZDRVSG', INFO - END IF - 310 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'ZBD' ) .OR. LSAMEN( 3, C3, 'SVD' ) ) THEN -* -* ---------------------------------- -* SVD: Singular Value Decomposition -* ---------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NX = crossover point -* NRHS = number of right hand sides -* - MAXTYP = 16 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 9, 25 ) -* -* Test the error exits -* - CALL XLAENV( 1, 1 ) - IF( TSTERR .AND. TSTCHK ) - $ CALL ZERRBD( 'ZBD', NOUT ) - IF( TSTERR .AND. TSTDRV ) - $ CALL ZERRED( 'ZBD', NOUT ) -* - DO 330 I = 1, NPARMS - NRHS = NSVAL( I ) - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 3, NXVAL( I ) ) - IF( NEWSD.EQ.0 ) THEN - DO 320 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 320 CONTINUE - END IF - WRITE( NOUT, FMT = 9995 )C3, NBVAL( I ), NBMIN( I ), - $ NXVAL( I ), NRHS - IF( TSTCHK ) THEN - CALL ZCHKBD( NN, MVAL, NVAL, MAXTYP, DOTYPE, NRHS, ISEED, - $ THRESH, A( 1, 1 ), NMAX, DR( 1, 1 ), - $ DR( 1, 2 ), DR( 1, 3 ), DR( 1, 4 ), - $ A( 1, 2 ), NMAX, A( 1, 3 ), A( 1, 4 ), - $ A( 1, 5 ), NMAX, A( 1, 6 ), NMAX, A( 1, 7 ), - $ A( 1, 8 ), WORK, LWORK, RWORK, NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZCHKBD', INFO - END IF - IF( TSTDRV ) - $ CALL ZDRVBD( NN, MVAL, NVAL, MAXTYP, DOTYPE, ISEED, - $ THRESH, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, - $ A( 1, 3 ), NMAX, A( 1, 4 ), A( 1, 5 ), - $ A( 1, 6 ), DR( 1, 1 ), DR( 1, 2 ), - $ DR( 1, 3 ), WORK, LWORK, RWORK, IWORK, NOUT, - $ INFO ) - 330 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'ZEV' ) ) THEN -* -* -------------------------------------------- -* ZEV: Nonsymmetric Eigenvalue Problem Driver -* ZGEEV (eigenvalues and eigenvectors) -* -------------------------------------------- -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL ZERRED( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL ZDRVEV( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), DC( 1, 1 ), - $ DC( 1, 2 ), A( 1, 3 ), NMAX, A( 1, 4 ), NMAX, - $ A( 1, 5 ), NMAX, RESULT, WORK, LWORK, RWORK, - $ IWORK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZGEEV', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'ZES' ) ) THEN -* -* -------------------------------------------- -* ZES: Nonsymmetric Eigenvalue Problem Driver -* ZGEES (Schur form) -* -------------------------------------------- -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL ZERRED( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL ZDRVES( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ DC( 1, 1 ), DC( 1, 2 ), A( 1, 4 ), NMAX, - $ RESULT, WORK, LWORK, RWORK, IWORK, LOGWRK, - $ INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZGEES', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'ZVX' ) ) THEN -* -* -------------------------------------------------------------- -* ZVX: Nonsymmetric Eigenvalue Problem Expert Driver -* ZGEEVX (eigenvalues, eigenvectors and condition numbers) -* -------------------------------------------------------------- -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LT.0 ) THEN - 
WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL ZERRED( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL ZDRVVX( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NIN, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), DC( 1, 1 ), - $ DC( 1, 2 ), A( 1, 3 ), NMAX, A( 1, 4 ), NMAX, - $ A( 1, 5 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), - $ DR( 1, 3 ), DR( 1, 4 ), DR( 1, 5 ), DR( 1, 6 ), - $ DR( 1, 7 ), DR( 1, 8 ), RESULT, WORK, LWORK, - $ RWORK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZGEEVX', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'ZSX' ) ) THEN -* -* --------------------------------------------------- -* ZSX: Nonsymmetric Eigenvalue Problem Expert Driver -* ZGEESX (Schur form and condition numbers) -* --------------------------------------------------- -* - MAXTYP = 21 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LT.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL ZERRED( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL ZDRVSX( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NIN, - $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ DC( 1, 1 ), DC( 1, 2 ), DC( 1, 3 ), A( 1, 4 ), - $ NMAX, A( 1, 5 ), RESULT, WORK, LWORK, RWORK, - $ LOGWRK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZGEESX', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'ZGG' ) ) THEN -* -* ------------------------------------------------- -* ZGG: Generalized Nonsymmetric Eigenvalue Problem -* ------------------------------------------------- -* Vary the parameters -* NB = block size -* NBMIN = minimum block size -* NS = number of shifts -* MAXB = minimum submatrix size -* IACC22: structured matrix multiply -* NBCOL = minimum column dimension for blocks -* - MAXTYP = 26 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV(1,1) - IF( TSTCHK .AND. TSTERR ) - $ CALL ZERRGG( C3, NOUT ) - DO 350 I = 1, NPARMS - CALL XLAENV( 1, NBVAL( I ) ) - CALL XLAENV( 2, NBMIN( I ) ) - CALL XLAENV( 4, NSVAL( I ) ) - CALL XLAENV( 8, MXBVAL( I ) ) - CALL XLAENV( 16, IACC22( I ) ) - CALL XLAENV( 5, NBCOL( I ) ) -* - IF( NEWSD.EQ.0 ) THEN - DO 340 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 340 CONTINUE - END IF - WRITE( NOUT, FMT = 9996 )C3, NBVAL( I ), NBMIN( I ), - $ NSVAL( I ), MXBVAL( I ), IACC22( I ), NBCOL( I ) - TSTDIF = .FALSE. 
- THRSHN = 10.D0 - IF( TSTCHK ) THEN - CALL ZCHKGG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, - $ TSTDIF, THRSHN, NOUT, A( 1, 1 ), NMAX, - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ A( 1, 6 ), A( 1, 7 ), A( 1, 8 ), A( 1, 9 ), - $ NMAX, A( 1, 10 ), A( 1, 11 ), A( 1, 12 ), - $ DC( 1, 1 ), DC( 1, 2 ), DC( 1, 3 ), - $ DC( 1, 4 ), A( 1, 13 ), A( 1, 14 ), WORK, - $ LWORK, RWORK, LOGWRK, RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZCHKGG', INFO - END IF - 350 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'ZGS' ) ) THEN -* -* ------------------------------------------------- -* ZGS: Generalized Nonsymmetric Eigenvalue Problem -* ZGGES (Schur form) -* ------------------------------------------------- -* - MAXTYP = 26 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL ZERRGG( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL ZDRGES( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), - $ DC( 1, 1 ), DC( 1, 2 ), WORK, LWORK, RWORK, - $ RESULT, LOGWRK, INFO ) -* - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZDRGES', INFO -* -* Blocked version -* - CALL ZDRGES3( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), - $ DC( 1, 1 ), DC( 1, 2 ), WORK, LWORK, RWORK, - $ RESULT, LOGWRK, INFO ) -* - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZDRGES3', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( ZGX ) THEN -* -* ------------------------------------------------- -* ZGX Generalized Nonsymmetric Eigenvalue Problem -* ZGGESX (Schur form and condition numbers) -* ------------------------------------------------- -* - MAXTYP = 5 - NTYPES = MAXTYP - IF( NN.LT.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL ZERRGG( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL XLAENV( 5, 2 ) - CALL ZDRGSX( NN, NCMAX, THRESH, NIN, NOUT, A( 1, 1 ), NMAX, - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ A( 1, 6 ), DC( 1, 1 ), DC( 1, 2 ), C, - $ NCMAX*NCMAX, S, WORK, LWORK, RWORK, IWORK, - $ LIWORK, LOGWRK, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZDRGSX', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'ZGV' ) ) THEN -* -* ------------------------------------------------- -* ZGV: Generalized Nonsymmetric Eigenvalue Problem -* ZGGEV (Eigenvalue/vector form) -* ------------------------------------------------- -* - MAXTYP = 26 - NTYPES = MIN( MAXTYP, NTYPES ) - IF( NTYPES.LE.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL ZERRGG( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL ZDRGEV( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), - $ A( 1, 9 ), NMAX, DC( 1, 1 ), DC( 1, 2 ), - $ DC( 1, 3 ), DC( 1, 4 ), WORK, LWORK, RWORK, - $ RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZDRGEV', INFO -* -* Blocked version -* - CALL XLAENV(16,2) - CALL ZDRGEV3( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, - $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), - $ A( 1, 9 ), NMAX, DC( 1, 1 ), DC( 1, 2 ), - $ DC( 1, 3 ), DC( 1, 4 ), WORK, LWORK, RWORK, - $ RESULT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZDRGEV3', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE 
IF( ZXV ) THEN -* -* ------------------------------------------------- -* ZXV: Generalized Nonsymmetric Eigenvalue Problem -* ZGGEVX (eigenvalue/vector with condition numbers) -* ------------------------------------------------- -* - MAXTYP = 2 - NTYPES = MAXTYP - IF( NN.LT.0 ) THEN - WRITE( NOUT, FMT = 9990 )C3 - ELSE - IF( TSTERR ) - $ CALL ZERRGG( C3, NOUT ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - CALL ZDRGVX( NN, THRESH, NIN, NOUT, A( 1, 1 ), NMAX, - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), DC( 1, 1 ), - $ DC( 1, 2 ), A( 1, 5 ), A( 1, 6 ), IWORK( 1 ), - $ IWORK( 2 ), DR( 1, 1 ), DR( 1, 2 ), DR( 1, 3 ), - $ DR( 1, 4 ), DR( 1, 5 ), DR( 1, 6 ), WORK, - $ LWORK, RWORK, IWORK( 3 ), LIWORK-2, RESULT, - $ LOGWRK, INFO ) -* - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZDRGVX', INFO - END IF - WRITE( NOUT, FMT = 9973 ) - GO TO 10 -* - ELSE IF( LSAMEN( 3, C3, 'ZHB' ) ) THEN -* -* ------------------------------ -* ZHB: Hermitian Band Reduction -* ------------------------------ -* - MAXTYP = 15 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - IF( TSTERR ) - $ CALL ZERRST( 'ZHB', NOUT ) -* CALL ZCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, -* $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), -* $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, -* $ INFO ) - CALL ZCHKHB2STG( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, - $ THRESH, NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), - $ DR( 1, 2 ), DR( 1, 3 ), DR( 1, 4 ), DR( 1, 5 ), - $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, - $ INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZCHKHB', INFO -* - ELSE IF( LSAMEN( 3, C3, 'ZBB' ) ) THEN -* -* ------------------------------ -* ZBB: General Band Reduction -* ------------------------------ -* - MAXTYP = 15 - NTYPES = MIN( MAXTYP, NTYPES ) - CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - DO 370 I = 1, NPARMS - NRHS = NSVAL( I ) -* - IF( NEWSD.EQ.0 ) THEN - DO 360 K = 1, 4 - ISEED( K ) = IOLDSD( K ) - 360 CONTINUE - END IF - WRITE( NOUT, FMT = 9966 )C3, NRHS - CALL ZCHKBB( NN, MVAL, NVAL, NK, KVAL, MAXTYP, DOTYPE, NRHS, - $ ISEED, THRESH, NOUT, A( 1, 1 ), NMAX, - $ A( 1, 2 ), 2*NMAX, DR( 1, 1 ), DR( 1, 2 ), - $ A( 1, 4 ), NMAX, A( 1, 5 ), NMAX, A( 1, 6 ), - $ NMAX, A( 1, 7 ), WORK, LWORK, RWORK, RESULT, - $ INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZCHKBB', INFO - 370 CONTINUE -* - ELSE IF( LSAMEN( 3, C3, 'GLM' ) ) THEN -* -* ----------------------------------------- -* GLM: Generalized Linear Regression Model -* ----------------------------------------- -* - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL ZERRGG( 'GLM', NOUT ) - CALL ZCKGLM( NN, NVAL, MVAL, PVAL, NTYPES, ISEED, THRESH, NMAX, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), X, - $ WORK, DR( 1, 1 ), NIN, NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZCKGLM', INFO -* - ELSE IF( LSAMEN( 3, C3, 'GQR' ) ) THEN -* -* ------------------------------------------ -* GQR: Generalized QR and RQ factorizations -* ------------------------------------------ -* - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL ZERRGG( 'GQR', NOUT ) - CALL ZCKGQR( NN, MVAL, NN, PVAL, NN, NVAL, NTYPES, ISEED, - $ THRESH, NMAX, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ A( 1, 4 ), TAUA, B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ B( 1, 4 ), B( 1, 5 ), TAUB, WORK, DR( 1, 1 ), NIN, - $ NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZCKGQR', INFO -* - ELSE IF( LSAMEN( 3, C3, 'GSV' ) ) THEN -* -* ---------------------------------------------- -* GSV: Generalized Singular Value Decomposition -* 
---------------------------------------------- -* - CALL XLAENV(1,1) - IF( TSTERR ) - $ CALL ZERRGG( 'GSV', NOUT ) - CALL ZCKGSV( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ A( 1, 3 ), B( 1, 3 ), A( 1, 4 ), ALPHA, BETA, - $ B( 1, 4 ), IWORK, WORK, DR( 1, 1 ), NIN, NOUT, - $ INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZCKGSV', INFO -* - ELSE IF( LSAMEN( 3, C3, 'CSD' ) ) THEN -* -* ---------------------------------------------- -* CSD: CS Decomposition -* ---------------------------------------------- -* - CALL XLAENV(1,1) - IF( TSTERR ) - $ CALL ZERRGG( 'CSD', NOUT ) - CALL ZCKCSD( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), - $ A( 1, 5 ), A( 1, 6 ), RWORK, IWORK, WORK, - $ DR( 1, 1 ), NIN, NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZCKCSD', INFO -* - ELSE IF( LSAMEN( 3, C3, 'LSE' ) ) THEN -* -* -------------------------------------- -* LSE: Constrained Linear Least Squares -* -------------------------------------- -* - CALL XLAENV( 1, 1 ) - IF( TSTERR ) - $ CALL ZERRGG( 'LSE', NOUT ) - CALL ZCKLSE( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), X, - $ WORK, DR( 1, 1 ), NIN, NOUT, INFO ) - IF( INFO.NE.0 ) - $ WRITE( NOUT, FMT = 9980 )'ZCKLSE', INFO - ELSE - WRITE( NOUT, FMT = * ) - WRITE( NOUT, FMT = * ) - WRITE( NOUT, FMT = 9992 )C3 - END IF - IF( .NOT.( ZGX .OR. ZXV ) ) - $ GO TO 190 - 380 CONTINUE - WRITE( NOUT, FMT = 9994 ) - S2 = DSECND( ) - WRITE( NOUT, FMT = 9993 )S2 - S1 -* - 9999 FORMAT( / ' Execution not attempted due to input errors' ) - 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) - 9996 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NS =', I4, - $ ', MAXB =', I4, ', IACC22 =', I4, ', NBCOL =', I4 ) - 9995 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4, - $ ', NRHS =', I4 ) - 9994 FORMAT( / / ' End of tests' ) - 9993 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) - 9992 FORMAT( 1X, A3, ': Unrecognized path name' ) - 9991 FORMAT( / / ' *** Invalid integer value in column ', I2, - $ ' of input', ' line:', / A79 ) - 9990 FORMAT( / / 1X, A3, ' routines were not tested' ) - 9989 FORMAT( ' Invalid input value: ', A, '=', I6, '; must be >=', - $ I6 ) - 9988 FORMAT( ' Invalid input value: ', A, '=', I6, '; must be <=', - $ I6 ) - 9987 FORMAT( ' Tests of the Nonsymmetric Eigenvalue Problem routines' ) - 9986 FORMAT( ' Tests of the Hermitian Eigenvalue Problem routines' ) - 9985 FORMAT( ' Tests of the Singular Value Decomposition routines' ) - 9984 FORMAT( / ' The following parameter values will be used:' ) - 9983 FORMAT( 4X, A, 10I6, / 10X, 10I6 ) - 9982 FORMAT( / ' Routines pass computational tests if test ratio is ', - $ 'less than', F8.2, / ) - 9981 FORMAT( ' Relative machine ', A, ' is taken to be', D16.6 ) - 9980 FORMAT( ' *** Error code from ', A, ' = ', I4 ) - 9979 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Driver', - $ / ' ZGEEV (eigenvalues and eigevectors)' ) - 9978 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Driver', - $ / ' ZGEES (Schur form)' ) - 9977 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Expert', - $ ' Driver', / ' ZGEEVX (eigenvalues, eigenvectors and', - $ ' condition numbers)' ) - 9976 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Expert', - $ ' Driver', / ' ZGEESX (Schur form and condition', - $ ' numbers)' ) - 9975 FORMAT( / ' Tests of the Generalized Nonsymmetric 
Eigenvalue ', - $ 'Problem routines' ) - 9974 FORMAT( ' Tests of ZHBTRD', / ' (reduction of a Hermitian band ', - $ 'matrix to real tridiagonal form)' ) - 9973 FORMAT( / 1X, 71( '-' ) ) - 9972 FORMAT( / ' LAPACK VERSION ', I1, '.', I1, '.', I1 ) - 9971 FORMAT( / ' Tests of the Generalized Linear Regression Model ', - $ 'routines' ) - 9970 FORMAT( / ' Tests of the Generalized QR and RQ routines' ) - 9969 FORMAT( / ' Tests of the Generalized Singular Value', - $ ' Decomposition routines' ) - 9968 FORMAT( / ' Tests of the Linear Least Squares routines' ) - 9967 FORMAT( ' Tests of ZGBBRD', / ' (reduction of a general band ', - $ 'matrix to real bidiagonal form)' ) - 9966 FORMAT( / / 1X, A3, ': NRHS =', I4 ) - 9965 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', - $ 'Problem Expert Driver ZGGESX' ) - 9964 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', - $ 'Problem Driver ZGGES' ) - 9963 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', - $ 'Problem Driver ZGGEV' ) - 9962 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', - $ 'Problem Expert Driver ZGGEVX' ) - 9961 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4, - $ ', INMIN=', I4, - $ ', INWIN =', I4, ', INIBL =', I4, ', ISHFTS =', I4, - $ ', IACC22 =', I4) - 9960 FORMAT( / ' Tests of the CS Decomposition routines' ) -* -* End of ZCHKEE -* - END From 90bb4ac82100639ea5acf0ac48c409f081eceb48 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Feb 2021 18:49:10 +0100 Subject: [PATCH 126/681] Add rewritten zchkee.F from Reference-LAPACK PR335 --- lapack-netlib/TESTING/EIG/zchkee.F | 2551 ++++++++++++++++++++++++++++ 1 file changed, 2551 insertions(+) create mode 100644 lapack-netlib/TESTING/EIG/zchkee.F diff --git a/lapack-netlib/TESTING/EIG/zchkee.F b/lapack-netlib/TESTING/EIG/zchkee.F new file mode 100644 index 000000000..29604956d --- /dev/null +++ b/lapack-netlib/TESTING/EIG/zchkee.F @@ -0,0 +1,2551 @@ +*> \brief \b ZCHKEE +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZCHKEE +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZCHKEE tests the COMPLEX*16 LAPACK subroutines for the matrix +*> eigenvalue problem. 
The test paths in this version are +*> +*> NEP (Nonsymmetric Eigenvalue Problem): +*> Test ZGEHRD, ZUNGHR, ZHSEQR, ZTREVC, ZHSEIN, and ZUNMHR +*> +*> SEP (Hermitian Eigenvalue Problem): +*> Test ZHETRD, ZUNGTR, ZSTEQR, ZSTERF, ZSTEIN, ZSTEDC, +*> and drivers ZHEEV(X), ZHBEV(X), ZHPEV(X), +*> ZHEEVD, ZHBEVD, ZHPEVD +*> +*> SVD (Singular Value Decomposition): +*> Test ZGEBRD, ZUNGBR, and ZBDSQR +*> and the drivers ZGESVD, ZGESDD +*> +*> ZEV (Nonsymmetric Eigenvalue/eigenvector Driver): +*> Test ZGEEV +*> +*> ZES (Nonsymmetric Schur form Driver): +*> Test ZGEES +*> +*> ZVX (Nonsymmetric Eigenvalue/eigenvector Expert Driver): +*> Test ZGEEVX +*> +*> ZSX (Nonsymmetric Schur form Expert Driver): +*> Test ZGEESX +*> +*> ZGG (Generalized Nonsymmetric Eigenvalue Problem): +*> Test ZGGHD3, ZGGBAL, ZGGBAK, ZHGEQZ, and ZTGEVC +*> +*> ZGS (Generalized Nonsymmetric Schur form Driver): +*> Test ZGGES +*> +*> ZGV (Generalized Nonsymmetric Eigenvalue/eigenvector Driver): +*> Test ZGGEV +*> +*> ZGX (Generalized Nonsymmetric Schur form Expert Driver): +*> Test ZGGESX +*> +*> ZXV (Generalized Nonsymmetric Eigenvalue/eigenvector Expert Driver): +*> Test ZGGEVX +*> +*> ZSG (Hermitian Generalized Eigenvalue Problem): +*> Test ZHEGST, ZHEGV, ZHEGVD, ZHEGVX, ZHPGST, ZHPGV, ZHPGVD, +*> ZHPGVX, ZHBGST, ZHBGV, ZHBGVD, and ZHBGVX +*> +*> ZHB (Hermitian Band Eigenvalue Problem): +*> Test ZHBTRD +*> +*> ZBB (Band Singular Value Decomposition): +*> Test ZGBBRD +*> +*> ZEC (Eigencondition estimation): +*> Test ZTRSYL, ZTREXC, ZTRSNA, and ZTRSEN +*> +*> ZBL (Balancing a general matrix) +*> Test ZGEBAL +*> +*> ZBK (Back transformation on a balanced matrix) +*> Test ZGEBAK +*> +*> ZGL (Balancing a matrix pair) +*> Test ZGGBAL +*> +*> ZGK (Back transformation on a matrix pair) +*> Test ZGGBAK +*> +*> GLM (Generalized Linear Regression Model): +*> Tests ZGGGLM +*> +*> GQR (Generalized QR and RQ factorizations): +*> Tests ZGGQRF and ZGGRQF +*> +*> GSV (Generalized Singular Value Decomposition): +*> Tests ZGGSVD, ZGGSVP, ZTGSJA, ZLAGS2, ZLAPLL, and ZLAPMT +*> +*> CSD (CS decomposition): +*> Tests ZUNCSD +*> +*> LSE (Constrained Linear Least Squares): +*> Tests ZGGLSE +*> +*> Each test path has a different set of inputs, but the data sets for +*> the driver routines xEV, xES, xVX, and xSX can be concatenated in a +*> single input file. The first line of input should contain one of the +*> 3-character path names in columns 1-3. The number of remaining lines +*> depends on what is found on the first line. +*> +*> The number of matrix types used in testing is often controllable from +*> the input file. The number of matrix types for each path, and the +*> test routine that describes them, is as follows: +*> +*> Path name(s) Types Test routine +*> +*> ZHS or NEP 21 ZCHKHS +*> ZST or SEP 21 ZCHKST (routines) +*> 18 ZDRVST (drivers) +*> ZBD or SVD 16 ZCHKBD (routines) +*> 5 ZDRVBD (drivers) +*> ZEV 21 ZDRVEV +*> ZES 21 ZDRVES +*> ZVX 21 ZDRVVX +*> ZSX 21 ZDRVSX +*> ZGG 26 ZCHKGG (routines) +*> ZGS 26 ZDRGES +*> ZGX 5 ZDRGSX +*> ZGV 26 ZDRGEV +*> ZXV 2 ZDRGVX +*> ZSG 21 ZDRVSG +*> ZHB 15 ZCHKHB +*> ZBB 15 ZCHKBB +*> ZEC - ZCHKEC +*> ZBL - ZCHKBL +*> ZBK - ZCHKBK +*> ZGL - ZCHKGL +*> ZGK - ZCHKGK +*> GLM 8 ZCKGLM +*> GQR 8 ZCKGQR +*> GSV 8 ZCKGSV +*> CSD 3 ZCKCSD +*> LSE 8 ZCKLSE +*> +*>----------------------------------------------------------------------- +*> +*> NEP input file: +*> +*> line 2: NN, INTEGER +*> Number of values of N. +*> +*> line 3: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix dimension N. 
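+*> For illustration only (the numbers here are arbitrary and do not +*> come from any shipped input deck), the first three lines of an NEP +*> input file laid out as above could read: +*> NEP +*> 2 +*> 16 32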
+*> +*> line 4: NPARMS, INTEGER +*> Number of values of the parameters NB, NBMIN, NX, NS, and +*> MAXB. +*> +*> line 5: NBVAL, INTEGER array, dimension (NPARMS) +*> The values for the blocksize NB. +*> +*> line 6: NBMIN, INTEGER array, dimension (NPARMS) +*> The values for the minimum blocksize NBMIN. +*> +*> line 7: NXVAL, INTEGER array, dimension (NPARMS) +*> The values for the crossover point NX. +*> +*> line 8: INMIN, INTEGER array, dimension (NPARMS) +*> LAHQR vs TTQRE crossover point, >= 11 +*> +*> line 9: INWIN, INTEGER array, dimension (NPARMS) +*> recommended deflation window size +*> +*> line 10: INIBL, INTEGER array, dimension (NPARMS) +*> nibble crossover point +*> +*> line 11: ISHFTS, INTEGER array, dimension (NPARMS) +*> number of simultaneous shifts) +*> +*> line 12: IACC22, INTEGER array, dimension (NPARMS) +*> select structured matrix multiply: 0, 1 or 2) +*> +*> line 13: THRESH +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. To have all of the test +*> ratios printed, use THRESH = 0.0 . +*> +*> line 14: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 14 was 2: +*> +*> line 15: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 15-EOF: The remaining lines occur in sets of 1 or 2 and allow +*> the user to specify the matrix types. Each line contains +*> a 3-character path name in columns 1-3, and the number +*> of matrix types must be the first nonblank item in columns +*> 4-80. If the number of matrix types is at least 1 but is +*> less than the maximum number of possible types, a second +*> line will be read to get the numbers of the matrix types to +*> be used. For example, +*> NEP 21 +*> requests all of the matrix types for the nonsymmetric +*> eigenvalue problem, while +*> NEP 4 +*> 9 10 11 12 +*> requests only matrices of type 9, 10, 11, and 12. +*> +*> The valid 3-character path names are 'NEP' or 'ZHS' for the +*> nonsymmetric eigenvalue routines. +*> +*>----------------------------------------------------------------------- +*> +*> SEP or ZSG input file: +*> +*> line 2: NN, INTEGER +*> Number of values of N. +*> +*> line 3: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix dimension N. +*> +*> line 4: NPARMS, INTEGER +*> Number of values of the parameters NB, NBMIN, and NX. +*> +*> line 5: NBVAL, INTEGER array, dimension (NPARMS) +*> The values for the blocksize NB. +*> +*> line 6: NBMIN, INTEGER array, dimension (NPARMS) +*> The values for the minimum blocksize NBMIN. +*> +*> line 7: NXVAL, INTEGER array, dimension (NPARMS) +*> The values for the crossover point NX. +*> +*> line 8: THRESH +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 9: TSTCHK, LOGICAL +*> Flag indicating whether or not to test the LAPACK routines. +*> +*> line 10: TSTDRV, LOGICAL +*> Flag indicating whether or not to test the driver routines. +*> +*> line 11: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 12: NEWSD, INTEGER +*> A code indicating how to set the random number seed. 
+*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 12 was 2: +*> +*> line 13: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 13-EOF: Lines specifying matrix types, as for NEP. +*> The valid 3-character path names are 'SEP' or 'ZST' for the +*> Hermitian eigenvalue routines and driver routines, and +*> 'ZSG' for the routines for the Hermitian generalized +*> eigenvalue problem. +*> +*>----------------------------------------------------------------------- +*> +*> SVD input file: +*> +*> line 2: NN, INTEGER +*> Number of values of M and N. +*> +*> line 3: MVAL, INTEGER array, dimension (NN) +*> The values for the matrix row dimension M. +*> +*> line 4: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix column dimension N. +*> +*> line 5: NPARMS, INTEGER +*> Number of values of the parameter NB, NBMIN, NX, and NRHS. +*> +*> line 6: NBVAL, INTEGER array, dimension (NPARMS) +*> The values for the blocksize NB. +*> +*> line 7: NBMIN, INTEGER array, dimension (NPARMS) +*> The values for the minimum blocksize NBMIN. +*> +*> line 8: NXVAL, INTEGER array, dimension (NPARMS) +*> The values for the crossover point NX. +*> +*> line 9: NSVAL, INTEGER array, dimension (NPARMS) +*> The values for the number of right hand sides NRHS. +*> +*> line 10: THRESH +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 11: TSTCHK, LOGICAL +*> Flag indicating whether or not to test the LAPACK routines. +*> +*> line 12: TSTDRV, LOGICAL +*> Flag indicating whether or not to test the driver routines. +*> +*> line 13: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 14: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 14 was 2: +*> +*> line 15: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 15-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path names are 'SVD' or 'ZBD' for both the +*> SVD routines and the SVD driver routines. +*> +*>----------------------------------------------------------------------- +*> +*> ZEV and ZES data files: +*> +*> line 1: 'ZEV' or 'ZES' in columns 1 to 3. +*> +*> line 2: NSIZES, INTEGER +*> Number of sizes of matrices to use. Should be at least 0 +*> and at most 20. If NSIZES = 0, no testing is done +*> (although the remaining 3 lines are still read). +*> +*> line 3: NN, INTEGER array, dimension(NSIZES) +*> Dimensions of matrices to be tested. +*> +*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> These integer parameters determine how blocking is done +*> (see ILAENV for details) +*> NB : block size +*> NBMIN : minimum block size +*> NX : minimum dimension for blocking +*> NS : number of shifts in xHSEQR +*> NBCOL : minimum column dimension for blocking +*> +*> line 5: THRESH, REAL +*> The test threshold against which computed residuals are +*> compared. Should generally be in the range from 10. to 20. +*> If it is 0., all test case data will be printed. 
+*> +*> line 6: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 6 was 2: +*> +*> line 7: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 8 and following: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'ZEV' to test ZGEEV, or +*> 'ZES' to test ZGEES. +*> +*>----------------------------------------------------------------------- +*> +*> The ZVX data has two parts. The first part is identical to ZEV, +*> and the second part consists of test matrices with precomputed +*> solutions. +*> +*> line 1: 'ZVX' in columns 1-3. +*> +*> line 2: NSIZES, INTEGER +*> If NSIZES = 0, no testing of randomly generated examples +*> is done, but any precomputed examples are tested. +*> +*> line 3: NN, INTEGER array, dimension(NSIZES) +*> +*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> +*> line 5: THRESH, REAL +*> +*> line 6: NEWSD, INTEGER +*> +*> If line 6 was 2: +*> +*> line 7: INTEGER array, dimension (4) +*> +*> lines 8 and following: The first line contains 'ZVX' in columns 1-3 +*> followed by the number of matrix types, possibly with +*> a second line to specify certain matrix types. +*> If the number of matrix types = 0, no testing of randomly +*> generated examples is done, but any precomputed examples +*> are tested. +*> +*> remaining lines : Each matrix is stored on 1+N+N**2 lines, where N is +*> its dimension. The first line contains the dimension N and +*> ISRT (two integers). ISRT indicates whether the last N lines +*> are sorted by increasing real part of the eigenvalue +*> (ISRT=0) or by increasing imaginary part (ISRT=1). The next +*> N**2 lines contain the matrix rowwise, one entry per line. +*> The last N lines correspond to each eigenvalue. Each of +*> these last N lines contains 4 real values: the real part of +*> the eigenvalue, the imaginary part of the eigenvalue, the +*> reciprocal condition number of the eigenvalue, and the +*> reciprocal condition number of the eigenvector. The +*> end of data is indicated by dimension N=0. Even if no data +*> is to be tested, there must be at least one line containing +*> N=0. +*> +*>----------------------------------------------------------------------- +*> +*> The ZSX data is like ZVX. The first part is identical to ZEV, and the +*> second part consists of test matrices with precomputed solutions. +*> +*> line 1: 'ZSX' in columns 1-3. +*> +*> line 2: NSIZES, INTEGER +*> If NSIZES = 0, no testing of randomly generated examples +*> is done, but any precomputed examples are tested. +*> +*> line 3: NN, INTEGER array, dimension(NSIZES) +*> +*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> +*> line 5: THRESH, REAL +*> +*> line 6: NEWSD, INTEGER +*> +*> If line 6 was 2: +*> +*> line 7: INTEGER array, dimension (4) +*> +*> lines 8 and following: The first line contains 'ZSX' in columns 1-3 +*> followed by the number of matrix types, possibly with +*> a second line to specify certain matrix types. +*> If the number of matrix types = 0, no testing of randomly +*> generated examples is done, but any precomputed examples +*> are tested. +*> +*> remaining lines : Each matrix is stored on 3+N**2 lines, where N is +*> its dimension. The first line contains the dimension N, the +*> dimension M of an invariant subspace, and ISRT.
The second +*> line contains M integers, identifying the eigenvalues in the +*> invariant subspace (by their position in a list of +*> eigenvalues ordered by increasing real part (if ISRT=0) or +*> by increasing imaginary part (if ISRT=1)). The next N**2 +*> lines contain the matrix rowwise. The last line contains the +*> reciprocal condition number for the average of the selected +*> eigenvalues, and the reciprocal condition number for the +*> corresponding right invariant subspace. The end of data in +*> indicated by a line containing N=0, M=0, and ISRT = 0. Even +*> if no data is to be tested, there must be at least one line +*> containing N=0, M=0 and ISRT=0. +*> +*>----------------------------------------------------------------------- +*> +*> ZGG input file: +*> +*> line 2: NN, INTEGER +*> Number of values of N. +*> +*> line 3: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix dimension N. +*> +*> line 4: NPARMS, INTEGER +*> Number of values of the parameters NB, NBMIN, NBCOL, NS, and +*> MAXB. +*> +*> line 5: NBVAL, INTEGER array, dimension (NPARMS) +*> The values for the blocksize NB. +*> +*> line 6: NBMIN, INTEGER array, dimension (NPARMS) +*> The values for NBMIN, the minimum row dimension for blocks. +*> +*> line 7: NSVAL, INTEGER array, dimension (NPARMS) +*> The values for the number of shifts. +*> +*> line 8: MXBVAL, INTEGER array, dimension (NPARMS) +*> The values for MAXB, used in determining minimum blocksize. +*> +*> line 9: IACC22, INTEGER array, dimension (NPARMS) +*> select structured matrix multiply: 1 or 2) +*> +*> line 10: NBCOL, INTEGER array, dimension (NPARMS) +*> The values for NBCOL, the minimum column dimension for +*> blocks. +*> +*> line 11: THRESH +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 12: TSTCHK, LOGICAL +*> Flag indicating whether or not to test the LAPACK routines. +*> +*> line 13: TSTDRV, LOGICAL +*> Flag indicating whether or not to test the driver routines. +*> +*> line 14: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 15: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 15 was 2: +*> +*> line 16: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 17-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'ZGG' for the generalized +*> eigenvalue problem routines and driver routines. +*> +*>----------------------------------------------------------------------- +*> +*> ZGS and ZGV input files: +*> +*> line 1: 'ZGS' or 'ZGV' in columns 1 to 3. +*> +*> line 2: NN, INTEGER +*> Number of values of N. +*> +*> line 3: NVAL, INTEGER array, dimension(NN) +*> Dimensions of matrices to be tested. +*> +*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> These integer parameters determine how blocking is done +*> (see ILAENV for details) +*> NB : block size +*> NBMIN : minimum block size +*> NX : minimum dimension for blocking +*> NS : number of shifts in xHGEQR +*> NBCOL : minimum column dimension for blocking +*> +*> line 5: THRESH, REAL +*> The test threshold against which computed residuals are +*> compared. 
Should generally be in the range from 10. to 20. +*> If it is 0., all test case data will be printed. +*> +*> line 6: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits. +*> +*> line 7: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 7 was 2: +*> +*> line 8: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 8-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path names are 'ZGS' or 'ZGV' for the generalized +*> eigenvalue problem routines and driver routines. +*> +*>----------------------------------------------------------------------- +*> +*> ZGX input file: +*> line 1: 'ZGX' in columns 1 to 3. +*> +*> line 2: N, INTEGER +*> Value of N. +*> +*> line 3: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> These integer parameters determine how blocking is done +*> (see ILAENV for details) +*> NB : block size +*> NBMIN : minimum block size +*> NX : minimum dimension for blocking +*> NS : number of shifts in xHGEQR +*> NBCOL : minimum column dimension for blocking +*> +*> line 4: THRESH, REAL +*> The test threshold against which computed residuals are +*> compared. Should generally be in the range from 10. to 20. +*> Information will be printed about each test for which the +*> test ratio is greater than or equal to the threshold. +*> +*> line 5: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 6: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 6 was 2: +*> +*> line 7: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> If line 2 was 0: +*> +*> line 7-EOF: Precomputed examples are tested. +*> +*> remaining lines : Each example is stored on 3+2*N*N lines, where N is +*> its dimension. The first line contains the dimension (a +*> single integer). The next line contains an integer k such +*> that only the last k eigenvalues will be selected and appear +*> in the leading diagonal blocks of $A$ and $B$. The next N*N +*> lines contain the matrix A, one element per line. The next N*N +*> lines contain the matrix B. The last line contains the +*> reciprocal of the eigenvalue cluster condition number and the +*> reciprocal of the deflating subspace (associated with the +*> selected eigencluster) condition number. The end of data is +*> indicated by dimension N=0. Even if no data is to be tested, +*> there must be at least one line containing N=0. +*> +*>----------------------------------------------------------------------- +*> +*> ZXV input files: +*> line 1: 'ZXV' in columns 1 to 3. +*> +*> line 2: N, INTEGER +*> Value of N. +*> +*> line 3: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> These integer parameters determine how blocking is done +*> (see ILAENV for details) +*> NB : block size +*> NBMIN : minimum block size +*> NX : minimum dimension for blocking +*> NS : number of shifts in xHGEQR +*> NBCOL : minimum column dimension for blocking +*> +*> line 4: THRESH, REAL +*> The test threshold against which computed residuals are +*> compared.
Should generally be in the range from 10. to 20. +*> Information will be printed about each test for which the +*> test ratio is greater than or equal to the threshold. +*> +*> line 5: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 6: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 6 was 2: +*> +*> line 7: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> If line 2 was 0: +*> +*> line 7-EOF: Precomputed examples are tested. +*> +*> remaining lines : Each example is stored on 3+2*N*N lines, where N is +*> its dimension. The first line contains the dimension (a +*> single integer). The next N*N lines contain the matrix A, one +*> element per line. The next N*N lines contain the matrix B. +*> The next line contains the reciprocals of the eigenvalue +*> condition numbers. The last line contains the reciprocals of +*> the eigenvector condition numbers. The end of data is +*> indicated by dimension N=0. Even if no data is to be tested, +*> there must be at least one line containing N=0. +*> +*>----------------------------------------------------------------------- +*> +*> ZHB input file: +*> +*> line 2: NN, INTEGER +*> Number of values of N. +*> +*> line 3: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix dimension N. +*> +*> line 4: NK, INTEGER +*> Number of values of K. +*> +*> line 5: KVAL, INTEGER array, dimension (NK) +*> The values for the matrix dimension K. +*> +*> line 6: THRESH +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 7: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 7 was 2: +*> +*> line 8: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 8-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'ZHB'. +*> +*>----------------------------------------------------------------------- +*> +*> ZBB input file: +*> +*> line 2: NN, INTEGER +*> Number of values of M and N. +*> +*> line 3: MVAL, INTEGER array, dimension (NN) +*> The values for the matrix row dimension M. +*> +*> line 4: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix column dimension N. +*> +*> line 4: NK, INTEGER +*> Number of values of K. +*> +*> line 5: KVAL, INTEGER array, dimension (NK) +*> The values for the matrix bandwidth K. +*> +*> line 6: NPARMS, INTEGER +*> Number of values of the parameter NRHS +*> +*> line 7: NSVAL, INTEGER array, dimension (NPARMS) +*> The values for the number of right hand sides NRHS. +*> +*> line 8: THRESH +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 9: NEWSD, INTEGER +*> A code indicating how to set the random number seed. 
+*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 9 was 2: +*> +*> line 10: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 10-EOF: Lines specifying matrix types, as for SVD. +*> The 3-character path name is 'ZBB'. +*> +*>----------------------------------------------------------------------- +*> +*> ZEC input file: +*> +*> line 2: THRESH, REAL +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> lines 3-EOF: +*> +*> Input for testing the eigencondition routines consists of a set of +*> specially constructed test cases and their solutions. The data +*> format is not intended to be modified by the user. +*> +*>----------------------------------------------------------------------- +*> +*> ZBL and ZBK input files: +*> +*> line 1: 'ZBL' in columns 1-3 to test ZGEBAL, or 'ZBK' in +*> columns 1-3 to test ZGEBAK. +*> +*> The remaining lines consist of specially constructed test cases. +*> +*>----------------------------------------------------------------------- +*> +*> ZGL and ZGK input files: +*> +*> line 1: 'ZGL' in columns 1-3 to test ZGGBAL, or 'ZGK' in +*> columns 1-3 to test ZGGBAK. +*> +*> The remaining lines consist of specially constructed test cases. +*> +*>----------------------------------------------------------------------- +*> +*> GLM data file: +*> +*> line 1: 'GLM' in columns 1 to 3. +*> +*> line 2: NN, INTEGER +*> Number of values of M, P, and N. +*> +*> line 3: MVAL, INTEGER array, dimension(NN) +*> Values of M (row dimension). +*> +*> line 4: PVAL, INTEGER array, dimension(NN) +*> Values of P (row dimension). +*> +*> line 5: NVAL, INTEGER array, dimension(NN) +*> Values of N (column dimension), note M <= N <= M+P. +*> +*> line 6: THRESH, REAL +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 7: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 8: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 8 was 2: +*> +*> line 9: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 9-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'GLM' for the generalized +*> linear regression model routines. +*> +*>----------------------------------------------------------------------- +*> +*> GQR data file: +*> +*> line 1: 'GQR' in columns 1 to 3. +*> +*> line 2: NN, INTEGER +*> Number of values of M, P, and N. +*> +*> line 3: MVAL, INTEGER array, dimension(NN) +*> Values of M. +*> +*> line 4: PVAL, INTEGER array, dimension(NN) +*> Values of P. +*> +*> line 5: NVAL, INTEGER array, dimension(NN) +*> Values of N. +*> +*> line 6: THRESH, REAL +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold.
+*> +*> line 7: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 8: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 8 was 2: +*> +*> line 9: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 9-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'GQR' for the generalized +*> QR and RQ routines. +*> +*>----------------------------------------------------------------------- +*> +*> GSV data file: +*> +*> line 1: 'GSV' in columns 1 to 3. +*> +*> line 2: NN, INTEGER +*> Number of values of M, P, and N. +*> +*> line 3: MVAL, INTEGER array, dimension(NN) +*> Values of M (row dimension). +*> +*> line 4: PVAL, INTEGER array, dimension(NN) +*> Values of P (row dimension). +*> +*> line 5: NVAL, INTEGER array, dimension(NN) +*> Values of N (column dimension). +*> +*> line 6: THRESH, REAL +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 7: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 8: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 8 was 2: +*> +*> line 9: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 9-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'GSV' for the generalized +*> SVD routines. +*> +*>----------------------------------------------------------------------- +*> +*> CSD data file: +*> +*> line 1: 'CSD' in columns 1 to 3. +*> +*> line 2: NM, INTEGER +*> Number of values of M, P, and N. +*> +*> line 3: MVAL, INTEGER array, dimension(NM) +*> Values of M (row and column dimension of orthogonal matrix). +*> +*> line 4: PVAL, INTEGER array, dimension(NM) +*> Values of P (row dimension of top-left block). +*> +*> line 5: NVAL, INTEGER array, dimension(NM) +*> Values of N (column dimension of top-left block). +*> +*> line 6: THRESH, REAL +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 7: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 8: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 8 was 2: +*> +*> line 9: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 9-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'CSD' for the CSD routine. 
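+*>
+*> For illustration only (the values below are invented for this note
+*> and are not taken from the distributed csd.in), a minimal CSD input
+*> consistent with the description above, and with the common M/P/N
+*> layout shared by the GLM, GQR, GSV, CSD, and LSE data files, could
+*> look like:
+*>
+*>    CSD
+*>    3                    number of values of M, P, and N
+*>    8 10 12              values of M
+*>    4  5  6              values of P (P <= M)
+*>    4  5  6              values of N (N <= M)
+*>    20.0                 threshold for the test ratios
+*>    T                    test the error exits
+*>    0                    NEWSD: reset the seed before each run
+*>    CSD 3                path name and number of matrix types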
+*>
+*>-----------------------------------------------------------------------
+*>
+*> LSE data file:
+*>
+*> line 1:  'LSE' in columns 1 to 3.
+*>
+*> line 2:  NN, INTEGER
+*>          Number of values of M, P, and N.
+*>
+*> line 3:  MVAL, INTEGER array, dimension(NN)
+*>          Values of M.
+*>
+*> line 4:  PVAL, INTEGER array, dimension(NN)
+*>          Values of P.
+*>
+*> line 5:  NVAL, INTEGER array, dimension(NN)
+*>          Values of N, note P <= N <= P+M.
+*>
+*> line 6:  THRESH, REAL
+*>          Threshold value for the test ratios.  Information will be
+*>          printed about each test for which the test ratio is greater
+*>          than or equal to the threshold.
+*>
+*> line 7:  TSTERR, LOGICAL
+*>          Flag indicating whether or not to test the error exits for
+*>          the LAPACK routines and driver routines.
+*>
+*> line 8:  NEWSD, INTEGER
+*>          A code indicating how to set the random number seed.
+*>          = 0:  Set the seed to a default value before each run
+*>          = 1:  Initialize the seed to a default value only before the
+*>                first run
+*>          = 2:  Like 1, but use the seed values on the next line
+*>
+*> If line 8 was 2:
+*>
+*> line 9:  INTEGER array, dimension (4)
+*>          Four integer values for the random number seed.
+*>
+*> lines 9-EOF:  Lines specifying matrix types, as for NEP.
+*>          The 3-character path name is 'LSE' for the constrained
+*>          linear least squares routines.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> NMAX is currently set to 132 and must be at least 12 for some of the
+*> precomputed examples, and LWORK = NMAX*(5*NMAX+20) in the parameter
+*> statements below.  For SVD, we assume NRHS may be as big as N.  The
+*> parameter NEED is set to 14 to allow for 14 N-by-N matrices for ZGG.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup complex16_eig
+*
+*  =====================================================================
+      PROGRAM ZCHKEE
+*
+#if defined(_OPENMP)
+      use omp_lib
+#endif
+*
+*  -- LAPACK test routine (version 3.7.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2016
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NMAX
+      PARAMETER          ( NMAX = 132 )
+      INTEGER            NCMAX
+      PARAMETER          ( NCMAX = 20 )
+      INTEGER            NEED
+      PARAMETER          ( NEED = 14 )
+      INTEGER            LWORK
+      PARAMETER          ( LWORK = NMAX*( 5*NMAX+20 ) )
+      INTEGER            LIWORK
+      PARAMETER          ( LIWORK = NMAX*( NMAX+20 ) )
+      INTEGER            MAXIN
+      PARAMETER          ( MAXIN = 20 )
+      INTEGER            MAXT
+      PARAMETER          ( MAXT = 30 )
+      INTEGER            NIN, NOUT
+      PARAMETER          ( NIN = 5, NOUT = 6 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            ZBK, ZBL, ZES, ZEV, ZGK, ZGL, ZGS, ZGV, ZGX,
+     $                   ZSX, ZVX, ZXV, CSD, FATAL, GLM, GQR, GSV, LSE,
+     $                   NEP, SEP, SVD, TSTCHK, TSTDIF, TSTDRV, TSTERR,
+     $                   ZBB, ZGG, ZHB
+      CHARACTER          C1
+      CHARACTER*3        C3, PATH
+      CHARACTER*32       VNAME
+      CHARACTER*10       INTSTR
+      CHARACTER*80       LINE
+      INTEGER            I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD,
+     $                   NK, NN, NPARMS, NRHS, NTYPES,
+     $                   VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS
+      DOUBLE PRECISION   EPS, S1, S2, THRESH, THRSHN
+*     ..
+*     .. Local Arrays ..
+ LOGICAL DOTYPE( MAXT ), LOGWRK( NMAX ) + INTEGER IOLDSD( 4 ), ISEED( 4 ), IWORK( LIWORK ), + $ KVAL( MAXIN ), MVAL( MAXIN ), MXBVAL( MAXIN ), + $ NBCOL( MAXIN ), NBMIN( MAXIN ), NBVAL( MAXIN ), + $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), + $ PVAL( MAXIN ) + INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), + $ ISHFTS( MAXIN ), IACC22( MAXIN ) + DOUBLE PRECISION ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), + $ RESULT( 500 ) + COMPLEX*16 DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ), + $ X( 5*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK, S + COMPLEX*16, DIMENSION(:), ALLOCATABLE :: WORK + COMPLEX*16, DIMENSION(:,:), ALLOCATABLE :: A, B, C +* .. +* .. External Functions .. + LOGICAL LSAMEN + DOUBLE PRECISION DLAMCH, DSECND + EXTERNAL LSAMEN, DLAMCH, DSECND +* .. +* .. External Subroutines .. + EXTERNAL ALAREQ, XLAENV, ZCHKBB, ZCHKBD, ZCHKBK, ZCHKBL, + $ ZCHKEC, ZCHKGG, ZCHKGK, ZCHKGL, ZCHKHB, ZCHKHS, + $ ZCHKST, ZCKCSD, ZCKGLM, ZCKGQR, ZCKGSV, ZCKLSE, + $ ZDRGES, ZDRGEV, ZDRGSX, ZDRGVX, ZDRVBD, ZDRVES, + $ ZDRVEV, ZDRVSG, ZDRVST, ZDRVSX, ZDRVVX, + $ ZERRBD, ZERRED, ZERRGG, ZERRHS, ZERRST, ILAVER, + $ ZDRGES3, ZDRGEV3, + $ ZCHKST2STG, ZDRVST2STG, ZCHKHB2STG +* .. +* .. Intrinsic Functions .. + INTRINSIC LEN, MIN +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, MAXB, NPROC, NSHIFT, NUNIT, SELDIM, + $ SELOPT +* .. +* .. Arrays in Common .. + LOGICAL SELVAL( 20 ) + INTEGER IPARMS( 100 ) + DOUBLE PRECISION SELWI( 20 ), SELWR( 20 ) +* .. +* .. Common blocks .. + COMMON / CENVIR / NPROC, NSHIFT, MAXB + COMMON / INFOC / INFOT, NUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT + COMMON / SSLCT / SELOPT, SELDIM, SELVAL, SELWR, SELWI + COMMON / CLAENV / IPARMS +* .. +* .. Data statements .. + DATA INTSTR / '0123456789' / + DATA IOLDSD / 0, 0, 0, 1 / +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. +* .. Executable Statements .. +* + A = 0.0 + B = 0.0 + C = 0.0 + DC = 0.0 + S1 = DSECND( ) + FATAL = .FALSE. + NUNIT = NOUT +* +* Return to here to read multiple sets of data +* + 10 CONTINUE +* +* Read the first line and set the 3-character test path +* + READ( NIN, FMT = '(A80)', END = 380 )LINE + PATH = LINE( 1: 3 ) + NEP = LSAMEN( 3, PATH, 'NEP' ) .OR. LSAMEN( 3, PATH, 'ZHS' ) + SEP = LSAMEN( 3, PATH, 'SEP' ) .OR. LSAMEN( 3, PATH, 'ZST' ) .OR. + $ LSAMEN( 3, PATH, 'ZSG' ) .OR. LSAMEN( 3, PATH, 'SE2' ) + SVD = LSAMEN( 3, PATH, 'SVD' ) .OR. 
LSAMEN( 3, PATH, 'ZBD' ) + ZEV = LSAMEN( 3, PATH, 'ZEV' ) + ZES = LSAMEN( 3, PATH, 'ZES' ) + ZVX = LSAMEN( 3, PATH, 'ZVX' ) + ZSX = LSAMEN( 3, PATH, 'ZSX' ) + ZGG = LSAMEN( 3, PATH, 'ZGG' ) + ZGS = LSAMEN( 3, PATH, 'ZGS' ) + ZGX = LSAMEN( 3, PATH, 'ZGX' ) + ZGV = LSAMEN( 3, PATH, 'ZGV' ) + ZXV = LSAMEN( 3, PATH, 'ZXV' ) + ZHB = LSAMEN( 3, PATH, 'ZHB' ) + ZBB = LSAMEN( 3, PATH, 'ZBB' ) + GLM = LSAMEN( 3, PATH, 'GLM' ) + GQR = LSAMEN( 3, PATH, 'GQR' ) .OR. LSAMEN( 3, PATH, 'GRQ' ) + GSV = LSAMEN( 3, PATH, 'GSV' ) + CSD = LSAMEN( 3, PATH, 'CSD' ) + LSE = LSAMEN( 3, PATH, 'LSE' ) + ZBL = LSAMEN( 3, PATH, 'ZBL' ) + ZBK = LSAMEN( 3, PATH, 'ZBK' ) + ZGL = LSAMEN( 3, PATH, 'ZGL' ) + ZGK = LSAMEN( 3, PATH, 'ZGK' ) +* +* Report values of parameters. +* + IF( PATH.EQ.' ' ) THEN + GO TO 10 + ELSE IF( NEP ) THEN + WRITE( NOUT, FMT = 9987 ) + ELSE IF( SEP ) THEN + WRITE( NOUT, FMT = 9986 ) + ELSE IF( SVD ) THEN + WRITE( NOUT, FMT = 9985 ) + ELSE IF( ZEV ) THEN + WRITE( NOUT, FMT = 9979 ) + ELSE IF( ZES ) THEN + WRITE( NOUT, FMT = 9978 ) + ELSE IF( ZVX ) THEN + WRITE( NOUT, FMT = 9977 ) + ELSE IF( ZSX ) THEN + WRITE( NOUT, FMT = 9976 ) + ELSE IF( ZGG ) THEN + WRITE( NOUT, FMT = 9975 ) + ELSE IF( ZGS ) THEN + WRITE( NOUT, FMT = 9964 ) + ELSE IF( ZGX ) THEN + WRITE( NOUT, FMT = 9965 ) + ELSE IF( ZGV ) THEN + WRITE( NOUT, FMT = 9963 ) + ELSE IF( ZXV ) THEN + WRITE( NOUT, FMT = 9962 ) + ELSE IF( ZHB ) THEN + WRITE( NOUT, FMT = 9974 ) + ELSE IF( ZBB ) THEN + WRITE( NOUT, FMT = 9967 ) + ELSE IF( GLM ) THEN + WRITE( NOUT, FMT = 9971 ) + ELSE IF( GQR ) THEN + WRITE( NOUT, FMT = 9970 ) + ELSE IF( GSV ) THEN + WRITE( NOUT, FMT = 9969 ) + ELSE IF( CSD ) THEN + WRITE( NOUT, FMT = 9960 ) + ELSE IF( LSE ) THEN + WRITE( NOUT, FMT = 9968 ) + ELSE IF( ZBL ) THEN +* +* ZGEBAL: Balancing +* + CALL ZCHKBL( NIN, NOUT ) + GO TO 380 + ELSE IF( ZBK ) THEN +* +* ZGEBAK: Back transformation +* + CALL ZCHKBK( NIN, NOUT ) + GO TO 380 + ELSE IF( ZGL ) THEN +* +* ZGGBAL: Balancing +* + CALL ZCHKGL( NIN, NOUT ) + GO TO 380 + ELSE IF( ZGK ) THEN +* +* ZGGBAK: Back transformation +* + CALL ZCHKGK( NIN, NOUT ) + GO TO 380 + ELSE IF( LSAMEN( 3, PATH, 'ZEC' ) ) THEN +* +* ZEC: Eigencondition estimation +* + READ( NIN, FMT = * )THRESH + CALL XLAENV( 1, 1 ) + CALL XLAENV( 12, 1 ) + TSTERR = .TRUE. + CALL ZCHKEC( THRESH, TSTERR, NIN, NOUT ) + GO TO 380 + ELSE + WRITE( NOUT, FMT = 9992 )PATH + GO TO 380 + END IF + CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) + WRITE( NOUT, FMT = 9972 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH + WRITE( NOUT, FMT = 9984 ) +* +* Read the number of values of M, P, and N. +* + READ( NIN, FMT = * )NN + IF( NN.LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' NN ', NN, 1 + NN = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9988 )' NN ', NN, MAXIN + NN = 0 + FATAL = .TRUE. + END IF +* +* Read the values of M +* + IF( .NOT.( ZGX .OR. ZXV ) ) THEN + READ( NIN, FMT = * )( MVAL( I ), I = 1, NN ) + IF( SVD ) THEN + VNAME = ' M ' + ELSE + VNAME = ' N ' + END IF + DO 20 I = 1, NN + IF( MVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )VNAME, MVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( MVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )VNAME, MVAL( I ), NMAX + FATAL = .TRUE. + END IF + 20 CONTINUE + WRITE( NOUT, FMT = 9983 )'M: ', ( MVAL( I ), I = 1, NN ) + END IF +* +* Read the values of P +* + IF( GLM .OR. GQR .OR. GSV .OR. CSD .OR. LSE ) THEN + READ( NIN, FMT = * )( PVAL( I ), I = 1, NN ) + DO 30 I = 1, NN + IF( PVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' P ', PVAL( I ), 0 + FATAL = .TRUE. 
+ ELSE IF( PVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' P ', PVAL( I ), NMAX + FATAL = .TRUE. + END IF + 30 CONTINUE + WRITE( NOUT, FMT = 9983 )'P: ', ( PVAL( I ), I = 1, NN ) + END IF +* +* Read the values of N +* + IF( SVD .OR. ZBB .OR. GLM .OR. GQR .OR. GSV .OR. CSD .OR. + $ LSE ) THEN + READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) + DO 40 I = 1, NN + IF( NVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' N ', NVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' N ', NVAL( I ), NMAX + FATAL = .TRUE. + END IF + 40 CONTINUE + ELSE + DO 50 I = 1, NN + NVAL( I ) = MVAL( I ) + 50 CONTINUE + END IF + IF( .NOT.( ZGX .OR. ZXV ) ) THEN + WRITE( NOUT, FMT = 9983 )'N: ', ( NVAL( I ), I = 1, NN ) + ELSE + WRITE( NOUT, FMT = 9983 )'N: ', NN + END IF +* +* Read the number of values of K, followed by the values of K +* + IF( ZHB .OR. ZBB ) THEN + READ( NIN, FMT = * )NK + READ( NIN, FMT = * )( KVAL( I ), I = 1, NK ) + DO 60 I = 1, NK + IF( KVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' K ', KVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( KVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' K ', KVAL( I ), NMAX + FATAL = .TRUE. + END IF + 60 CONTINUE + WRITE( NOUT, FMT = 9983 )'K: ', ( KVAL( I ), I = 1, NK ) + END IF +* + IF( ZEV .OR. ZES .OR. ZVX .OR. ZSX ) THEN +* +* For the nonsymmetric QR driver routines, only one set of +* parameters is allowed. +* + READ( NIN, FMT = * )NBVAL( 1 ), NBMIN( 1 ), NXVAL( 1 ), + $ INMIN( 1 ), INWIN( 1 ), INIBL(1), ISHFTS(1), IACC22(1) + IF( NBVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( 1 ), 1 + FATAL = .TRUE. + ELSE IF( NBMIN( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( 1 ), 1 + FATAL = .TRUE. + ELSE IF( NXVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( 1 ), 1 + FATAL = .TRUE. + ELSE IF( INMIN( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' INMIN ', INMIN( 1 ), 1 + FATAL = .TRUE. + ELSE IF( INWIN( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' INWIN ', INWIN( 1 ), 1 + FATAL = .TRUE. + ELSE IF( INIBL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' INIBL ', INIBL( 1 ), 1 + FATAL = .TRUE. + ELSE IF( ISHFTS( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' ISHFTS ', ISHFTS( 1 ), 1 + FATAL = .TRUE. + ELSE IF( IACC22( 1 ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' IACC22 ', IACC22( 1 ), 0 + FATAL = .TRUE. + END IF + CALL XLAENV( 1, NBVAL( 1 ) ) + CALL XLAENV( 2, NBMIN( 1 ) ) + CALL XLAENV( 3, NXVAL( 1 ) ) + CALL XLAENV(12, MAX( 11, INMIN( 1 ) ) ) + CALL XLAENV(13, INWIN( 1 ) ) + CALL XLAENV(14, INIBL( 1 ) ) + CALL XLAENV(15, ISHFTS( 1 ) ) + CALL XLAENV(16, IACC22( 1 ) ) + WRITE( NOUT, FMT = 9983 )'NB: ', NBVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'NBMIN:', NBMIN( 1 ) + WRITE( NOUT, FMT = 9983 )'NX: ', NXVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'INMIN: ', INMIN( 1 ) + WRITE( NOUT, FMT = 9983 )'INWIN: ', INWIN( 1 ) + WRITE( NOUT, FMT = 9983 )'INIBL: ', INIBL( 1 ) + WRITE( NOUT, FMT = 9983 )'ISHFTS: ', ISHFTS( 1 ) + WRITE( NOUT, FMT = 9983 )'IACC22: ', IACC22( 1 ) +* + ELSE IF( ZGS .OR. ZGX .OR. ZGV .OR. ZXV ) THEN +* +* For the nonsymmetric generalized driver routines, only one set of +* parameters is allowed. +* + READ( NIN, FMT = * )NBVAL( 1 ), NBMIN( 1 ), NXVAL( 1 ), + $ NSVAL( 1 ), MXBVAL( 1 ) + IF( NBVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( 1 ), 1 + FATAL = .TRUE. + ELSE IF( NBMIN( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( 1 ), 1 + FATAL = .TRUE. + ELSE IF( NXVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( 1 ), 1 + FATAL = .TRUE. 
+ ELSE IF( NSVAL( 1 ).LT.2 ) THEN + WRITE( NOUT, FMT = 9989 )' NS ', NSVAL( 1 ), 2 + FATAL = .TRUE. + ELSE IF( MXBVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' MAXB ', MXBVAL( 1 ), 1 + FATAL = .TRUE. + END IF + CALL XLAENV( 1, NBVAL( 1 ) ) + CALL XLAENV( 2, NBMIN( 1 ) ) + CALL XLAENV( 3, NXVAL( 1 ) ) + CALL XLAENV( 4, NSVAL( 1 ) ) + CALL XLAENV( 8, MXBVAL( 1 ) ) + WRITE( NOUT, FMT = 9983 )'NB: ', NBVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'NBMIN:', NBMIN( 1 ) + WRITE( NOUT, FMT = 9983 )'NX: ', NXVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'NS: ', NSVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'MAXB: ', MXBVAL( 1 ) + ELSE IF( .NOT.ZHB .AND. .NOT.GLM .AND. .NOT.GQR .AND. .NOT. + $ GSV .AND. .NOT.CSD .AND. .NOT.LSE ) THEN +* +* For the other paths, the number of parameters can be varied +* from the input file. Read the number of parameter values. +* + READ( NIN, FMT = * )NPARMS + IF( NPARMS.LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )'NPARMS', NPARMS, 1 + NPARMS = 0 + FATAL = .TRUE. + ELSE IF( NPARMS.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9988 )'NPARMS', NPARMS, MAXIN + NPARMS = 0 + FATAL = .TRUE. + END IF +* +* Read the values of NB +* + IF( .NOT.ZBB ) THEN + READ( NIN, FMT = * )( NBVAL( I ), I = 1, NPARMS ) + DO 70 I = 1, NPARMS + IF( NBVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NBVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' NB ', NBVAL( I ), NMAX + FATAL = .TRUE. + END IF + 70 CONTINUE + WRITE( NOUT, FMT = 9983 )'NB: ', + $ ( NBVAL( I ), I = 1, NPARMS ) + END IF +* +* Read the values of NBMIN +* + IF( NEP .OR. SEP .OR. SVD .OR. ZGG ) THEN + READ( NIN, FMT = * )( NBMIN( I ), I = 1, NPARMS ) + DO 80 I = 1, NPARMS + IF( NBMIN( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( I ), 0 + FATAL = .TRUE. + ELSE IF( NBMIN( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )'NBMIN ', NBMIN( I ), NMAX + FATAL = .TRUE. + END IF + 80 CONTINUE + WRITE( NOUT, FMT = 9983 )'NBMIN:', + $ ( NBMIN( I ), I = 1, NPARMS ) + ELSE + DO 90 I = 1, NPARMS + NBMIN( I ) = 1 + 90 CONTINUE + END IF +* +* Read the values of NX +* + IF( NEP .OR. SEP .OR. SVD ) THEN + READ( NIN, FMT = * )( NXVAL( I ), I = 1, NPARMS ) + DO 100 I = 1, NPARMS + IF( NXVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NXVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' NX ', NXVAL( I ), NMAX + FATAL = .TRUE. + END IF + 100 CONTINUE + WRITE( NOUT, FMT = 9983 )'NX: ', + $ ( NXVAL( I ), I = 1, NPARMS ) + ELSE + DO 110 I = 1, NPARMS + NXVAL( I ) = 1 + 110 CONTINUE + END IF +* +* Read the values of NSHIFT (if ZGG) or NRHS (if SVD +* or ZBB). +* + IF( SVD .OR. ZBB .OR. ZGG ) THEN + READ( NIN, FMT = * )( NSVAL( I ), I = 1, NPARMS ) + DO 120 I = 1, NPARMS + IF( NSVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' NS ', NSVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NSVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' NS ', NSVAL( I ), NMAX + FATAL = .TRUE. + END IF + 120 CONTINUE + WRITE( NOUT, FMT = 9983 )'NS: ', + $ ( NSVAL( I ), I = 1, NPARMS ) + ELSE + DO 130 I = 1, NPARMS + NSVAL( I ) = 1 + 130 CONTINUE + END IF +* +* Read the values for MAXB. +* + IF( ZGG ) THEN + READ( NIN, FMT = * )( MXBVAL( I ), I = 1, NPARMS ) + DO 140 I = 1, NPARMS + IF( MXBVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' MAXB ', MXBVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( MXBVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' MAXB ', MXBVAL( I ), NMAX + FATAL = .TRUE. 
+ END IF + 140 CONTINUE + WRITE( NOUT, FMT = 9983 )'MAXB: ', + $ ( MXBVAL( I ), I = 1, NPARMS ) + ELSE + DO 150 I = 1, NPARMS + MXBVAL( I ) = 1 + 150 CONTINUE + END IF +* +* Read the values for INMIN. +* + IF( NEP ) THEN + READ( NIN, FMT = * )( INMIN( I ), I = 1, NPARMS ) + DO 540 I = 1, NPARMS + IF( INMIN( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' INMIN ', INMIN( I ), 0 + FATAL = .TRUE. + END IF + 540 CONTINUE + WRITE( NOUT, FMT = 9983 )'INMIN: ', + $ ( INMIN( I ), I = 1, NPARMS ) + ELSE + DO 550 I = 1, NPARMS + INMIN( I ) = 1 + 550 CONTINUE + END IF +* +* Read the values for INWIN. +* + IF( NEP ) THEN + READ( NIN, FMT = * )( INWIN( I ), I = 1, NPARMS ) + DO 560 I = 1, NPARMS + IF( INWIN( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' INWIN ', INWIN( I ), 0 + FATAL = .TRUE. + END IF + 560 CONTINUE + WRITE( NOUT, FMT = 9983 )'INWIN: ', + $ ( INWIN( I ), I = 1, NPARMS ) + ELSE + DO 570 I = 1, NPARMS + INWIN( I ) = 1 + 570 CONTINUE + END IF +* +* Read the values for INIBL. +* + IF( NEP ) THEN + READ( NIN, FMT = * )( INIBL( I ), I = 1, NPARMS ) + DO 580 I = 1, NPARMS + IF( INIBL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' INIBL ', INIBL( I ), 0 + FATAL = .TRUE. + END IF + 580 CONTINUE + WRITE( NOUT, FMT = 9983 )'INIBL: ', + $ ( INIBL( I ), I = 1, NPARMS ) + ELSE + DO 590 I = 1, NPARMS + INIBL( I ) = 1 + 590 CONTINUE + END IF +* +* Read the values for ISHFTS. +* + IF( NEP ) THEN + READ( NIN, FMT = * )( ISHFTS( I ), I = 1, NPARMS ) + DO 600 I = 1, NPARMS + IF( ISHFTS( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' ISHFTS ', ISHFTS( I ), 0 + FATAL = .TRUE. + END IF + 600 CONTINUE + WRITE( NOUT, FMT = 9983 )'ISHFTS: ', + $ ( ISHFTS( I ), I = 1, NPARMS ) + ELSE + DO 610 I = 1, NPARMS + ISHFTS( I ) = 1 + 610 CONTINUE + END IF +* +* Read the values for IACC22. +* + IF( NEP .OR. ZGG ) THEN + READ( NIN, FMT = * )( IACC22( I ), I = 1, NPARMS ) + DO 620 I = 1, NPARMS + IF( IACC22( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' IACC22 ', IACC22( I ), 0 + FATAL = .TRUE. + END IF + 620 CONTINUE + WRITE( NOUT, FMT = 9983 )'IACC22: ', + $ ( IACC22( I ), I = 1, NPARMS ) + ELSE + DO 630 I = 1, NPARMS + IACC22( I ) = 1 + 630 CONTINUE + END IF +* +* Read the values for NBCOL. +* + IF( ZGG ) THEN + READ( NIN, FMT = * )( NBCOL( I ), I = 1, NPARMS ) + DO 160 I = 1, NPARMS + IF( NBCOL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )'NBCOL ', NBCOL( I ), 0 + FATAL = .TRUE. + ELSE IF( NBCOL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )'NBCOL ', NBCOL( I ), NMAX + FATAL = .TRUE. + END IF + 160 CONTINUE + WRITE( NOUT, FMT = 9983 )'NBCOL:', + $ ( NBCOL( I ), I = 1, NPARMS ) + ELSE + DO 170 I = 1, NPARMS + NBCOL( I ) = 1 + 170 CONTINUE + END IF + END IF +* +* Calculate and print the machine dependent constants. +* + WRITE( NOUT, FMT = * ) + EPS = DLAMCH( 'Underflow threshold' ) + WRITE( NOUT, FMT = 9981 )'underflow', EPS + EPS = DLAMCH( 'Overflow threshold' ) + WRITE( NOUT, FMT = 9981 )'overflow ', EPS + EPS = DLAMCH( 'Epsilon' ) + WRITE( NOUT, FMT = 9981 )'precision', EPS +* +* Read the threshold value for the test ratios. +* + READ( NIN, FMT = * )THRESH + WRITE( NOUT, FMT = 9982 )THRESH + IF( SEP .OR. SVD .OR. ZGG ) THEN +* +* Read the flag that indicates whether to test LAPACK routines. +* + READ( NIN, FMT = * )TSTCHK +* +* Read the flag that indicates whether to test driver routines. +* + READ( NIN, FMT = * )TSTDRV + END IF +* +* Read the flag that indicates whether to test the error exits. +* + READ( NIN, FMT = * )TSTERR +* +* Read the code describing how to set the random number seed. 
+* + READ( NIN, FMT = * )NEWSD +* +* If NEWSD = 2, read another line with 4 integers for the seed. +* + IF( NEWSD.EQ.2 ) + $ READ( NIN, FMT = * )( IOLDSD( I ), I = 1, 4 ) +* + DO 180 I = 1, 4 + ISEED( I ) = IOLDSD( I ) + 180 CONTINUE +* + IF( FATAL ) THEN + WRITE( NOUT, FMT = 9999 ) + STOP + END IF +* +* Read the input lines indicating the test path and its parameters. +* The first three characters indicate the test path, and the number +* of test matrix types must be the first nonblank item in columns +* 4-80. +* + 190 CONTINUE +* + IF( .NOT.( ZGX .OR. ZXV ) ) THEN +* + 200 CONTINUE + READ( NIN, FMT = '(A80)', END = 380 )LINE + C3 = LINE( 1: 3 ) + LENP = LEN( LINE ) + I = 3 + ITMP = 0 + I1 = 0 + 210 CONTINUE + I = I + 1 + IF( I.GT.LENP ) THEN + IF( I1.GT.0 ) THEN + GO TO 240 + ELSE + NTYPES = MAXT + GO TO 240 + END IF + END IF + IF( LINE( I: I ).NE.' ' .AND. LINE( I: I ).NE.',' ) THEN + I1 = I + C1 = LINE( I1: I1 ) +* +* Check that a valid integer was read +* + DO 220 K = 1, 10 + IF( C1.EQ.INTSTR( K: K ) ) THEN + IC = K - 1 + GO TO 230 + END IF + 220 CONTINUE + WRITE( NOUT, FMT = 9991 )I, LINE + GO TO 200 + 230 CONTINUE + ITMP = 10*ITMP + IC + GO TO 210 + ELSE IF( I1.GT.0 ) THEN + GO TO 240 + ELSE + GO TO 210 + END IF + 240 CONTINUE + NTYPES = ITMP +* +* Skip the tests if NTYPES is <= 0. +* + IF( .NOT.( ZEV .OR. ZES .OR. ZVX .OR. ZSX .OR. ZGV .OR. + $ ZGS ) .AND. NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + GO TO 200 + END IF +* + ELSE + IF( ZGX ) + $ C3 = 'ZGX' + IF( ZXV ) + $ C3 = 'ZXV' + END IF +* +* Reset the random number seed. +* + IF( NEWSD.EQ.0 ) THEN + DO 250 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 250 CONTINUE + END IF +* + IF( LSAMEN( 3, C3, 'ZHS' ) .OR. LSAMEN( 3, C3, 'NEP' ) ) THEN +* +* ------------------------------------- +* NEP: Nonsymmetric Eigenvalue Problem +* ------------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NX = crossover point +* NS = number of shifts +* MAXB = minimum submatrix size +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 1, 1 ) + IF( TSTERR ) + $ CALL ZERRHS( 'ZHSEQR', NOUT ) + DO 270 I = 1, NPARMS + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 3, NXVAL( I ) ) + CALL XLAENV(12, MAX( 11, INMIN( I ) ) ) + CALL XLAENV(13, INWIN( I ) ) + CALL XLAENV(14, INIBL( I ) ) + CALL XLAENV(15, ISHFTS( I ) ) + CALL XLAENV(16, IACC22( I ) ) +* + IF( NEWSD.EQ.0 ) THEN + DO 260 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 260 CONTINUE + END IF + WRITE( NOUT, FMT = 9961 )C3, NBVAL( I ), NBMIN( I ), + $ NXVAL( I ), MAX( 11, INMIN(I)), + $ INWIN( I ), INIBL( I ), ISHFTS( I ), IACC22( I ) + CALL ZCHKHS( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 5 ), NMAX, A( 1, 6 ), + $ A( 1, 7 ), DC( 1, 1 ), DC( 1, 2 ), A( 1, 8 ), + $ A( 1, 9 ), A( 1, 10 ), A( 1, 11 ), A( 1, 12 ), + $ DC( 1, 3 ), WORK, LWORK, RWORK, IWORK, LOGWRK, + $ RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZCHKHS', INFO + 270 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'ZST' ) .OR. LSAMEN( 3, C3, 'SEP' ) + $ .OR. 
LSAMEN( 3, C3, 'SE2' ) ) THEN +* +* ---------------------------------- +* SEP: Symmetric Eigenvalue Problem +* ---------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NX = crossover point +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 1, 1 ) + CALL XLAENV( 9, 25 ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL ZERRST( 'ZST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF + DO 290 I = 1, NPARMS + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 3, NXVAL( I ) ) +* + IF( NEWSD.EQ.0 ) THEN + DO 280 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 280 CONTINUE + END IF + WRITE( NOUT, FMT = 9997 )C3, NBVAL( I ), NBMIN( I ), + $ NXVAL( I ) + IF( TSTCHK ) THEN + IF( LSAMEN( 3, C3, 'SE2' ) ) THEN + CALL ZCHKST2STG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), + $ DR( 1, 1 ), DR( 1, 2 ), DR( 1, 3 ), + $ DR( 1, 4 ), DR( 1, 5 ), DR( 1, 6 ), + $ DR( 1, 7 ), DR( 1, 8 ), DR( 1, 9 ), + $ DR( 1, 10 ), DR( 1, 11 ), A( 1, 3 ), NMAX, + $ A( 1, 4 ), A( 1, 5 ), DC( 1, 1 ), A( 1, 6 ), + $ WORK, LWORK, RWORK, LWORK, IWORK, LIWORK, + $ RESULT, INFO ) + ELSE + CALL ZCHKST( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), + $ DR( 1, 1 ), DR( 1, 2 ), DR( 1, 3 ), + $ DR( 1, 4 ), DR( 1, 5 ), DR( 1, 6 ), + $ DR( 1, 7 ), DR( 1, 8 ), DR( 1, 9 ), + $ DR( 1, 10 ), DR( 1, 11 ), A( 1, 3 ), NMAX, + $ A( 1, 4 ), A( 1, 5 ), DC( 1, 1 ), A( 1, 6 ), + $ WORK, LWORK, RWORK, LWORK, IWORK, LIWORK, + $ RESULT, INFO ) + ENDIF + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZCHKST', INFO + END IF + IF( TSTDRV ) THEN + IF( LSAMEN( 3, C3, 'SE2' ) ) THEN + CALL ZDRVST2STG( NN, NVAL, 18, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 1 ), NMAX, DR( 1, 3 ), DR( 1, 4 ), + $ DR( 1, 5 ), DR( 1, 8 ), DR( 1, 9 ), + $ DR( 1, 10 ), A( 1, 2 ), NMAX, A( 1, 3 ), + $ DC( 1, 1 ), A( 1, 4 ), WORK, LWORK, RWORK, + $ LWORK, IWORK, LIWORK, RESULT, INFO ) + ELSE + CALL ZDRVST( NN, NVAL, 18, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, DR( 1, 3 ), DR( 1, 4 ), + $ DR( 1, 5 ), DR( 1, 8 ), DR( 1, 9 ), + $ DR( 1, 10 ), A( 1, 2 ), NMAX, A( 1, 3 ), + $ DC( 1, 1 ), A( 1, 4 ), WORK, LWORK, RWORK, + $ LWORK, IWORK, LIWORK, RESULT, INFO ) + ENDIF + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZDRVST', INFO + END IF + 290 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'ZSG' ) ) THEN +* +* ---------------------------------------------- +* ZSG: Hermitian Generalized Eigenvalue Problem +* ---------------------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NX = crossover point +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 9, 25 ) + DO 310 I = 1, NPARMS + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 3, NXVAL( I ) ) +* + IF( NEWSD.EQ.0 ) THEN + DO 300 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 300 CONTINUE + END IF + WRITE( NOUT, FMT = 9997 )C3, NBVAL( I ), NBMIN( I ), + $ NXVAL( I ) + IF( TSTCHK ) THEN +* CALL ZDRVSG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, +* $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, +* $ DR( 1, 3 ), A( 1, 3 ), NMAX, A( 1, 4 ), +* $ A( 1, 5 ), A( 1, 6 ), A( 1, 7 ), WORK, +* $ LWORK, RWORK, LWORK, IWORK, LIWORK, RESULT, +* $ INFO ) + CALL ZDRVSG2STG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 
1 ), NMAX, A( 1, 2 ), NMAX, + $ DR( 1, 3 ), DR( 1, 4 ), A( 1, 3 ), NMAX, + $ A( 1, 4 ), A( 1, 5 ), A( 1, 6 ), + $ A( 1, 7 ), WORK, LWORK, RWORK, LWORK, + $ IWORK, LIWORK, RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZDRVSG', INFO + END IF + 310 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'ZBD' ) .OR. LSAMEN( 3, C3, 'SVD' ) ) THEN +* +* ---------------------------------- +* SVD: Singular Value Decomposition +* ---------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NX = crossover point +* NRHS = number of right hand sides +* + MAXTYP = 16 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 9, 25 ) +* +* Test the error exits +* + CALL XLAENV( 1, 1 ) + IF( TSTERR .AND. TSTCHK ) + $ CALL ZERRBD( 'ZBD', NOUT ) + IF( TSTERR .AND. TSTDRV ) + $ CALL ZERRED( 'ZBD', NOUT ) +* + DO 330 I = 1, NPARMS + NRHS = NSVAL( I ) + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 3, NXVAL( I ) ) + IF( NEWSD.EQ.0 ) THEN + DO 320 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 320 CONTINUE + END IF + WRITE( NOUT, FMT = 9995 )C3, NBVAL( I ), NBMIN( I ), + $ NXVAL( I ), NRHS + IF( TSTCHK ) THEN + CALL ZCHKBD( NN, MVAL, NVAL, MAXTYP, DOTYPE, NRHS, ISEED, + $ THRESH, A( 1, 1 ), NMAX, DR( 1, 1 ), + $ DR( 1, 2 ), DR( 1, 3 ), DR( 1, 4 ), + $ A( 1, 2 ), NMAX, A( 1, 3 ), A( 1, 4 ), + $ A( 1, 5 ), NMAX, A( 1, 6 ), NMAX, A( 1, 7 ), + $ A( 1, 8 ), WORK, LWORK, RWORK, NOUT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZCHKBD', INFO + END IF + IF( TSTDRV ) + $ CALL ZDRVBD( NN, MVAL, NVAL, MAXTYP, DOTYPE, ISEED, + $ THRESH, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, + $ A( 1, 3 ), NMAX, A( 1, 4 ), A( 1, 5 ), + $ A( 1, 6 ), DR( 1, 1 ), DR( 1, 2 ), + $ DR( 1, 3 ), WORK, LWORK, RWORK, IWORK, NOUT, + $ INFO ) + 330 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'ZEV' ) ) THEN +* +* -------------------------------------------- +* ZEV: Nonsymmetric Eigenvalue Problem Driver +* ZGEEV (eigenvalues and eigenvectors) +* -------------------------------------------- +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL ZERRED( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL ZDRVEV( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), DC( 1, 1 ), + $ DC( 1, 2 ), A( 1, 3 ), NMAX, A( 1, 4 ), NMAX, + $ A( 1, 5 ), NMAX, RESULT, WORK, LWORK, RWORK, + $ IWORK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZGEEV', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'ZES' ) ) THEN +* +* -------------------------------------------- +* ZES: Nonsymmetric Eigenvalue Problem Driver +* ZGEES (Schur form) +* -------------------------------------------- +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL ZERRED( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL ZDRVES( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ DC( 1, 1 ), DC( 1, 2 ), A( 1, 4 ), NMAX, + $ RESULT, WORK, LWORK, RWORK, IWORK, LOGWRK, + $ INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZGEES', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'ZVX' ) ) THEN +* +* -------------------------------------------------------------- +* ZVX: Nonsymmetric Eigenvalue Problem Expert Driver +* ZGEEVX (eigenvalues, 
eigenvectors and condition numbers) +* -------------------------------------------------------------- +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LT.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL ZERRED( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL ZDRVVX( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NIN, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), DC( 1, 1 ), + $ DC( 1, 2 ), A( 1, 3 ), NMAX, A( 1, 4 ), NMAX, + $ A( 1, 5 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), + $ DR( 1, 3 ), DR( 1, 4 ), DR( 1, 5 ), DR( 1, 6 ), + $ DR( 1, 7 ), DR( 1, 8 ), RESULT, WORK, LWORK, + $ RWORK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZGEEVX', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'ZSX' ) ) THEN +* +* --------------------------------------------------- +* ZSX: Nonsymmetric Eigenvalue Problem Expert Driver +* ZGEESX (Schur form and condition numbers) +* --------------------------------------------------- +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LT.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL ZERRED( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL ZDRVSX( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NIN, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ DC( 1, 1 ), DC( 1, 2 ), DC( 1, 3 ), A( 1, 4 ), + $ NMAX, A( 1, 5 ), RESULT, WORK, LWORK, RWORK, + $ LOGWRK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZGEESX', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'ZGG' ) ) THEN +* +* ------------------------------------------------- +* ZGG: Generalized Nonsymmetric Eigenvalue Problem +* ------------------------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NS = number of shifts +* MAXB = minimum submatrix size +* IACC22: structured matrix multiply +* NBCOL = minimum column dimension for blocks +* + MAXTYP = 26 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV(1,1) + IF( TSTCHK .AND. TSTERR ) + $ CALL ZERRGG( C3, NOUT ) + DO 350 I = 1, NPARMS + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 4, NSVAL( I ) ) + CALL XLAENV( 8, MXBVAL( I ) ) + CALL XLAENV( 16, IACC22( I ) ) + CALL XLAENV( 5, NBCOL( I ) ) +* + IF( NEWSD.EQ.0 ) THEN + DO 340 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 340 CONTINUE + END IF + WRITE( NOUT, FMT = 9996 )C3, NBVAL( I ), NBMIN( I ), + $ NSVAL( I ), MXBVAL( I ), IACC22( I ), NBCOL( I ) + TSTDIF = .FALSE. 
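+*
+*        Note: TSTDIF (above) and THRSHN (below) are fixed here rather
+*        than read from the input file; both are passed straight to
+*        ZCHKGG, which documents their exact meaning.
+*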
+ THRSHN = 10.D0 + IF( TSTCHK ) THEN + CALL ZCHKGG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, + $ TSTDIF, THRSHN, NOUT, A( 1, 1 ), NMAX, + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ A( 1, 6 ), A( 1, 7 ), A( 1, 8 ), A( 1, 9 ), + $ NMAX, A( 1, 10 ), A( 1, 11 ), A( 1, 12 ), + $ DC( 1, 1 ), DC( 1, 2 ), DC( 1, 3 ), + $ DC( 1, 4 ), A( 1, 13 ), A( 1, 14 ), WORK, + $ LWORK, RWORK, LOGWRK, RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZCHKGG', INFO + END IF + 350 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'ZGS' ) ) THEN +* +* ------------------------------------------------- +* ZGS: Generalized Nonsymmetric Eigenvalue Problem +* ZGGES (Schur form) +* ------------------------------------------------- +* + MAXTYP = 26 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL ZERRGG( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL ZDRGES( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), + $ DC( 1, 1 ), DC( 1, 2 ), WORK, LWORK, RWORK, + $ RESULT, LOGWRK, INFO ) +* + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZDRGES', INFO +* +* Blocked version +* + CALL ZDRGES3( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), + $ DC( 1, 1 ), DC( 1, 2 ), WORK, LWORK, RWORK, + $ RESULT, LOGWRK, INFO ) +* + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZDRGES3', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( ZGX ) THEN +* +* ------------------------------------------------- +* ZGX Generalized Nonsymmetric Eigenvalue Problem +* ZGGESX (Schur form and condition numbers) +* ------------------------------------------------- +* + MAXTYP = 5 + NTYPES = MAXTYP + IF( NN.LT.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL ZERRGG( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 5, 2 ) + CALL ZDRGSX( NN, NCMAX, THRESH, NIN, NOUT, A( 1, 1 ), NMAX, + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ A( 1, 6 ), DC( 1, 1 ), DC( 1, 2 ), C, + $ NCMAX*NCMAX, S, WORK, LWORK, RWORK, IWORK, + $ LIWORK, LOGWRK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZDRGSX', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'ZGV' ) ) THEN +* +* ------------------------------------------------- +* ZGV: Generalized Nonsymmetric Eigenvalue Problem +* ZGGEV (Eigenvalue/vector form) +* ------------------------------------------------- +* + MAXTYP = 26 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL ZERRGG( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL ZDRGEV( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), + $ A( 1, 9 ), NMAX, DC( 1, 1 ), DC( 1, 2 ), + $ DC( 1, 3 ), DC( 1, 4 ), WORK, LWORK, RWORK, + $ RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZDRGEV', INFO +* +* Blocked version +* + CALL XLAENV(16,2) + CALL ZDRGEV3( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), + $ A( 1, 9 ), NMAX, DC( 1, 1 ), DC( 1, 2 ), + $ DC( 1, 3 ), DC( 1, 4 ), WORK, LWORK, RWORK, + $ RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZDRGEV3', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE 
IF( ZXV ) THEN +* +* ------------------------------------------------- +* ZXV: Generalized Nonsymmetric Eigenvalue Problem +* ZGGEVX (eigenvalue/vector with condition numbers) +* ------------------------------------------------- +* + MAXTYP = 2 + NTYPES = MAXTYP + IF( NN.LT.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL ZERRGG( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL ZDRGVX( NN, THRESH, NIN, NOUT, A( 1, 1 ), NMAX, + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), DC( 1, 1 ), + $ DC( 1, 2 ), A( 1, 5 ), A( 1, 6 ), IWORK( 1 ), + $ IWORK( 2 ), DR( 1, 1 ), DR( 1, 2 ), DR( 1, 3 ), + $ DR( 1, 4 ), DR( 1, 5 ), DR( 1, 6 ), WORK, + $ LWORK, RWORK, IWORK( 3 ), LIWORK-2, RESULT, + $ LOGWRK, INFO ) +* + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZDRGVX', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'ZHB' ) ) THEN +* +* ------------------------------ +* ZHB: Hermitian Band Reduction +* ------------------------------ +* + MAXTYP = 15 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL ZERRST( 'ZHB', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF +* CALL ZCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, +* $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), +* $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, +* $ INFO ) + CALL ZCHKHB2STG( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, + $ THRESH, NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), + $ DR( 1, 2 ), DR( 1, 3 ), DR( 1, 4 ), DR( 1, 5 ), + $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, + $ INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZCHKHB', INFO +* + ELSE IF( LSAMEN( 3, C3, 'ZBB' ) ) THEN +* +* ------------------------------ +* ZBB: General Band Reduction +* ------------------------------ +* + MAXTYP = 15 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + DO 370 I = 1, NPARMS + NRHS = NSVAL( I ) +* + IF( NEWSD.EQ.0 ) THEN + DO 360 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 360 CONTINUE + END IF + WRITE( NOUT, FMT = 9966 )C3, NRHS + CALL ZCHKBB( NN, MVAL, NVAL, NK, KVAL, MAXTYP, DOTYPE, NRHS, + $ ISEED, THRESH, NOUT, A( 1, 1 ), NMAX, + $ A( 1, 2 ), 2*NMAX, DR( 1, 1 ), DR( 1, 2 ), + $ A( 1, 4 ), NMAX, A( 1, 5 ), NMAX, A( 1, 6 ), + $ NMAX, A( 1, 7 ), WORK, LWORK, RWORK, RESULT, + $ INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZCHKBB', INFO + 370 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'GLM' ) ) THEN +* +* ----------------------------------------- +* GLM: Generalized Linear Regression Model +* ----------------------------------------- +* + CALL XLAENV( 1, 1 ) + IF( TSTERR ) + $ CALL ZERRGG( 'GLM', NOUT ) + CALL ZCKGLM( NN, NVAL, MVAL, PVAL, NTYPES, ISEED, THRESH, NMAX, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), X, + $ WORK, DR( 1, 1 ), NIN, NOUT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'ZCKGLM', INFO +* + ELSE IF( LSAMEN( 3, C3, 'GQR' ) ) THEN +* +* ------------------------------------------ +* GQR: Generalized QR and RQ factorizations +* ------------------------------------------ +* + CALL XLAENV( 1, 1 ) + IF( TSTERR ) + $ CALL ZERRGG( 'GQR', NOUT ) + CALL ZCKGQR( NN, MVAL, NN, PVAL, NN, NVAL, NTYPES, ISEED, + $ THRESH, NMAX, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), TAUA, B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ B( 1, 4 ), B( 1, 5 ), TAUB, WORK, DR( 1, 1 ), NIN, + $ NOUT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, 
FMT = 9980 )'ZCKGQR', INFO
+*
+      ELSE IF( LSAMEN( 3, C3, 'GSV' ) ) THEN
+*
+*        ----------------------------------------------
+*        GSV:  Generalized Singular Value Decomposition
+*        ----------------------------------------------
+*
+         CALL XLAENV(1,1)
+         IF( TSTERR )
+     $      CALL ZERRGG( 'GSV', NOUT )
+         CALL ZCKGSV( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX,
+     $                A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ),
+     $                A( 1, 3 ), B( 1, 3 ), A( 1, 4 ), ALPHA, BETA,
+     $                B( 1, 4 ), IWORK, WORK, DR( 1, 1 ), NIN, NOUT,
+     $                INFO )
+         IF( INFO.NE.0 )
+     $      WRITE( NOUT, FMT = 9980 )'ZCKGSV', INFO
+*
+      ELSE IF( LSAMEN( 3, C3, 'CSD' ) ) THEN
+*
+*        ----------------------------------------------
+*        CSD:  CS Decomposition
+*        ----------------------------------------------
+*
+         CALL XLAENV(1,1)
+         IF( TSTERR )
+     $      CALL ZERRGG( 'CSD', NOUT )
+         CALL ZCKCSD( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX,
+     $                A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), A( 1, 4 ),
+     $                A( 1, 5 ), A( 1, 6 ), RWORK, IWORK, WORK,
+     $                DR( 1, 1 ), NIN, NOUT, INFO )
+         IF( INFO.NE.0 )
+     $      WRITE( NOUT, FMT = 9980 )'ZCKCSD', INFO
+*
+      ELSE IF( LSAMEN( 3, C3, 'LSE' ) ) THEN
+*
+*        --------------------------------------
+*        LSE:  Constrained Linear Least Squares
+*        --------------------------------------
+*
+         CALL XLAENV( 1, 1 )
+         IF( TSTERR )
+     $      CALL ZERRGG( 'LSE', NOUT )
+         CALL ZCKLSE( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX,
+     $                A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), X,
+     $                WORK, DR( 1, 1 ), NIN, NOUT, INFO )
+         IF( INFO.NE.0 )
+     $      WRITE( NOUT, FMT = 9980 )'ZCKLSE', INFO
+      ELSE
+         WRITE( NOUT, FMT = * )
+         WRITE( NOUT, FMT = * )
+         WRITE( NOUT, FMT = 9992 )C3
+      END IF
+      IF( .NOT.( ZGX .OR. ZXV ) )
+     $   GO TO 190
+  380 CONTINUE
+      WRITE( NOUT, FMT = 9994 )
+      S2 = DSECND( )
+      WRITE( NOUT, FMT = 9993 )S2 - S1
+*
+      DEALLOCATE (S, STAT = AllocateStatus)
+      DEALLOCATE (A, STAT = AllocateStatus)
+      DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (C, STAT = AllocateStatus)
+      DEALLOCATE (RWORK, STAT = AllocateStatus)
+      DEALLOCATE (WORK, STAT = AllocateStatus)
+*
+ 9999 FORMAT( / ' Execution not attempted due to input errors' )
+ 9997 FORMAT( / / 1X, A3, ':  NB =', I4, ', NBMIN =', I4, ', NX =', I4 )
+ 9996 FORMAT( / / 1X, A3, ':  NB =', I4, ', NBMIN =', I4, ', NS =', I4,
+     $      ', MAXB =', I4, ', IACC22 =', I4, ', NBCOL =', I4 )
+ 9995 FORMAT( / / 1X, A3, ':  NB =', I4, ', NBMIN =', I4, ', NX =', I4,
+     $      ', NRHS =', I4 )
+ 9994 FORMAT( / / ' End of tests' )
+ 9993 FORMAT( ' Total time used = ', F12.2, ' seconds', / )
+ 9992 FORMAT( 1X, A3, ':  Unrecognized path name' )
+ 9991 FORMAT( / / ' *** Invalid integer value in column ', I2,
+     $      ' of input', ' line:', / A79 )
+ 9990 FORMAT( / / 1X, A3, ' routines were not tested' )
+ 9989 FORMAT( ' Invalid input value: ', A, '=', I6, '; must be >=',
+     $      I6 )
+ 9988 FORMAT( ' Invalid input value: ', A, '=', I6, '; must be <=',
+     $      I6 )
+ 9987 FORMAT( ' Tests of the Nonsymmetric Eigenvalue Problem routines' )
+ 9986 FORMAT( ' Tests of the Hermitian Eigenvalue Problem routines' )
+ 9985 FORMAT( ' Tests of the Singular Value Decomposition routines' )
+ 9984 FORMAT( / ' The following parameter values will be used:' )
+ 9983 FORMAT( 4X, A, 10I6, / 10X, 10I6 )
+ 9982 FORMAT( / ' Routines pass computational tests if test ratio is ',
+     $      'less than', F8.2, / )
+ 9981 FORMAT( ' Relative machine ', A, ' is taken to be', D16.6 )
+ 9980 FORMAT( ' *** Error code from ', A, ' = ', I4 )
+ 9979 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Driver',
+     $      / '    ZGEEV (eigenvalues and eigenvectors)' )
+ 9978 FORMAT( / ' Tests of the
Nonsymmetric Eigenvalue Problem Driver', + $ / ' ZGEES (Schur form)' ) + 9977 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Expert', + $ ' Driver', / ' ZGEEVX (eigenvalues, eigenvectors and', + $ ' condition numbers)' ) + 9976 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Expert', + $ ' Driver', / ' ZGEESX (Schur form and condition', + $ ' numbers)' ) + 9975 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem routines' ) + 9974 FORMAT( ' Tests of ZHBTRD', / ' (reduction of a Hermitian band ', + $ 'matrix to real tridiagonal form)' ) + 9973 FORMAT( / 1X, 71( '-' ) ) + 9972 FORMAT( / ' LAPACK VERSION ', I1, '.', I1, '.', I1 ) + 9971 FORMAT( / ' Tests of the Generalized Linear Regression Model ', + $ 'routines' ) + 9970 FORMAT( / ' Tests of the Generalized QR and RQ routines' ) + 9969 FORMAT( / ' Tests of the Generalized Singular Value', + $ ' Decomposition routines' ) + 9968 FORMAT( / ' Tests of the Linear Least Squares routines' ) + 9967 FORMAT( ' Tests of ZGBBRD', / ' (reduction of a general band ', + $ 'matrix to real bidiagonal form)' ) + 9966 FORMAT( / / 1X, A3, ': NRHS =', I4 ) + 9965 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem Expert Driver ZGGESX' ) + 9964 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem Driver ZGGES' ) + 9963 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem Driver ZGGEV' ) + 9962 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem Expert Driver ZGGEVX' ) + 9961 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4, + $ ', INMIN=', I4, + $ ', INWIN =', I4, ', INIBL =', I4, ', ISHFTS =', I4, + $ ', IACC22 =', I4) + 9960 FORMAT( / ' Tests of the CS Decomposition routines' ) +* +* End of ZCHKEE +* + END From a5ab891292052b5c6ea58ba47e3f58efc5043ce3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Feb 2021 18:49:50 +0100 Subject: [PATCH 127/681] Add rewritten schkee.F from Reference-LAPACK PR335 --- lapack-netlib/TESTING/EIG/schkee.F | 2541 ++++++++++++++++++++++++++++ 1 file changed, 2541 insertions(+) create mode 100644 lapack-netlib/TESTING/EIG/schkee.F diff --git a/lapack-netlib/TESTING/EIG/schkee.F b/lapack-netlib/TESTING/EIG/schkee.F new file mode 100644 index 000000000..a063c18b5 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/schkee.F @@ -0,0 +1,2541 @@ +*> \brief \b SCHKEE +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM SCHKEE +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SCHKEE tests the REAL LAPACK subroutines for the matrix +*> eigenvalue problem. 
The test paths in this version are +*> +*> NEP (Nonsymmetric Eigenvalue Problem): +*> Test SGEHRD, SORGHR, SHSEQR, STREVC, SHSEIN, and SORMHR +*> +*> SEP (Symmetric Eigenvalue Problem): +*> Test SSYTRD, SORGTR, SSTEQR, SSTERF, SSTEIN, SSTEDC, +*> and drivers SSYEV(X), SSBEV(X), SSPEV(X), SSTEV(X), +*> SSYEVD, SSBEVD, SSPEVD, SSTEVD +*> +*> SVD (Singular Value Decomposition): +*> Test SGEBRD, SORGBR, SBDSQR, SBDSDC +*> and the drivers SGESVD, SGESDD +*> +*> SEV (Nonsymmetric Eigenvalue/eigenvector Driver): +*> Test SGEEV +*> +*> SES (Nonsymmetric Schur form Driver): +*> Test SGEES +*> +*> SVX (Nonsymmetric Eigenvalue/eigenvector Expert Driver): +*> Test SGEEVX +*> +*> SSX (Nonsymmetric Schur form Expert Driver): +*> Test SGEESX +*> +*> SGG (Generalized Nonsymmetric Eigenvalue Problem): +*> Test SGGHD3, SGGBAL, SGGBAK, SHGEQZ, and STGEVC +*> +*> SGS (Generalized Nonsymmetric Schur form Driver): +*> Test SGGES +*> +*> SGV (Generalized Nonsymmetric Eigenvalue/eigenvector Driver): +*> Test SGGEV +*> +*> SGX (Generalized Nonsymmetric Schur form Expert Driver): +*> Test SGGESX +*> +*> SXV (Generalized Nonsymmetric Eigenvalue/eigenvector Expert Driver): +*> Test SGGEVX +*> +*> SSG (Symmetric Generalized Eigenvalue Problem): +*> Test SSYGST, SSYGV, SSYGVD, SSYGVX, SSPGST, SSPGV, SSPGVD, +*> SSPGVX, SSBGST, SSBGV, SSBGVD, and SSBGVX +*> +*> SSB (Symmetric Band Eigenvalue Problem): +*> Test SSBTRD +*> +*> SBB (Band Singular Value Decomposition): +*> Test SGBBRD +*> +*> SEC (Eigencondition estimation): +*> Test SLALN2, SLASY2, SLAEQU, SLAEXC, STRSYL, STREXC, STRSNA, +*> STRSEN, and SLAQTR +*> +*> SBL (Balancing a general matrix) +*> Test SGEBAL +*> +*> SBK (Back transformation on a balanced matrix) +*> Test SGEBAK +*> +*> SGL (Balancing a matrix pair) +*> Test SGGBAL +*> +*> SGK (Back transformation on a matrix pair) +*> Test SGGBAK +*> +*> GLM (Generalized Linear Regression Model): +*> Tests SGGGLM +*> +*> GQR (Generalized QR and RQ factorizations): +*> Tests SGGQRF and SGGRQF +*> +*> GSV (Generalized Singular Value Decomposition): +*> Tests SGGSVD, SGGSVP, STGSJA, SLAGS2, SLAPLL, and SLAPMT +*> +*> CSD (CS decomposition): +*> Tests SORCSD +*> +*> LSE (Constrained Linear Least Squares): +*> Tests SGGLSE +*> +*> Each test path has a different set of inputs, but the data sets for +*> the driver routines xEV, xES, xVX, and xSX can be concatenated in a +*> single input file. The first line of input should contain one of the +*> 3-character path names in columns 1-3. The number of remaining lines +*> depends on what is found on the first line. +*> +*> The number of matrix types used in testing is often controllable from +*> the input file. The number of matrix types for each path, and the +*> test routine that describes them, is as follows: +*> +*> Path name(s) Types Test routine +*> +*> SHS or NEP 21 SCHKHS +*> SST or SEP 21 SCHKST (routines) +*> 18 SDRVST (drivers) +*> SBD or SVD 16 SCHKBD (routines) +*> 5 SDRVBD (drivers) +*> SEV 21 SDRVEV +*> SES 21 SDRVES +*> SVX 21 SDRVVX +*> SSX 21 SDRVSX +*> SGG 26 SCHKGG (routines) +*> SGS 26 SDRGES +*> SGX 5 SDRGSX +*> SGV 26 SDRGEV +*> SXV 2 SDRGVX +*> SSG 21 SDRVSG +*> SSB 15 SCHKSB +*> SBB 15 SCHKBB +*> SEC - SCHKEC +*> SBL - SCHKBL +*> SBK - SCHKBK +*> SGL - SCHKGL +*> SGK - SCHKGK +*> GLM 8 SCKGLM +*> GQR 8 SCKGQR +*> GSV 8 SCKGSV +*> CSD 3 SCKCSD +*> LSE 8 SCKLSE +*> +*>----------------------------------------------------------------------- +*> +*> NEP input file: +*> +*> line 2: NN, INTEGER +*> Number of values of N. 
+*> +*> line 3: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix dimension N. +*> +*> line 4: NPARMS, INTEGER +*> Number of values of the parameters NB, NBMIN, NX, NS, and +*> MAXB. +*> +*> line 5: NBVAL, INTEGER array, dimension (NPARMS) +*> The values for the blocksize NB. +*> +*> line 6: NBMIN, INTEGER array, dimension (NPARMS) +*> The values for the minimum blocksize NBMIN. +*> +*> line 7: NXVAL, INTEGER array, dimension (NPARMS) +*> The values for the crossover point NX. +*> +*> line 8: INMIN, INTEGER array, dimension (NPARMS) +*> LAHQR vs TTQRE crossover point, >= 11 +*> +*> line 9: INWIN, INTEGER array, dimension (NPARMS) +*> recommended deflation window size +*> +*> line 10: INIBL, INTEGER array, dimension (NPARMS) +*> nibble crossover point +*> +*> line 11: ISHFTS, INTEGER array, dimension (NPARMS) +*> number of simultaneous shifts) +*> +*> line 12: IACC22, INTEGER array, dimension (NPARMS) +*> select structured matrix multiply: 0, 1 or 2) +*> +*> line 13: THRESH +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. To have all of the test +*> ratios printed, use THRESH = 0.0 . +*> +*> line 14: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 14 was 2: +*> +*> line 15: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 15-EOF: The remaining lines occur in sets of 1 or 2 and allow +*> the user to specify the matrix types. Each line contains +*> a 3-character path name in columns 1-3, and the number +*> of matrix types must be the first nonblank item in columns +*> 4-80. If the number of matrix types is at least 1 but is +*> less than the maximum number of possible types, a second +*> line will be read to get the numbers of the matrix types to +*> be used. For example, +*> NEP 21 +*> requests all of the matrix types for the nonsymmetric +*> eigenvalue problem, while +*> NEP 4 +*> 9 10 11 12 +*> requests only matrices of type 9, 10, 11, and 12. +*> +*> The valid 3-character path names are 'NEP' or 'SHS' for the +*> nonsymmetric eigenvalue routines. +*> +*>----------------------------------------------------------------------- +*> +*> SEP or SSG input file: +*> +*> line 2: NN, INTEGER +*> Number of values of N. +*> +*> line 3: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix dimension N. +*> +*> line 4: NPARMS, INTEGER +*> Number of values of the parameters NB, NBMIN, and NX. +*> +*> line 5: NBVAL, INTEGER array, dimension (NPARMS) +*> The values for the blocksize NB. +*> +*> line 6: NBMIN, INTEGER array, dimension (NPARMS) +*> The values for the minimum blocksize NBMIN. +*> +*> line 7: NXVAL, INTEGER array, dimension (NPARMS) +*> The values for the crossover point NX. +*> +*> line 8: THRESH +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 9: TSTCHK, LOGICAL +*> Flag indicating whether or not to test the LAPACK routines. +*> +*> line 10: TSTDRV, LOGICAL +*> Flag indicating whether or not to test the driver routines. 
+*>
+*> line 11: TSTERR, LOGICAL
+*> Flag indicating whether or not to test the error exits for
+*> the LAPACK routines and driver routines.
+*>
+*> line 12: NEWSD, INTEGER
+*> A code indicating how to set the random number seed.
+*> = 0: Set the seed to a default value before each run
+*> = 1: Initialize the seed to a default value only before the
+*> first run
+*> = 2: Like 1, but use the seed values on the next line
+*>
+*> If line 12 was 2:
+*>
+*> line 13: INTEGER array, dimension (4)
+*> Four integer values for the random number seed.
+*>
+*> lines 13-EOF: Lines specifying matrix types, as for NEP.
+*> The 3-character path names are 'SEP' or 'SST' for the
+*> symmetric eigenvalue routines and driver routines, and
+*> 'SSG' for the routines for the symmetric generalized
+*> eigenvalue problem.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> SVD input file:
+*>
+*> line 2: NN, INTEGER
+*> Number of values of M and N.
+*>
+*> line 3: MVAL, INTEGER array, dimension (NN)
+*> The values for the matrix row dimension M.
+*>
+*> line 4: NVAL, INTEGER array, dimension (NN)
+*> The values for the matrix column dimension N.
+*>
+*> line 5: NPARMS, INTEGER
+*> Number of values of the parameters NB, NBMIN, NX, and NRHS.
+*>
+*> line 6: NBVAL, INTEGER array, dimension (NPARMS)
+*> The values for the blocksize NB.
+*>
+*> line 7: NBMIN, INTEGER array, dimension (NPARMS)
+*> The values for the minimum blocksize NBMIN.
+*>
+*> line 8: NXVAL, INTEGER array, dimension (NPARMS)
+*> The values for the crossover point NX.
+*>
+*> line 9: NSVAL, INTEGER array, dimension (NPARMS)
+*> The values for the number of right hand sides NRHS.
+*>
+*> line 10: THRESH
+*> Threshold value for the test ratios. Information will be
+*> printed about each test for which the test ratio is greater
+*> than or equal to the threshold.
+*>
+*> line 11: TSTCHK, LOGICAL
+*> Flag indicating whether or not to test the LAPACK routines.
+*>
+*> line 12: TSTDRV, LOGICAL
+*> Flag indicating whether or not to test the driver routines.
+*>
+*> line 13: TSTERR, LOGICAL
+*> Flag indicating whether or not to test the error exits for
+*> the LAPACK routines and driver routines.
+*>
+*> line 14: NEWSD, INTEGER
+*> A code indicating how to set the random number seed.
+*> = 0: Set the seed to a default value before each run
+*> = 1: Initialize the seed to a default value only before the
+*> first run
+*> = 2: Like 1, but use the seed values on the next line
+*>
+*> If line 14 was 2:
+*>
+*> line 15: INTEGER array, dimension (4)
+*> Four integer values for the random number seed.
+*>
+*> lines 15-EOF: Lines specifying matrix types, as for NEP.
+*> The 3-character path names are 'SVD' or 'SBD' for both the
+*> SVD routines and the SVD driver routines.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> SEV and SES data files:
+*>
+*> line 1: 'SEV' or 'SES' in columns 1 to 3.
+*>
+*> line 2: NSIZES, INTEGER
+*> Number of sizes of matrices to use. Should be at least 0
+*> and at most 20. If NSIZES = 0, no testing is done
+*> (although the remaining 3 lines are still read).
+*>
+*> line 3: NN, INTEGER array, dimension(NSIZES)
+*> Dimensions of matrices to be tested.
+*> +*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> These integer parameters determine how blocking is done +*> (see ILAENV for details) +*> NB : block size +*> NBMIN : minimum block size +*> NX : minimum dimension for blocking +*> NS : number of shifts in xHSEQR +*> NBCOL : minimum column dimension for blocking +*> +*> line 5: THRESH, REAL +*> The test threshold against which computed residuals are +*> compared. Should generally be in the range from 10. to 20. +*> If it is 0., all test case data will be printed. +*> +*> line 6: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits. +*> +*> line 7: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 7 was 2: +*> +*> line 8: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 9 and following: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'SEV' to test SGEEV, or +*> 'SES' to test SGEES. +*> +*>----------------------------------------------------------------------- +*> +*> The SVX data has two parts. The first part is identical to SEV, +*> and the second part consists of test matrices with precomputed +*> solutions. +*> +*> line 1: 'SVX' in columns 1-3. +*> +*> line 2: NSIZES, INTEGER +*> If NSIZES = 0, no testing of randomly generated examples +*> is done, but any precomputed examples are tested. +*> +*> line 3: NN, INTEGER array, dimension(NSIZES) +*> +*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> +*> line 5: THRESH, REAL +*> +*> line 6: TSTERR, LOGICAL +*> +*> line 7: NEWSD, INTEGER +*> +*> If line 7 was 2: +*> +*> line 8: INTEGER array, dimension (4) +*> +*> lines 9 and following: The first line contains 'SVX' in columns 1-3 +*> followed by the number of matrix types, possibly with +*> a second line to specify certain matrix types. +*> If the number of matrix types = 0, no testing of randomly +*> generated examples is done, but any precomputed examples +*> are tested. +*> +*> remaining lines : Each matrix is stored on 1+2*N lines, where N is +*> its dimension. The first line contains the dimension (a +*> single integer). The next N lines contain the matrix, one +*> row per line. The last N lines correspond to each +*> eigenvalue. Each of these last N lines contains 4 real +*> values: the real part of the eigenvalue, the imaginary +*> part of the eigenvalue, the reciprocal condition number of +*> the eigenvalues, and the reciprocal condition number of the +*> eigenvector. The end of data is indicated by dimension N=0. +*> Even if no data is to be tested, there must be at least one +*> line containing N=0. +*> +*>----------------------------------------------------------------------- +*> +*> The SSX data is like SVX. The first part is identical to SEV, and the +*> second part consists of test matrices with precomputed solutions. +*> +*> line 1: 'SSX' in columns 1-3. +*> +*> line 2: NSIZES, INTEGER +*> If NSIZES = 0, no testing of randomly generated examples +*> is done, but any precomputed examples are tested. 
+*>
+*> line 3: NN, INTEGER array, dimension(NSIZES)
+*>
+*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs
+*>
+*> line 5: THRESH, REAL
+*>
+*> line 6: TSTERR, LOGICAL
+*>
+*> line 7: NEWSD, INTEGER
+*>
+*> If line 7 was 2:
+*>
+*> line 8: INTEGER array, dimension (4)
+*>
+*> lines 9 and following: The first line contains 'SSX' in columns 1-3
+*> followed by the number of matrix types, possibly with
+*> a second line to specify certain matrix types.
+*> If the number of matrix types = 0, no testing of randomly
+*> generated examples is done, but any precomputed examples
+*> are tested.
+*>
+*> remaining lines : Each matrix is stored on 3+N lines, where N is its
+*> dimension. The first line contains the dimension N and the
+*> dimension M of an invariant subspace. The second line
+*> contains M integers, identifying the eigenvalues in the
+*> invariant subspace (by their position in a list of
+*> eigenvalues ordered by increasing real part). The next N
+*> lines contain the matrix. The last line contains the
+*> reciprocal condition number for the average of the selected
+*> eigenvalues, and the reciprocal condition number for the
+*> corresponding right invariant subspace. The end of data is
+*> indicated by a line containing N=0 and M=0. Even if no data
+*> is to be tested, there must be at least one line containing
+*> N=0 and M=0.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> SGG input file:
+*>
+*> line 2: NN, INTEGER
+*> Number of values of N.
+*>
+*> line 3: NVAL, INTEGER array, dimension (NN)
+*> The values for the matrix dimension N.
+*>
+*> line 4: NPARMS, INTEGER
+*> Number of values of the parameters NB, NBMIN, NS, MAXB, and
+*> NBCOL.
+*>
+*> line 5: NBVAL, INTEGER array, dimension (NPARMS)
+*> The values for the blocksize NB.
+*>
+*> line 6: NBMIN, INTEGER array, dimension (NPARMS)
+*> The values for NBMIN, the minimum row dimension for blocks.
+*>
+*> line 7: NSVAL, INTEGER array, dimension (NPARMS)
+*> The values for the number of shifts.
+*>
+*> line 8: MXBVAL, INTEGER array, dimension (NPARMS)
+*> The values for MAXB, used in determining minimum blocksize.
+*>
+*> line 9: IACC22, INTEGER array, dimension (NPARMS)
+*> select structured matrix multiply: 1 or 2
+*>
+*> line 10: NBCOL, INTEGER array, dimension (NPARMS)
+*> The values for NBCOL, the minimum column dimension for
+*> blocks.
+*>
+*> line 11: THRESH
+*> Threshold value for the test ratios. Information will be
+*> printed about each test for which the test ratio is greater
+*> than or equal to the threshold.
+*>
+*> line 12: TSTCHK, LOGICAL
+*> Flag indicating whether or not to test the LAPACK routines.
+*>
+*> line 13: TSTDRV, LOGICAL
+*> Flag indicating whether or not to test the driver routines.
+*>
+*> line 14: TSTERR, LOGICAL
+*> Flag indicating whether or not to test the error exits for
+*> the LAPACK routines and driver routines.
+*>
+*> line 15: NEWSD, INTEGER
+*> A code indicating how to set the random number seed.
+*> = 0: Set the seed to a default value before each run
+*> = 1: Initialize the seed to a default value only before the
+*> first run
+*> = 2: Like 1, but use the seed values on the next line
+*>
+*> If line 15 was 2:
+*>
+*> line 16: INTEGER array, dimension (4)
+*> Four integer values for the random number seed.
+*>
+*> lines 17-EOF: Lines specifying matrix types, as for NEP.
+*> The 3-character path name is 'SGG' for the generalized
+*> eigenvalue problem routines and driver routines.
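+*>
+*> As an illustration only, an SGG input file following the layout
+*> above might look like the sketch below (the sizes and parameter
+*> values are invented for this example and are not taken from any
+*> distributed input file):
+*>
+*>    SGG
+*>    2
+*>    10 16
+*>    1
+*>    1
+*>    1
+*>    2
+*>    1
+*>    1
+*>    1
+*>    20.0
+*>    T
+*>    T
+*>    T
+*>    0
+*>    SGG 26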
+*>
+*>-----------------------------------------------------------------------
+*>
+*> SGS and SGV input files:
+*>
+*> line 1: 'SGS' or 'SGV' in columns 1 to 3.
+*>
+*> line 2: NN, INTEGER
+*> Number of values of N.
+*>
+*> line 3: NVAL, INTEGER array, dimension(NN)
+*> Dimensions of matrices to be tested.
+*>
+*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs
+*> These integer parameters determine how blocking is done
+*> (see ILAENV for details)
+*> NB : block size
+*> NBMIN : minimum block size
+*> NX : minimum dimension for blocking
+*> NS : number of shifts in xHGEQZ
+*> NBCOL : minimum column dimension for blocking
+*>
+*> line 5: THRESH, REAL
+*> The test threshold against which computed residuals are
+*> compared. Should generally be in the range from 10. to 20.
+*> If it is 0., all test case data will be printed.
+*>
+*> line 6: TSTERR, LOGICAL
+*> Flag indicating whether or not to test the error exits.
+*>
+*> line 7: NEWSD, INTEGER
+*> A code indicating how to set the random number seed.
+*> = 0: Set the seed to a default value before each run
+*> = 1: Initialize the seed to a default value only before the
+*> first run
+*> = 2: Like 1, but use the seed values on the next line
+*>
+*> If line 7 was 2:
+*>
+*> line 8: INTEGER array, dimension (4)
+*> Four integer values for the random number seed.
+*>
+*> lines 8-EOF: Lines specifying matrix types, as for NEP.
+*> The 3-character path name is 'SGS' for the generalized
+*> eigenvalue problem routines and driver routines.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> SXV input files:
+*>
+*> line 1: 'SXV' in columns 1 to 3.
+*>
+*> line 2: N, INTEGER
+*> Value of N.
+*>
+*> line 3: NB, NBMIN, NX, NS, NBCOL, INTEGERs
+*> These integer parameters determine how blocking is done
+*> (see ILAENV for details)
+*> NB : block size
+*> NBMIN : minimum block size
+*> NX : minimum dimension for blocking
+*> NS : number of shifts in xHGEQZ
+*> NBCOL : minimum column dimension for blocking
+*>
+*> line 4: THRESH, REAL
+*> The test threshold against which computed residuals are
+*> compared. Should generally be in the range from 10. to 20.
+*> Information will be printed about each test for which the
+*> test ratio is greater than or equal to the threshold.
+*>
+*> line 5: TSTERR, LOGICAL
+*> Flag indicating whether or not to test the error exits for
+*> the LAPACK routines and driver routines.
+*>
+*> line 6: NEWSD, INTEGER
+*> A code indicating how to set the random number seed.
+*> = 0: Set the seed to a default value before each run
+*> = 1: Initialize the seed to a default value only before the
+*> first run
+*> = 2: Like 1, but use the seed values on the next line
+*>
+*> If line 6 was 2:
+*>
+*> line 7: INTEGER array, dimension (4)
+*> Four integer values for the random number seed.
+*>
+*> If line 2 was 0:
+*>
+*> line 7-EOF: Precomputed examples are tested.
+*>
+*> remaining lines : Each example is stored on 3+2*N lines, where N is
+*> its dimension. The first line contains the dimension (a
+*> single integer). The next N lines contain the matrix A, one
+*> row per line. The next N lines contain the matrix B. The
+*> next line contains the reciprocals of the eigenvalue
+*> condition numbers. The last line contains the reciprocals of
+*> the eigenvector condition numbers. The end of data is
+*> indicated by dimension N=0. Even if no data is to be tested,
+*> there must be at least one line containing N=0.
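+*>
+*> For illustration, a minimal SXV file that skips the randomly
+*> generated tests (N = 0 on line 2) and supplies one precomputed
+*> 2-by-2 pair might read as sketched below; every numeric value here,
+*> including the condition-number lines, is invented for this example:
+*>
+*>    SXV
+*>    0
+*>    1 1 1 2 1
+*>    20.0
+*>    T
+*>    0
+*>    2
+*>    2.0 0.0
+*>    0.0 3.0
+*>    1.0 0.0
+*>    0.0 1.0
+*>    1.0 1.0
+*>    1.0 1.0
+*>    0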
+*>
+*>-----------------------------------------------------------------------
+*>
+*> SGX input files:
+*>
+*> line 1: 'SGX' in columns 1 to 3.
+*>
+*> line 2: N, INTEGER
+*> Value of N.
+*>
+*> line 3: NB, NBMIN, NX, NS, NBCOL, INTEGERs
+*> These integer parameters determine how blocking is done
+*> (see ILAENV for details)
+*> NB : block size
+*> NBMIN : minimum block size
+*> NX : minimum dimension for blocking
+*> NS : number of shifts in xHGEQZ
+*> NBCOL : minimum column dimension for blocking
+*>
+*> line 4: THRESH, REAL
+*> The test threshold against which computed residuals are
+*> compared. Should generally be in the range from 10. to 20.
+*> Information will be printed about each test for which the
+*> test ratio is greater than or equal to the threshold.
+*>
+*> line 5: TSTERR, LOGICAL
+*> Flag indicating whether or not to test the error exits for
+*> the LAPACK routines and driver routines.
+*>
+*> line 6: NEWSD, INTEGER
+*> A code indicating how to set the random number seed.
+*> = 0: Set the seed to a default value before each run
+*> = 1: Initialize the seed to a default value only before the
+*> first run
+*> = 2: Like 1, but use the seed values on the next line
+*>
+*> If line 6 was 2:
+*>
+*> line 7: INTEGER array, dimension (4)
+*> Four integer values for the random number seed.
+*>
+*> If line 2 was 0:
+*>
+*> line 7-EOF: Precomputed examples are tested.
+*>
+*> remaining lines : Each example is stored on 3+2*N lines, where N is
+*> its dimension. The first line contains the dimension (a
+*> single integer). The next line contains an integer k such
+*> that only the last k eigenvalues will be selected and appear
+*> in the leading diagonal blocks of $A$ and $B$. The next N
+*> lines contain the matrix A, one row per line. The next N
+*> lines contain the matrix B. The last line contains the
+*> reciprocal of the eigenvalue cluster condition number and the
+*> reciprocal of the deflating subspace (associated with the
+*> selected eigencluster) condition number. The end of data is
+*> indicated by dimension N=0. Even if no data is to be tested,
+*> there must be at least one line containing N=0.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> SSB input file:
+*>
+*> line 2: NN, INTEGER
+*> Number of values of N.
+*>
+*> line 3: NVAL, INTEGER array, dimension (NN)
+*> The values for the matrix dimension N.
+*>
+*> line 4: NK, INTEGER
+*> Number of values of K.
+*>
+*> line 5: KVAL, INTEGER array, dimension (NK)
+*> The values for the matrix dimension K.
+*>
+*> line 6: THRESH
+*> Threshold value for the test ratios. Information will be
+*> printed about each test for which the test ratio is greater
+*> than or equal to the threshold.
+*>
+*> line 7: NEWSD, INTEGER
+*> A code indicating how to set the random number seed.
+*> = 0: Set the seed to a default value before each run
+*> = 1: Initialize the seed to a default value only before the
+*> first run
+*> = 2: Like 1, but use the seed values on the next line
+*>
+*> If line 7 was 2:
+*>
+*> line 8: INTEGER array, dimension (4)
+*> Four integer values for the random number seed.
+*>
+*> lines 8-EOF: Lines specifying matrix types, as for NEP.
+*> The 3-character path name is 'SSB'.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> SBB input file:
+*>
+*> line 2: NN, INTEGER
+*> Number of values of M and N.
+*>
+*> line 3: MVAL, INTEGER array, dimension (NN)
+*> The values for the matrix row dimension M.
+*>
+*> line 4: NVAL, INTEGER array, dimension (NN)
+*> The values for the matrix column dimension N.
+*>
+*> line 5: NK, INTEGER
+*> Number of values of K.
+*>
+*> line 6: KVAL, INTEGER array, dimension (NK)
+*> The values for the matrix bandwidth K.
+*>
+*> line 7: NPARMS, INTEGER
+*> Number of values of the parameter NRHS.
+*>
+*> line 8: NSVAL, INTEGER array, dimension (NPARMS)
+*> The values for the number of right hand sides NRHS.
+*>
+*> line 9: THRESH
+*> Threshold value for the test ratios. Information will be
+*> printed about each test for which the test ratio is greater
+*> than or equal to the threshold.
+*>
+*> line 10: NEWSD, INTEGER
+*> A code indicating how to set the random number seed.
+*> = 0: Set the seed to a default value before each run
+*> = 1: Initialize the seed to a default value only before the
+*> first run
+*> = 2: Like 1, but use the seed values on the next line
+*>
+*> If line 10 was 2:
+*>
+*> line 11: INTEGER array, dimension (4)
+*> Four integer values for the random number seed.
+*>
+*> lines 11-EOF: Lines specifying matrix types, as for SVD.
+*> The 3-character path name is 'SBB'.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> SEC input file:
+*>
+*> line 2: THRESH, REAL
+*> Threshold value for the test ratios. Information will be
+*> printed about each test for which the test ratio is greater
+*> than or equal to the threshold.
+*>
+*> lines 3-EOF:
+*>
+*> Input for testing the eigencondition routines consists of a set of
+*> specially constructed test cases and their solutions. The data
+*> format is not intended to be modified by the user.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> SBL and SBK input files:
+*>
+*> line 1: 'SBL' in columns 1-3 to test SGEBAL, or 'SBK' in
+*> columns 1-3 to test SGEBAK.
+*>
+*> The remaining lines consist of specially constructed test cases.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> SGL and SGK input files:
+*>
+*> line 1: 'SGL' in columns 1-3 to test SGGBAL, or 'SGK' in
+*> columns 1-3 to test SGGBAK.
+*>
+*> The remaining lines consist of specially constructed test cases.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> GLM data file:
+*>
+*> line 1: 'GLM' in columns 1 to 3.
+*>
+*> line 2: NN, INTEGER
+*> Number of values of M, P, and N.
+*>
+*> line 3: MVAL, INTEGER array, dimension(NN)
+*> Values of M (row dimension).
+*>
+*> line 4: PVAL, INTEGER array, dimension(NN)
+*> Values of P (row dimension).
+*>
+*> line 5: NVAL, INTEGER array, dimension(NN)
+*> Values of N (column dimension), note M <= N <= M+P.
+*>
+*> line 6: THRESH, REAL
+*> Threshold value for the test ratios. Information will be
+*> printed about each test for which the test ratio is greater
+*> than or equal to the threshold.
+*>
+*> line 7: TSTERR, LOGICAL
+*> Flag indicating whether or not to test the error exits for
+*> the LAPACK routines and driver routines.
+*>
+*> line 8: NEWSD, INTEGER
+*> A code indicating how to set the random number seed.
+*> = 0: Set the seed to a default value before each run
+*> = 1: Initialize the seed to a default value only before the
+*> first run
+*> = 2: Like 1, but use the seed values on the next line
+*>
+*> If line 8 was 2:
+*>
+*> line 9: INTEGER array, dimension (4)
+*> Four integer values for the random number seed.
+*>
+*> lines 9-EOF: Lines specifying matrix types, as for NEP.
+*> The 3-character path name is 'GLM' for the generalized +*> linear regression model routines. +*> +*>----------------------------------------------------------------------- +*> +*> GQR data file: +*> +*> line 1: 'GQR' in columns 1 to 3. +*> +*> line 2: NN, INTEGER +*> Number of values of M, P, and N. +*> +*> line 3: MVAL, INTEGER array, dimension(NN) +*> Values of M. +*> +*> line 4: PVAL, INTEGER array, dimension(NN) +*> Values of P. +*> +*> line 5: NVAL, INTEGER array, dimension(NN) +*> Values of N. +*> +*> line 6: THRESH, REAL +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 7: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 8: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 8 was 2: +*> +*> line 9: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 9-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'GQR' for the generalized +*> QR and RQ routines. +*> +*>----------------------------------------------------------------------- +*> +*> GSV data file: +*> +*> line 1: 'GSV' in columns 1 to 3. +*> +*> line 2: NN, INTEGER +*> Number of values of M, P, and N. +*> +*> line 3: MVAL, INTEGER array, dimension(NN) +*> Values of M (row dimension). +*> +*> line 4: PVAL, INTEGER array, dimension(NN) +*> Values of P (row dimension). +*> +*> line 5: NVAL, INTEGER array, dimension(NN) +*> Values of N (column dimension). +*> +*> line 6: THRESH, REAL +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 7: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 8: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 8 was 2: +*> +*> line 9: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 9-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'GSV' for the generalized +*> SVD routines. +*> +*>----------------------------------------------------------------------- +*> +*> CSD data file: +*> +*> line 1: 'CSD' in columns 1 to 3. +*> +*> line 2: NM, INTEGER +*> Number of values of M, P, and N. +*> +*> line 3: MVAL, INTEGER array, dimension(NM) +*> Values of M (row and column dimension of orthogonal matrix). +*> +*> line 4: PVAL, INTEGER array, dimension(NM) +*> Values of P (row dimension of top-left block). +*> +*> line 5: NVAL, INTEGER array, dimension(NM) +*> Values of N (column dimension of top-left block). +*> +*> line 6: THRESH, REAL +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. 
+*>
+*> line 7: TSTERR, LOGICAL
+*> Flag indicating whether or not to test the error exits for
+*> the LAPACK routines and driver routines.
+*>
+*> line 8: NEWSD, INTEGER
+*> A code indicating how to set the random number seed.
+*> = 0: Set the seed to a default value before each run
+*> = 1: Initialize the seed to a default value only before the
+*> first run
+*> = 2: Like 1, but use the seed values on the next line
+*>
+*> If line 8 was 2:
+*>
+*> line 9: INTEGER array, dimension (4)
+*> Four integer values for the random number seed.
+*>
+*> lines 9-EOF: Lines specifying matrix types, as for NEP.
+*> The 3-character path name is 'CSD' for the CSD routine.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> LSE data file:
+*>
+*> line 1: 'LSE' in columns 1 to 3.
+*>
+*> line 2: NN, INTEGER
+*> Number of values of M, P, and N.
+*>
+*> line 3: MVAL, INTEGER array, dimension(NN)
+*> Values of M.
+*>
+*> line 4: PVAL, INTEGER array, dimension(NN)
+*> Values of P.
+*>
+*> line 5: NVAL, INTEGER array, dimension(NN)
+*> Values of N, note P <= N <= P+M.
+*>
+*> line 6: THRESH, REAL
+*> Threshold value for the test ratios. Information will be
+*> printed about each test for which the test ratio is greater
+*> than or equal to the threshold.
+*>
+*> line 7: TSTERR, LOGICAL
+*> Flag indicating whether or not to test the error exits for
+*> the LAPACK routines and driver routines.
+*>
+*> line 8: NEWSD, INTEGER
+*> A code indicating how to set the random number seed.
+*> = 0: Set the seed to a default value before each run
+*> = 1: Initialize the seed to a default value only before the
+*> first run
+*> = 2: Like 1, but use the seed values on the next line
+*>
+*> If line 8 was 2:
+*>
+*> line 9: INTEGER array, dimension (4)
+*> Four integer values for the random number seed.
+*>
+*> lines 9-EOF: Lines specifying matrix types, as for NEP.
+*> The 3-character path name is 'LSE' for the constrained
+*> linear least squares routines.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> NMAX is currently set to 132 and must be at least 12 for some of the
+*> precomputed examples, and LWORK = NMAX*(5*NMAX+5)+1 in the parameter
+*> statements below. For SVD, we assume NRHS may be as big as N. The
+*> parameter NEED is set to 14 to allow for 14 N-by-N matrices for SGG.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup single_eig
+*
+* =====================================================================
+ PROGRAM SCHKEE
+*
+#if defined(_OPENMP)
+ use omp_lib
+#endif
+*
+* -- LAPACK test routine (version 3.7.0) --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+* June 2016
+*
+* =====================================================================
+*
+* .. Parameters ..
+ INTEGER NMAX
+ PARAMETER ( NMAX = 132 )
+ INTEGER NCMAX
+ PARAMETER ( NCMAX = 20 )
+ INTEGER NEED
+ PARAMETER ( NEED = 14 )
+ INTEGER LWORK
+ PARAMETER ( LWORK = NMAX*( 5*NMAX+5 )+1 )
+ INTEGER LIWORK
+ PARAMETER ( LIWORK = NMAX*( 5*NMAX+20 ) )
+ INTEGER MAXIN
+ PARAMETER ( MAXIN = 20 )
+ INTEGER MAXT
+ PARAMETER ( MAXT = 30 )
+ INTEGER NIN, NOUT
+ PARAMETER ( NIN = 5, NOUT = 6 )
+* ..
+* .. Local Scalars ..
+ LOGICAL CSD, FATAL, GLM, GQR, GSV, LSE, NEP, SBB, SBK,
+ $ SBL, SEP, SES, SEV, SGG, SGK, SGL, SGS, SGV,
+ $ SGX, SSB, SSX, SVD, SVX, SXV, TSTCHK, TSTDIF,
+ $ TSTDRV, TSTERR
+ CHARACTER C1
+ CHARACTER*3 C3, PATH
+ CHARACTER*32 VNAME
+ CHARACTER*10 INTSTR
+ CHARACTER*80 LINE
+ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD,
+ $ NK, NN, NPARMS, NRHS, NTYPES,
+ $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS
+ REAL EPS, S1, S2, THRESH, THRSHN
+* ..
+* .. Local Arrays ..
+ LOGICAL DOTYPE( MAXT ), LOGWRK( NMAX )
+ INTEGER IOLDSD( 4 ), ISEED( 4 ), IWORK( LIWORK ),
+ $ KVAL( MAXIN ), MVAL( MAXIN ), MXBVAL( MAXIN ),
+ $ NBCOL( MAXIN ), NBMIN( MAXIN ), NBVAL( MAXIN ),
+ $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ),
+ $ PVAL( MAXIN )
+ INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ),
+ $ ISHFTS( MAXIN ), IACC22( MAXIN )
+ REAL D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ),
+ $ TAUB( NMAX ), X( 5*NMAX )
+* ..
+* .. Allocatable Arrays ..
+ INTEGER AllocateStatus
+ REAL, DIMENSION(:), ALLOCATABLE :: WORK
+ REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, C
+* ..
+* .. External Functions ..
+ LOGICAL LSAMEN
+ REAL SECOND, SLAMCH
+ EXTERNAL LSAMEN, SECOND, SLAMCH
+* ..
+* .. External Subroutines ..
+ EXTERNAL ALAREQ, SCHKBB, SCHKBD, SCHKBK, SCHKBL, SCHKEC,
+ $ SCHKGG, SCHKGK, SCHKGL, SCHKHS, SCHKSB, SCHKST,
+ $ SCKCSD, SCKGLM, SCKGQR, SCKGSV, SCKLSE, SDRGES,
+ $ SDRGEV, SDRGSX, SDRGVX, SDRVBD, SDRVES, SDRVEV,
+ $ SDRVSG, SDRVST, SDRVSX, SDRVVX, SERRBD,
+ $ SERRED, SERRGG, SERRHS, SERRST, ILAVER, XLAENV,
+ $ SDRGES3, SDRGEV3,
+ $ SCHKST2STG, SDRVST2STG, SCHKSB2STG, SDRVSG2STG
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC LEN, MIN
+* ..
+* .. Scalars in Common ..
+ LOGICAL LERR, OK
+ CHARACTER*32 SRNAMT
+ INTEGER INFOT, MAXB, NPROC, NSHIFT, NUNIT, SELDIM,
+ $ SELOPT
+* ..
+* .. Arrays in Common ..
+ LOGICAL SELVAL( 20 )
+ INTEGER IPARMS( 100 )
+ REAL SELWI( 20 ), SELWR( 20 )
+* ..
+* .. Common blocks ..
+ COMMON / CENVIR / NPROC, NSHIFT, MAXB
+ COMMON / CLAENV / IPARMS
+ COMMON / INFOC / INFOT, NUNIT, OK, LERR
+ COMMON / SRNAMC / SRNAMT
+ COMMON / SSLCT / SELOPT, SELDIM, SELVAL, SELWR, SELWI
+* ..
+* .. Data statements ..
+ DATA INTSTR / '0123456789' /
+ DATA IOLDSD / 0, 0, 0, 1 /
+* ..
+* .. Allocate memory dynamically ..
+*
+ ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus )
+ IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+ ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus )
+ IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+ ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus )
+ IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+ ALLOCATE ( WORK(LWORK), STAT = AllocateStatus )
+ IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+* ..
+* .. Executable Statements ..
+*
+ A = 0.0
+ B = 0.0
+ C = 0.0
+ D = 0.0
+ S1 = SECOND( )
+ FATAL = .FALSE.
+ NUNIT = NOUT
+*
+* Return to here to read multiple sets of data
+*
+ 10 CONTINUE
+*
+* Read the first line and set the 3-character test path
+*
+ READ( NIN, FMT = '(A80)', END = 380 )LINE
+ PATH = LINE( 1: 3 )
+ NEP = LSAMEN( 3, PATH, 'NEP' ) .OR. LSAMEN( 3, PATH, 'SHS' )
+ SEP = LSAMEN( 3, PATH, 'SEP' ) .OR. LSAMEN( 3, PATH, 'SST' ) .OR.
+ $ LSAMEN( 3, PATH, 'SSG' ) .OR. LSAMEN( 3, PATH, 'SE2' )
+ SVD = LSAMEN( 3, PATH, 'SVD' ) .OR. 
LSAMEN( 3, PATH, 'SBD' ) + SEV = LSAMEN( 3, PATH, 'SEV' ) + SES = LSAMEN( 3, PATH, 'SES' ) + SVX = LSAMEN( 3, PATH, 'SVX' ) + SSX = LSAMEN( 3, PATH, 'SSX' ) + SGG = LSAMEN( 3, PATH, 'SGG' ) + SGS = LSAMEN( 3, PATH, 'SGS' ) + SGX = LSAMEN( 3, PATH, 'SGX' ) + SGV = LSAMEN( 3, PATH, 'SGV' ) + SXV = LSAMEN( 3, PATH, 'SXV' ) + SSB = LSAMEN( 3, PATH, 'SSB' ) + SBB = LSAMEN( 3, PATH, 'SBB' ) + GLM = LSAMEN( 3, PATH, 'GLM' ) + GQR = LSAMEN( 3, PATH, 'GQR' ) .OR. LSAMEN( 3, PATH, 'GRQ' ) + GSV = LSAMEN( 3, PATH, 'GSV' ) + CSD = LSAMEN( 3, PATH, 'CSD' ) + LSE = LSAMEN( 3, PATH, 'LSE' ) + SBL = LSAMEN( 3, PATH, 'SBL' ) + SBK = LSAMEN( 3, PATH, 'SBK' ) + SGL = LSAMEN( 3, PATH, 'SGL' ) + SGK = LSAMEN( 3, PATH, 'SGK' ) +* +* Report values of parameters. +* + IF( PATH.EQ.' ' ) THEN + GO TO 10 + ELSE IF( NEP ) THEN + WRITE( NOUT, FMT = 9987 ) + ELSE IF( SEP ) THEN + WRITE( NOUT, FMT = 9986 ) + ELSE IF( SVD ) THEN + WRITE( NOUT, FMT = 9985 ) + ELSE IF( SEV ) THEN + WRITE( NOUT, FMT = 9979 ) + ELSE IF( SES ) THEN + WRITE( NOUT, FMT = 9978 ) + ELSE IF( SVX ) THEN + WRITE( NOUT, FMT = 9977 ) + ELSE IF( SSX ) THEN + WRITE( NOUT, FMT = 9976 ) + ELSE IF( SGG ) THEN + WRITE( NOUT, FMT = 9975 ) + ELSE IF( SGS ) THEN + WRITE( NOUT, FMT = 9964 ) + ELSE IF( SGX ) THEN + WRITE( NOUT, FMT = 9965 ) + ELSE IF( SGV ) THEN + WRITE( NOUT, FMT = 9963 ) + ELSE IF( SXV ) THEN + WRITE( NOUT, FMT = 9962 ) + ELSE IF( SSB ) THEN + WRITE( NOUT, FMT = 9974 ) + ELSE IF( SBB ) THEN + WRITE( NOUT, FMT = 9967 ) + ELSE IF( GLM ) THEN + WRITE( NOUT, FMT = 9971 ) + ELSE IF( GQR ) THEN + WRITE( NOUT, FMT = 9970 ) + ELSE IF( GSV ) THEN + WRITE( NOUT, FMT = 9969 ) + ELSE IF( CSD ) THEN + WRITE( NOUT, FMT = 9960 ) + ELSE IF( LSE ) THEN + WRITE( NOUT, FMT = 9968 ) + ELSE IF( SBL ) THEN +* +* SGEBAL: Balancing +* + CALL SCHKBL( NIN, NOUT ) + GO TO 10 + ELSE IF( SBK ) THEN +* +* SGEBAK: Back transformation +* + CALL SCHKBK( NIN, NOUT ) + GO TO 10 + ELSE IF( SGL ) THEN +* +* SGGBAL: Balancing +* + CALL SCHKGL( NIN, NOUT ) + GO TO 10 + ELSE IF( SGK ) THEN +* +* SGGBAK: Back transformation +* + CALL SCHKGK( NIN, NOUT ) + GO TO 10 + ELSE IF( LSAMEN( 3, PATH, 'SEC' ) ) THEN +* +* SEC: Eigencondition estimation +* + READ( NIN, FMT = * )THRESH + CALL XLAENV( 1, 1 ) + CALL XLAENV( 12, 11 ) + CALL XLAENV( 13, 2 ) + CALL XLAENV( 14, 0 ) + CALL XLAENV( 15, 2 ) + CALL XLAENV( 16, 2 ) + TSTERR = .TRUE. + CALL SCHKEC( THRESH, TSTERR, NIN, NOUT ) + GO TO 10 + ELSE + WRITE( NOUT, FMT = 9992 )PATH + GO TO 10 + END IF + CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) + WRITE( NOUT, FMT = 9972 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH + WRITE( NOUT, FMT = 9984 ) +* +* Read the number of values of M, P, and N. +* + READ( NIN, FMT = * )NN + IF( NN.LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' NN ', NN, 1 + NN = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9988 )' NN ', NN, MAXIN + NN = 0 + FATAL = .TRUE. + END IF +* +* Read the values of M +* + IF( .NOT.( SGX .OR. SXV ) ) THEN + READ( NIN, FMT = * )( MVAL( I ), I = 1, NN ) + IF( SVD ) THEN + VNAME = ' M ' + ELSE + VNAME = ' N ' + END IF + DO 20 I = 1, NN + IF( MVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )VNAME, MVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( MVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )VNAME, MVAL( I ), NMAX + FATAL = .TRUE. + END IF + 20 CONTINUE + WRITE( NOUT, FMT = 9983 )'M: ', ( MVAL( I ), I = 1, NN ) + END IF +* +* Read the values of P +* + IF( GLM .OR. GQR .OR. GSV .OR. CSD .OR. 
LSE ) THEN + READ( NIN, FMT = * )( PVAL( I ), I = 1, NN ) + DO 30 I = 1, NN + IF( PVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' P ', PVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( PVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' P ', PVAL( I ), NMAX + FATAL = .TRUE. + END IF + 30 CONTINUE + WRITE( NOUT, FMT = 9983 )'P: ', ( PVAL( I ), I = 1, NN ) + END IF +* +* Read the values of N +* + IF( SVD .OR. SBB .OR. GLM .OR. GQR .OR. GSV .OR. CSD .OR. + $ LSE ) THEN + READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) + DO 40 I = 1, NN + IF( NVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' N ', NVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' N ', NVAL( I ), NMAX + FATAL = .TRUE. + END IF + 40 CONTINUE + ELSE + DO 50 I = 1, NN + NVAL( I ) = MVAL( I ) + 50 CONTINUE + END IF + IF( .NOT.( SGX .OR. SXV ) ) THEN + WRITE( NOUT, FMT = 9983 )'N: ', ( NVAL( I ), I = 1, NN ) + ELSE + WRITE( NOUT, FMT = 9983 )'N: ', NN + END IF +* +* Read the number of values of K, followed by the values of K +* + IF( SSB .OR. SBB ) THEN + READ( NIN, FMT = * )NK + READ( NIN, FMT = * )( KVAL( I ), I = 1, NK ) + DO 60 I = 1, NK + IF( KVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' K ', KVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( KVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' K ', KVAL( I ), NMAX + FATAL = .TRUE. + END IF + 60 CONTINUE + WRITE( NOUT, FMT = 9983 )'K: ', ( KVAL( I ), I = 1, NK ) + END IF +* + IF( SEV .OR. SES .OR. SVX .OR. SSX ) THEN +* +* For the nonsymmetric QR driver routines, only one set of +* parameters is allowed. +* + READ( NIN, FMT = * )NBVAL( 1 ), NBMIN( 1 ), NXVAL( 1 ), + $ INMIN( 1 ), INWIN( 1 ), INIBL(1), ISHFTS(1), IACC22(1) + IF( NBVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( 1 ), 1 + FATAL = .TRUE. + ELSE IF( NBMIN( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( 1 ), 1 + FATAL = .TRUE. + ELSE IF( NXVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( 1 ), 1 + FATAL = .TRUE. + ELSE IF( INMIN( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' INMIN ', INMIN( 1 ), 1 + FATAL = .TRUE. + ELSE IF( INWIN( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' INWIN ', INWIN( 1 ), 1 + FATAL = .TRUE. + ELSE IF( INIBL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' INIBL ', INIBL( 1 ), 1 + FATAL = .TRUE. + ELSE IF( ISHFTS( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' ISHFTS ', ISHFTS( 1 ), 1 + FATAL = .TRUE. + ELSE IF( IACC22( 1 ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' IACC22 ', IACC22( 1 ), 0 + FATAL = .TRUE. + END IF + CALL XLAENV( 1, NBVAL( 1 ) ) + CALL XLAENV( 2, NBMIN( 1 ) ) + CALL XLAENV( 3, NXVAL( 1 ) ) + CALL XLAENV(12, MAX( 11, INMIN( 1 ) ) ) + CALL XLAENV(13, INWIN( 1 ) ) + CALL XLAENV(14, INIBL( 1 ) ) + CALL XLAENV(15, ISHFTS( 1 ) ) + CALL XLAENV(16, IACC22( 1 ) ) + WRITE( NOUT, FMT = 9983 )'NB: ', NBVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'NBMIN:', NBMIN( 1 ) + WRITE( NOUT, FMT = 9983 )'NX: ', NXVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'INMIN: ', INMIN( 1 ) + WRITE( NOUT, FMT = 9983 )'INWIN: ', INWIN( 1 ) + WRITE( NOUT, FMT = 9983 )'INIBL: ', INIBL( 1 ) + WRITE( NOUT, FMT = 9983 )'ISHFTS: ', ISHFTS( 1 ) + WRITE( NOUT, FMT = 9983 )'IACC22: ', IACC22( 1 ) +* + ELSE IF( SGS .OR. SGX .OR. SGV .OR. SXV ) THEN +* +* For the nonsymmetric generalized driver routines, only one set +* of parameters is allowed. +* + READ( NIN, FMT = * )NBVAL( 1 ), NBMIN( 1 ), NXVAL( 1 ), + $ NSVAL( 1 ), MXBVAL( 1 ) + IF( NBVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( 1 ), 1 + FATAL = .TRUE. 
+ ELSE IF( NBMIN( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( 1 ), 1 + FATAL = .TRUE. + ELSE IF( NXVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( 1 ), 1 + FATAL = .TRUE. + ELSE IF( NSVAL( 1 ).LT.2 ) THEN + WRITE( NOUT, FMT = 9989 )' NS ', NSVAL( 1 ), 2 + FATAL = .TRUE. + ELSE IF( MXBVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' MAXB ', MXBVAL( 1 ), 1 + FATAL = .TRUE. + END IF + CALL XLAENV( 1, NBVAL( 1 ) ) + CALL XLAENV( 2, NBMIN( 1 ) ) + CALL XLAENV( 3, NXVAL( 1 ) ) + CALL XLAENV( 4, NSVAL( 1 ) ) + CALL XLAENV( 8, MXBVAL( 1 ) ) + WRITE( NOUT, FMT = 9983 )'NB: ', NBVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'NBMIN:', NBMIN( 1 ) + WRITE( NOUT, FMT = 9983 )'NX: ', NXVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'NS: ', NSVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'MAXB: ', MXBVAL( 1 ) +* + ELSE IF( .NOT.SSB .AND. .NOT.GLM .AND. .NOT.GQR .AND. .NOT. + $ GSV .AND. .NOT.CSD .AND. .NOT.LSE ) THEN +* +* For the other paths, the number of parameters can be varied +* from the input file. Read the number of parameter values. +* + READ( NIN, FMT = * )NPARMS + IF( NPARMS.LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )'NPARMS', NPARMS, 1 + NPARMS = 0 + FATAL = .TRUE. + ELSE IF( NPARMS.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9988 )'NPARMS', NPARMS, MAXIN + NPARMS = 0 + FATAL = .TRUE. + END IF +* +* Read the values of NB +* + IF( .NOT.SBB ) THEN + READ( NIN, FMT = * )( NBVAL( I ), I = 1, NPARMS ) + DO 70 I = 1, NPARMS + IF( NBVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NBVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' NB ', NBVAL( I ), NMAX + FATAL = .TRUE. + END IF + 70 CONTINUE + WRITE( NOUT, FMT = 9983 )'NB: ', + $ ( NBVAL( I ), I = 1, NPARMS ) + END IF +* +* Read the values of NBMIN +* + IF( NEP .OR. SEP .OR. SVD .OR. SGG ) THEN + READ( NIN, FMT = * )( NBMIN( I ), I = 1, NPARMS ) + DO 80 I = 1, NPARMS + IF( NBMIN( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( I ), 0 + FATAL = .TRUE. + ELSE IF( NBMIN( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )'NBMIN ', NBMIN( I ), NMAX + FATAL = .TRUE. + END IF + 80 CONTINUE + WRITE( NOUT, FMT = 9983 )'NBMIN:', + $ ( NBMIN( I ), I = 1, NPARMS ) + ELSE + DO 90 I = 1, NPARMS + NBMIN( I ) = 1 + 90 CONTINUE + END IF +* +* Read the values of NX +* + IF( NEP .OR. SEP .OR. SVD ) THEN + READ( NIN, FMT = * )( NXVAL( I ), I = 1, NPARMS ) + DO 100 I = 1, NPARMS + IF( NXVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NXVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' NX ', NXVAL( I ), NMAX + FATAL = .TRUE. + END IF + 100 CONTINUE + WRITE( NOUT, FMT = 9983 )'NX: ', + $ ( NXVAL( I ), I = 1, NPARMS ) + ELSE + DO 110 I = 1, NPARMS + NXVAL( I ) = 1 + 110 CONTINUE + END IF +* +* Read the values of NSHIFT (if SGG) or NRHS (if SVD +* or SBB). +* + IF( SVD .OR. SBB .OR. SGG ) THEN + READ( NIN, FMT = * )( NSVAL( I ), I = 1, NPARMS ) + DO 120 I = 1, NPARMS + IF( NSVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' NS ', NSVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NSVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' NS ', NSVAL( I ), NMAX + FATAL = .TRUE. + END IF + 120 CONTINUE + WRITE( NOUT, FMT = 9983 )'NS: ', + $ ( NSVAL( I ), I = 1, NPARMS ) + ELSE + DO 130 I = 1, NPARMS + NSVAL( I ) = 1 + 130 CONTINUE + END IF +* +* Read the values for MAXB. +* + IF( SGG ) THEN + READ( NIN, FMT = * )( MXBVAL( I ), I = 1, NPARMS ) + DO 140 I = 1, NPARMS + IF( MXBVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' MAXB ', MXBVAL( I ), 0 + FATAL = .TRUE. 
+ ELSE IF( MXBVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' MAXB ', MXBVAL( I ), NMAX + FATAL = .TRUE. + END IF + 140 CONTINUE + WRITE( NOUT, FMT = 9983 )'MAXB: ', + $ ( MXBVAL( I ), I = 1, NPARMS ) + ELSE + DO 150 I = 1, NPARMS + MXBVAL( I ) = 1 + 150 CONTINUE + END IF +* +* Read the values for INMIN. +* + IF( NEP ) THEN + READ( NIN, FMT = * )( INMIN( I ), I = 1, NPARMS ) + DO 540 I = 1, NPARMS + IF( INMIN( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' INMIN ', INMIN( I ), 0 + FATAL = .TRUE. + END IF + 540 CONTINUE + WRITE( NOUT, FMT = 9983 )'INMIN: ', + $ ( INMIN( I ), I = 1, NPARMS ) + ELSE + DO 550 I = 1, NPARMS + INMIN( I ) = 1 + 550 CONTINUE + END IF +* +* Read the values for INWIN. +* + IF( NEP ) THEN + READ( NIN, FMT = * )( INWIN( I ), I = 1, NPARMS ) + DO 560 I = 1, NPARMS + IF( INWIN( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' INWIN ', INWIN( I ), 0 + FATAL = .TRUE. + END IF + 560 CONTINUE + WRITE( NOUT, FMT = 9983 )'INWIN: ', + $ ( INWIN( I ), I = 1, NPARMS ) + ELSE + DO 570 I = 1, NPARMS + INWIN( I ) = 1 + 570 CONTINUE + END IF +* +* Read the values for INIBL. +* + IF( NEP ) THEN + READ( NIN, FMT = * )( INIBL( I ), I = 1, NPARMS ) + DO 580 I = 1, NPARMS + IF( INIBL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' INIBL ', INIBL( I ), 0 + FATAL = .TRUE. + END IF + 580 CONTINUE + WRITE( NOUT, FMT = 9983 )'INIBL: ', + $ ( INIBL( I ), I = 1, NPARMS ) + ELSE + DO 590 I = 1, NPARMS + INIBL( I ) = 1 + 590 CONTINUE + END IF +* +* Read the values for ISHFTS. +* + IF( NEP ) THEN + READ( NIN, FMT = * )( ISHFTS( I ), I = 1, NPARMS ) + DO 600 I = 1, NPARMS + IF( ISHFTS( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' ISHFTS ', ISHFTS( I ), 0 + FATAL = .TRUE. + END IF + 600 CONTINUE + WRITE( NOUT, FMT = 9983 )'ISHFTS: ', + $ ( ISHFTS( I ), I = 1, NPARMS ) + ELSE + DO 610 I = 1, NPARMS + ISHFTS( I ) = 1 + 610 CONTINUE + END IF +* +* Read the values for IACC22. +* + IF( NEP .OR. SGG ) THEN + READ( NIN, FMT = * )( IACC22( I ), I = 1, NPARMS ) + DO 620 I = 1, NPARMS + IF( IACC22( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' IACC22 ', IACC22( I ), 0 + FATAL = .TRUE. + END IF + 620 CONTINUE + WRITE( NOUT, FMT = 9983 )'IACC22: ', + $ ( IACC22( I ), I = 1, NPARMS ) + ELSE + DO 630 I = 1, NPARMS + IACC22( I ) = 1 + 630 CONTINUE + END IF +* +* Read the values for NBCOL. +* + IF( SGG ) THEN + READ( NIN, FMT = * )( NBCOL( I ), I = 1, NPARMS ) + DO 160 I = 1, NPARMS + IF( NBCOL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )'NBCOL ', NBCOL( I ), 0 + FATAL = .TRUE. + ELSE IF( NBCOL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )'NBCOL ', NBCOL( I ), NMAX + FATAL = .TRUE. + END IF + 160 CONTINUE + WRITE( NOUT, FMT = 9983 )'NBCOL:', + $ ( NBCOL( I ), I = 1, NPARMS ) + ELSE + DO 170 I = 1, NPARMS + NBCOL( I ) = 1 + 170 CONTINUE + END IF + END IF +* +* Calculate and print the machine dependent constants. +* + WRITE( NOUT, FMT = * ) + EPS = SLAMCH( 'Underflow threshold' ) + WRITE( NOUT, FMT = 9981 )'underflow', EPS + EPS = SLAMCH( 'Overflow threshold' ) + WRITE( NOUT, FMT = 9981 )'overflow ', EPS + EPS = SLAMCH( 'Epsilon' ) + WRITE( NOUT, FMT = 9981 )'precision', EPS +* +* Read the threshold value for the test ratios. +* + READ( NIN, FMT = * )THRESH + WRITE( NOUT, FMT = 9982 )THRESH + IF( SEP .OR. SVD .OR. SGG ) THEN +* +* Read the flag that indicates whether to test LAPACK routines. +* + READ( NIN, FMT = * )TSTCHK +* +* Read the flag that indicates whether to test driver routines. +* + READ( NIN, FMT = * )TSTDRV + END IF +* +* Read the flag that indicates whether to test the error exits. 
+* + READ( NIN, FMT = * )TSTERR +* +* Read the code describing how to set the random number seed. +* + READ( NIN, FMT = * )NEWSD +* +* If NEWSD = 2, read another line with 4 integers for the seed. +* + IF( NEWSD.EQ.2 ) + $ READ( NIN, FMT = * )( IOLDSD( I ), I = 1, 4 ) +* + DO 180 I = 1, 4 + ISEED( I ) = IOLDSD( I ) + 180 CONTINUE +* + IF( FATAL ) THEN + WRITE( NOUT, FMT = 9999 ) + STOP + END IF +* +* Read the input lines indicating the test path and its parameters. +* The first three characters indicate the test path, and the number +* of test matrix types must be the first nonblank item in columns +* 4-80. +* + 190 CONTINUE +* + IF( .NOT.( SGX .OR. SXV ) ) THEN +* + 200 CONTINUE + READ( NIN, FMT = '(A80)', END = 380 )LINE + C3 = LINE( 1: 3 ) + LENP = LEN( LINE ) + I = 3 + ITMP = 0 + I1 = 0 + 210 CONTINUE + I = I + 1 + IF( I.GT.LENP ) THEN + IF( I1.GT.0 ) THEN + GO TO 240 + ELSE + NTYPES = MAXT + GO TO 240 + END IF + END IF + IF( LINE( I: I ).NE.' ' .AND. LINE( I: I ).NE.',' ) THEN + I1 = I + C1 = LINE( I1: I1 ) +* +* Check that a valid integer was read +* + DO 220 K = 1, 10 + IF( C1.EQ.INTSTR( K: K ) ) THEN + IC = K - 1 + GO TO 230 + END IF + 220 CONTINUE + WRITE( NOUT, FMT = 9991 )I, LINE + GO TO 200 + 230 CONTINUE + ITMP = 10*ITMP + IC + GO TO 210 + ELSE IF( I1.GT.0 ) THEN + GO TO 240 + ELSE + GO TO 210 + END IF + 240 CONTINUE + NTYPES = ITMP +* +* Skip the tests if NTYPES is <= 0. +* + IF( .NOT.( SEV .OR. SES .OR. SVX .OR. SSX .OR. SGV .OR. + $ SGS ) .AND. NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + GO TO 200 + END IF +* + ELSE + IF( SXV ) + $ C3 = 'SXV' + IF( SGX ) + $ C3 = 'SGX' + END IF +* +* Reset the random number seed. +* + IF( NEWSD.EQ.0 ) THEN + DO 250 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 250 CONTINUE + END IF +* + IF( LSAMEN( 3, C3, 'SHS' ) .OR. LSAMEN( 3, C3, 'NEP' ) ) THEN +* +* ------------------------------------- +* NEP: Nonsymmetric Eigenvalue Problem +* ------------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NX = crossover point +* NS = number of shifts +* MAXB = minimum submatrix size +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 1, 1 ) + IF( TSTERR ) + $ CALL SERRHS( 'SHSEQR', NOUT ) + DO 270 I = 1, NPARMS + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 3, NXVAL( I ) ) + CALL XLAENV(12, MAX( 11, INMIN( I ) ) ) + CALL XLAENV(13, INWIN( I ) ) + CALL XLAENV(14, INIBL( I ) ) + CALL XLAENV(15, ISHFTS( I ) ) + CALL XLAENV(16, IACC22( I ) ) +* + IF( NEWSD.EQ.0 ) THEN + DO 260 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 260 CONTINUE + END IF + WRITE( NOUT, FMT = 9961 )C3, NBVAL( I ), NBMIN( I ), + $ NXVAL( I ), MAX( 11, INMIN(I)), + $ INWIN( I ), INIBL( I ), ISHFTS( I ), IACC22( I ) + CALL SCHKHS( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 5 ), NMAX, A( 1, 6 ), + $ A( 1, 7 ), D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), + $ D( 1, 4 ), D( 1, 5 ), D( 1, 6 ), A( 1, 8 ), + $ A( 1, 9 ), A( 1, 10 ), A( 1, 11 ), A( 1, 12 ), + $ D( 1, 7 ), WORK, LWORK, IWORK, LOGWRK, RESULT, + $ INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SCHKHS', INFO + 270 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'SST' ) .OR. LSAMEN( 3, C3, 'SEP' ) + $ .OR. 
LSAMEN( 3, C3, 'SE2' ) ) THEN +* +* ---------------------------------- +* SEP: Symmetric Eigenvalue Problem +* ---------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NX = crossover point +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 1, 1 ) + CALL XLAENV( 9, 25 ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL SERRST( 'SST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF + DO 290 I = 1, NPARMS + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 3, NXVAL( I ) ) +* + IF( NEWSD.EQ.0 ) THEN + DO 280 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 280 CONTINUE + END IF + WRITE( NOUT, FMT = 9997 )C3, NBVAL( I ), NBMIN( I ), + $ NXVAL( I ) + IF( TSTCHK ) THEN + IF( LSAMEN( 3, C3, 'SE2' ) ) THEN + CALL SCHKST2STG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), D( 1, 1 ), + $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), + $ D( 1, 6 ), D( 1, 7 ), D( 1, 8 ), D( 1, 9 ), + $ D( 1, 10 ), D( 1, 11 ), A( 1, 3 ), NMAX, + $ A( 1, 4 ), A( 1, 5 ), D( 1, 12 ), A( 1, 6 ), + $ WORK, LWORK, IWORK, LIWORK, RESULT, INFO ) + ELSE + CALL SCHKST( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), D( 1, 1 ), + $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), + $ D( 1, 6 ), D( 1, 7 ), D( 1, 8 ), D( 1, 9 ), + $ D( 1, 10 ), D( 1, 11 ), A( 1, 3 ), NMAX, + $ A( 1, 4 ), A( 1, 5 ), D( 1, 12 ), A( 1, 6 ), + $ WORK, LWORK, IWORK, LIWORK, RESULT, INFO ) + ENDIF + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SCHKST', INFO + END IF + IF( TSTDRV ) THEN + IF( LSAMEN( 3, C3, 'SE2' ) ) THEN + CALL SDRVST2STG( NN, NVAL, 18, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 1 ), NMAX, D( 1, 3 ), D( 1, 4 ), + $ D( 1, 5 ), D( 1, 6 ), D( 1, 8 ), D( 1, 9 ), + $ D( 1, 10 ), D( 1, 11), A( 1, 2 ), NMAX, + $ A( 1, 3 ), D( 1, 12 ), A( 1, 4 ), WORK, + $ LWORK, IWORK, LIWORK, RESULT, INFO ) + ELSE + CALL SDRVST( NN, NVAL, 18, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 1 ), NMAX, D( 1, 3 ), D( 1, 4 ), + $ D( 1, 5 ), D( 1, 6 ), D( 1, 8 ), D( 1, 9 ), + $ D( 1, 10 ), D( 1, 11), A( 1, 2 ), NMAX, + $ A( 1, 3 ), D( 1, 12 ), A( 1, 4 ), WORK, + $ LWORK, IWORK, LIWORK, RESULT, INFO ) + ENDIF + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SDRVST', INFO + END IF + 290 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'SSG' ) ) THEN +* +* ---------------------------------------------- +* SSG: Symmetric Generalized Eigenvalue Problem +* ---------------------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NX = crossover point +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 9, 25 ) + DO 310 I = 1, NPARMS + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 3, NXVAL( I ) ) +* + IF( NEWSD.EQ.0 ) THEN + DO 300 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 300 CONTINUE + END IF + WRITE( NOUT, FMT = 9997 )C3, NBVAL( I ), NBMIN( I ), + $ NXVAL( I ) + IF( TSTCHK ) THEN +* CALL SDRVSG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, +* $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, +* $ D( 1, 3 ), A( 1, 3 ), NMAX, A( 1, 4 ), +* $ A( 1, 5 ), A( 1, 6 ), A( 1, 7 ), WORK, +* $ LWORK, IWORK, LIWORK, RESULT, INFO ) + CALL SDRVSG2STG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, + $ D( 1, 3 ), D( 1, 3 ), A( 1, 3 ), NMAX, + $ A( 1, 
4 ), A( 1, 5 ), A( 1, 6 ), + $ A( 1, 7 ), WORK, LWORK, IWORK, LIWORK, + $ RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SDRVSG', INFO + END IF + 310 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'SBD' ) .OR. LSAMEN( 3, C3, 'SVD' ) ) THEN +* +* ---------------------------------- +* SVD: Singular Value Decomposition +* ---------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NX = crossover point +* NRHS = number of right hand sides +* + MAXTYP = 16 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 1, 1 ) + CALL XLAENV( 9, 25 ) +* +* Test the error exits +* + IF( TSTERR .AND. TSTCHK ) + $ CALL SERRBD( 'SBD', NOUT ) + IF( TSTERR .AND. TSTDRV ) + $ CALL SERRED( 'SBD', NOUT ) +* + DO 330 I = 1, NPARMS + NRHS = NSVAL( I ) + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 3, NXVAL( I ) ) + IF( NEWSD.EQ.0 ) THEN + DO 320 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 320 CONTINUE + END IF + WRITE( NOUT, FMT = 9995 )C3, NBVAL( I ), NBMIN( I ), + $ NXVAL( I ), NRHS + IF( TSTCHK ) THEN + CALL SCHKBD( NN, MVAL, NVAL, MAXTYP, DOTYPE, NRHS, ISEED, + $ THRESH, A( 1, 1 ), NMAX, D( 1, 1 ), + $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), A( 1, 2 ), + $ NMAX, A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), NMAX, + $ A( 1, 6 ), NMAX, A( 1, 7 ), A( 1, 8 ), WORK, + $ LWORK, IWORK, NOUT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SCHKBD', INFO + END IF + IF( TSTDRV ) + $ CALL SDRVBD( NN, MVAL, NVAL, MAXTYP, DOTYPE, ISEED, + $ THRESH, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, + $ A( 1, 3 ), NMAX, A( 1, 4 ), A( 1, 5 ), + $ A( 1, 6 ), D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), + $ WORK, LWORK, IWORK, NOUT, INFO ) + 330 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'SEV' ) ) THEN +* +* -------------------------------------------- +* SEV: Nonsymmetric Eigenvalue Problem Driver +* SGEEV (eigenvalues and eigenvectors) +* -------------------------------------------- +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL SERRED( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL SDRVEV( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), D( 1, 1 ), + $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), A( 1, 3 ), + $ NMAX, A( 1, 4 ), NMAX, A( 1, 5 ), NMAX, RESULT, + $ WORK, LWORK, IWORK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SGEEV', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'SES' ) ) THEN +* +* -------------------------------------------- +* SES: Nonsymmetric Eigenvalue Problem Driver +* SGEES (Schur form) +* -------------------------------------------- +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL SERRED( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL SDRVES( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), + $ A( 1, 4 ), NMAX, RESULT, WORK, LWORK, IWORK, + $ LOGWRK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SGEES', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'SVX' ) ) THEN +* +* -------------------------------------------------------------- +* SVX: Nonsymmetric Eigenvalue Problem Expert Driver +* SGEEVX (eigenvalues, eigenvectors and condition numbers) +* 
-------------------------------------------------------------- +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LT.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL SERRED( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL SDRVVX( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NIN, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), D( 1, 1 ), + $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), A( 1, 3 ), + $ NMAX, A( 1, 4 ), NMAX, A( 1, 5 ), NMAX, + $ D( 1, 5 ), D( 1, 6 ), D( 1, 7 ), D( 1, 8 ), + $ D( 1, 9 ), D( 1, 10 ), D( 1, 11 ), D( 1, 12 ), + $ RESULT, WORK, LWORK, IWORK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SGEEVX', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'SSX' ) ) THEN +* +* --------------------------------------------------- +* SSX: Nonsymmetric Eigenvalue Problem Expert Driver +* SGEESX (Schur form and condition numbers) +* --------------------------------------------------- +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LT.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL SERRED( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL SDRVSX( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NIN, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), + $ D( 1, 5 ), D( 1, 6 ), A( 1, 4 ), NMAX, + $ A( 1, 5 ), RESULT, WORK, LWORK, IWORK, LOGWRK, + $ INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SGEESX', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'SGG' ) ) THEN +* +* ------------------------------------------------- +* SGG: Generalized Nonsymmetric Eigenvalue Problem +* ------------------------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NS = number of shifts +* MAXB = minimum submatrix size +* IACC22: structured matrix multiply +* NBCOL = minimum column dimension for blocks +* + MAXTYP = 26 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV(1,1) + IF( TSTCHK .AND. TSTERR ) + & CALL SERRGG( C3, NOUT ) + DO 350 I = 1, NPARMS + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 4, NSVAL( I ) ) + CALL XLAENV( 8, MXBVAL( I ) ) + CALL XLAENV( 16, IACC22( I ) ) + CALL XLAENV( 5, NBCOL( I ) ) +* + IF( NEWSD.EQ.0 ) THEN + DO 340 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 340 CONTINUE + END IF + WRITE( NOUT, FMT = 9996 )C3, NBVAL( I ), NBMIN( I ), + $ NSVAL( I ), MXBVAL( I ), IACC22( I ), NBCOL( I ) + TSTDIF = .FALSE. + THRSHN = 10. 
+ IF( TSTCHK ) THEN + CALL SCHKGG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, + $ TSTDIF, THRSHN, NOUT, A( 1, 1 ), NMAX, + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ A( 1, 6 ), A( 1, 7 ), A( 1, 8 ), A( 1, 9 ), + $ NMAX, A( 1, 10 ), A( 1, 11 ), A( 1, 12 ), + $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), + $ D( 1, 5 ), D( 1, 6 ), A( 1, 13 ), + $ A( 1, 14 ), WORK, LWORK, LOGWRK, RESULT, + $ INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SCHKGG', INFO + END IF + 350 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'SGS' ) ) THEN +* +* ------------------------------------------------- +* SGS: Generalized Nonsymmetric Eigenvalue Problem +* SGGES (Schur form) +* ------------------------------------------------- +* + MAXTYP = 26 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL SERRGG( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL SDRGES( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), + $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), WORK, LWORK, + $ RESULT, LOGWRK, INFO ) +* + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SDRGES', INFO +* +* Blocked version +* + CALL XLAENV(16,1) + CALL SDRGES3( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), + $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), WORK, LWORK, + $ RESULT, LOGWRK, INFO ) +* + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SDRGES3', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( SGX ) THEN +* +* ------------------------------------------------- +* SGX: Generalized Nonsymmetric Eigenvalue Problem +* SGGESX (Schur form and condition numbers) +* ------------------------------------------------- +* + MAXTYP = 5 + NTYPES = MAXTYP + IF( NN.LT.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL SERRGG( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 5, 2 ) + CALL SDRGSX( NN, NCMAX, THRESH, NIN, NOUT, A( 1, 1 ), NMAX, + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ A( 1, 6 ), D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), + $ C( 1, 1 ), NCMAX*NCMAX, A( 1, 12 ), WORK, + $ LWORK, IWORK, LIWORK, LOGWRK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SDRGSX', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'SGV' ) ) THEN +* +* ------------------------------------------------- +* SGV: Generalized Nonsymmetric Eigenvalue Problem +* SGGEV (Eigenvalue/vector form) +* ------------------------------------------------- +* + MAXTYP = 26 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL SERRGG( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL SDRGEV( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), + $ A( 1, 9 ), NMAX, D( 1, 1 ), D( 1, 2 ), + $ D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), D( 1, 6 ), + $ WORK, LWORK, RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SDRGEV', INFO +* +* Blocked version +* + CALL SDRGEV3( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), + $ A( 1, 9 ), NMAX, D( 1, 1 ), D( 1, 2 ), + $ D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), D( 1, 6 ), + $ WORK, LWORK, RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SDRGEV3', INFO + END IF + 
WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( SXV ) THEN +* +* ------------------------------------------------- +* SXV: Generalized Nonsymmetric Eigenvalue Problem +* SGGEVX (eigenvalue/vector with condition numbers) +* ------------------------------------------------- +* + MAXTYP = 2 + NTYPES = MAXTYP + IF( NN.LT.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL SERRGG( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL SDRGVX( NN, THRESH, NIN, NOUT, A( 1, 1 ), NMAX, + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), D( 1, 1 ), + $ D( 1, 2 ), D( 1, 3 ), A( 1, 5 ), A( 1, 6 ), + $ IWORK( 1 ), IWORK( 2 ), D( 1, 4 ), D( 1, 5 ), + $ D( 1, 6 ), D( 1, 7 ), D( 1, 8 ), D( 1, 9 ), + $ WORK, LWORK, IWORK( 3 ), LIWORK-2, RESULT, + $ LOGWRK, INFO ) +* + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SDRGVX', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'SSB' ) ) THEN +* +* ------------------------------ +* SSB: Symmetric Band Reduction +* ------------------------------ +* + MAXTYP = 15 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + IF( TSTERR ) + $ CALL SERRST( 'SSB', NOUT ) +* CALL SCHKSB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, +* $ NOUT, A( 1, 1 ), NMAX, D( 1, 1 ), D( 1, 2 ), +* $ A( 1, 2 ), NMAX, WORK, LWORK, RESULT, INFO ) + CALL SCHKSB2STG( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, + $ THRESH, NOUT, A( 1, 1 ), NMAX, D( 1, 1 ), + $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), + $ A( 1, 2 ), NMAX, WORK, LWORK, RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SCHKSB', INFO +* + ELSE IF( LSAMEN( 3, C3, 'SBB' ) ) THEN +* +* ------------------------------ +* SBB: General Band Reduction +* ------------------------------ +* + MAXTYP = 15 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + DO 370 I = 1, NPARMS + NRHS = NSVAL( I ) +* + IF( NEWSD.EQ.0 ) THEN + DO 360 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 360 CONTINUE + END IF + WRITE( NOUT, FMT = 9966 )C3, NRHS + CALL SCHKBB( NN, MVAL, NVAL, NK, KVAL, MAXTYP, DOTYPE, NRHS, + $ ISEED, THRESH, NOUT, A( 1, 1 ), NMAX, + $ A( 1, 2 ), 2*NMAX, D( 1, 1 ), D( 1, 2 ), + $ A( 1, 4 ), NMAX, A( 1, 5 ), NMAX, A( 1, 6 ), + $ NMAX, A( 1, 7 ), WORK, LWORK, RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SCHKBB', INFO + 370 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'GLM' ) ) THEN +* +* ----------------------------------------- +* GLM: Generalized Linear Regression Model +* ----------------------------------------- +* + CALL XLAENV( 1, 1 ) + IF( TSTERR ) + $ CALL SERRGG( 'GLM', NOUT ) + CALL SCKGLM( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), X, + $ WORK, D( 1, 1 ), NIN, NOUT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SCKGLM', INFO +* + ELSE IF( LSAMEN( 3, C3, 'GQR' ) ) THEN +* +* ------------------------------------------ +* GQR: Generalized QR and RQ factorizations +* ------------------------------------------ +* + CALL XLAENV( 1, 1 ) + IF( TSTERR ) + $ CALL SERRGG( 'GQR', NOUT ) + CALL SCKGQR( NN, MVAL, NN, PVAL, NN, NVAL, NTYPES, ISEED, + $ THRESH, NMAX, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), TAUA, B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ B( 1, 4 ), B( 1, 5 ), TAUB, WORK, D( 1, 1 ), NIN, + $ NOUT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SCKGQR', INFO +* + ELSE IF( LSAMEN( 3, C3, 'GSV' ) ) THEN +* +* ---------------------------------------------- +* GSV: Generalized Singular Value Decomposition +* 
---------------------------------------------- +* + CALL XLAENV( 1, 1 ) + IF( TSTERR ) + $ CALL SERRGG( 'GSV', NOUT ) + CALL SCKGSV( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ A( 1, 3 ), B( 1, 3 ), A( 1, 4 ), TAUA, TAUB, + $ B( 1, 4 ), IWORK, WORK, D( 1, 1 ), NIN, NOUT, + $ INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SCKGSV', INFO +* + ELSE IF( LSAMEN( 3, C3, 'CSD' ) ) THEN +* +* ---------------------------------------------- +* CSD: CS Decomposition +* ---------------------------------------------- +* + CALL XLAENV(1,1) + IF( TSTERR ) + $ CALL SERRGG( 'CSD', NOUT ) + CALL SCKCSD( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), + $ A( 1, 5 ), A( 1, 6 ), A( 1, 7 ), IWORK, WORK, + $ D( 1, 1 ), NIN, NOUT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SCKCSD', INFO +* + ELSE IF( LSAMEN( 3, C3, 'LSE' ) ) THEN +* +* -------------------------------------- +* LSE: Constrained Linear Least Squares +* -------------------------------------- +* + CALL XLAENV( 1, 1 ) + IF( TSTERR ) + $ CALL SERRGG( 'LSE', NOUT ) + CALL SCKLSE( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), X, + $ WORK, D( 1, 1 ), NIN, NOUT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'SCKLSE', INFO +* + ELSE + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9992 )C3 + END IF + IF( .NOT.( SGX .OR. SXV ) ) + $ GO TO 190 + 380 CONTINUE + WRITE( NOUT, FMT = 9994 ) + S2 = SECOND( ) + WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) +* + 9999 FORMAT( / ' Execution not attempted due to input errors' ) + 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) + 9996 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NS =', I4, + $ ', MAXB =', I4, ', IACC22 =', I4, ', NBCOL =', I4 ) + 9995 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4, + $ ', NRHS =', I4 ) + 9994 FORMAT( / / ' End of tests' ) + 9993 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) + 9992 FORMAT( 1X, A3, ': Unrecognized path name' ) + 9991 FORMAT( / / ' *** Invalid integer value in column ', I2, + $ ' of input', ' line:', / A79 ) + 9990 FORMAT( / / 1X, A3, ' routines were not tested' ) + 9989 FORMAT( ' Invalid input value: ', A, '=', I6, '; must be >=', + $ I6 ) + 9988 FORMAT( ' Invalid input value: ', A, '=', I6, '; must be <=', + $ I6 ) + 9987 FORMAT( ' Tests of the Nonsymmetric Eigenvalue Problem routines' ) + 9986 FORMAT( ' Tests of the Symmetric Eigenvalue Problem routines' ) + 9985 FORMAT( ' Tests of the Singular Value Decomposition routines' ) + 9984 FORMAT( / ' The following parameter values will be used:' ) + 9983 FORMAT( 4X, A, 10I6, / 10X, 10I6 ) + 9982 FORMAT( / ' Routines pass computational tests if test ratio is ', + $ 'less than', F8.2, / ) + 9981 FORMAT( ' Relative machine ', A, ' is taken to be', E16.6 ) + 9980 FORMAT( ' *** Error code from ', A, ' = ', I4 ) + 9979 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Driver', + $ / ' SGEEV (eigenvalues and eigevectors)' ) + 9978 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Driver', + $ / ' SGEES (Schur form)' ) + 9977 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Expert', + $ ' Driver', / ' SGEEVX (eigenvalues, eigenvectors and', + $ ' condition numbers)' ) + 9976 FORMAT( / ' Tests of the 
Nonsymmetric Eigenvalue Problem Expert', + $ ' Driver', / ' SGEESX (Schur form and condition', + $ ' numbers)' ) + 9975 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem routines' ) + 9974 FORMAT( ' Tests of SSBTRD', / ' (reduction of a symmetric band ', + $ 'matrix to tridiagonal form)' ) + 9973 FORMAT( / 1X, 71( '-' ) ) + 9972 FORMAT( / ' LAPACK VERSION ', I1, '.', I1, '.', I1 ) + 9971 FORMAT( / ' Tests of the Generalized Linear Regression Model ', + $ 'routines' ) + 9970 FORMAT( / ' Tests of the Generalized QR and RQ routines' ) + 9969 FORMAT( / ' Tests of the Generalized Singular Value', + $ ' Decomposition routines' ) + 9968 FORMAT( / ' Tests of the Linear Least Squares routines' ) + 9967 FORMAT( ' Tests of SGBBRD', / ' (reduction of a general band ', + $ 'matrix to real bidiagonal form)' ) + 9966 FORMAT( / / 1X, A3, ': NRHS =', I4 ) + 9965 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem Expert Driver SGGESX' ) + 9964 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem Driver SGGES' ) + 9963 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem Driver SGGEV' ) + 9962 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem Expert Driver SGGEVX' ) + 9961 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4, + $ ', INMIN=', I4, + $ ', INWIN =', I4, ', INIBL =', I4, ', ISHFTS =', I4, + $ ', IACC22 =', I4) + 9960 FORMAT( / ' Tests of the CS Decomposition routines' ) +* +* End of SCHKEE +* + END From 9b7b1da133a6c9c6d77d36dc37247044551ccd75 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Feb 2021 18:50:26 +0100 Subject: [PATCH 128/681] Add rewritten dchkee.F from Reference-LAPACK PR335 --- lapack-netlib/TESTING/EIG/dchkee.F | 2538 ++++++++++++++++++++++++++++ 1 file changed, 2538 insertions(+) create mode 100644 lapack-netlib/TESTING/EIG/dchkee.F diff --git a/lapack-netlib/TESTING/EIG/dchkee.F b/lapack-netlib/TESTING/EIG/dchkee.F new file mode 100644 index 000000000..ee22ce33d --- /dev/null +++ b/lapack-netlib/TESTING/EIG/dchkee.F @@ -0,0 +1,2538 @@ +*> \brief \b DCHKEE +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM DCHKEE +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DCHKEE tests the DOUBLE PRECISION LAPACK subroutines for the matrix +*> eigenvalue problem. 
The test paths in this version are
+*>
+*> NEP (Nonsymmetric Eigenvalue Problem):
+*>     Test DGEHRD, DORGHR, DHSEQR, DTREVC, DHSEIN, and DORMHR
+*>
+*> SEP (Symmetric Eigenvalue Problem):
+*>     Test DSYTRD, DORGTR, DSTEQR, DSTERF, DSTEIN, DSTEDC,
+*>     and drivers DSYEV(X), DSBEV(X), DSPEV(X), DSTEV(X),
+*>     DSYEVD, DSBEVD, DSPEVD, DSTEVD
+*>
+*> SVD (Singular Value Decomposition):
+*>     Test DGEBRD, DORGBR, DBDSQR, DBDSDC
+*>     and the drivers DGESVD, DGESDD
+*>
+*> DEV (Nonsymmetric Eigenvalue/eigenvector Driver):
+*>     Test DGEEV
+*>
+*> DES (Nonsymmetric Schur form Driver):
+*>     Test DGEES
+*>
+*> DVX (Nonsymmetric Eigenvalue/eigenvector Expert Driver):
+*>     Test DGEEVX
+*>
+*> DSX (Nonsymmetric Schur form Expert Driver):
+*>     Test DGEESX
+*>
+*> DGG (Generalized Nonsymmetric Eigenvalue Problem):
+*>     Test DGGHD3, DGGBAL, DGGBAK, DHGEQZ, and DTGEVC
+*>
+*> DGS (Generalized Nonsymmetric Schur form Driver):
+*>     Test DGGES
+*>
+*> DGV (Generalized Nonsymmetric Eigenvalue/eigenvector Driver):
+*>     Test DGGEV
+*>
+*> DGX (Generalized Nonsymmetric Schur form Expert Driver):
+*>     Test DGGESX
+*>
+*> DXV (Generalized Nonsymmetric Eigenvalue/eigenvector Expert Driver):
+*>     Test DGGEVX
+*>
+*> DSG (Symmetric Generalized Eigenvalue Problem):
+*>     Test DSYGST, DSYGV, DSYGVD, DSYGVX, DSPGST, DSPGV, DSPGVD,
+*>     DSPGVX, DSBGST, DSBGV, DSBGVD, and DSBGVX
+*>
+*> DSB (Symmetric Band Eigenvalue Problem):
+*>     Test DSBTRD
+*>
+*> DBB (Band Singular Value Decomposition):
+*>     Test DGBBRD
+*>
+*> DEC (Eigencondition estimation):
+*>     Test DLALN2, DLASY2, DLANV2, DLAEXC, DTRSYL, DTREXC, DTRSNA,
+*>     DTRSEN, and DLAQTR
+*>
+*> DBL (Balancing a general matrix)
+*>     Test DGEBAL
+*>
+*> DBK (Back transformation on a balanced matrix)
+*>     Test DGEBAK
+*>
+*> DGL (Balancing a matrix pair)
+*>     Test DGGBAL
+*>
+*> DGK (Back transformation on a matrix pair)
+*>     Test DGGBAK
+*>
+*> GLM (Generalized Linear Regression Model):
+*>     Tests DGGGLM
+*>
+*> GQR (Generalized QR and RQ factorizations):
+*>     Tests DGGQRF and DGGRQF
+*>
+*> GSV (Generalized Singular Value Decomposition):
+*>     Tests DGGSVD, DGGSVP, DTGSJA, DLAGS2, DLAPLL, and DLAPMT
+*>
+*> CSD (CS decomposition):
+*>     Tests DORCSD
+*>
+*> LSE (Constrained Linear Least Squares):
+*>     Tests DGGLSE
+*>
+*> Each test path has a different set of inputs, but the data sets for
+*> the driver routines xEV, xES, xVX, and xSX can be concatenated in a
+*> single input file.  The first line of input should contain one of the
+*> 3-character path names in columns 1-3.  The number of remaining lines
+*> depends on what is found on the first line.
+*>
+*> The number of matrix types used in testing is often controllable from
+*> the input file.  The number of matrix types for each path, and the
+*> test routine that describes them, is as follows:
+*>
+*> Path name(s)  Types    Test routine
+*>
+*> DHS or NEP      21     DCHKHS
+*> DST or SEP      21     DCHKST (routines)
+*>                 18     DDRVST (drivers)
+*> DBD or SVD      16     DCHKBD (routines)
+*>                  5     DDRVBD (drivers)
+*> DEV             21     DDRVEV
+*> DES             21     DDRVES
+*> DVX             21     DDRVVX
+*> DSX             21     DDRVSX
+*> DGG             26     DCHKGG (routines)
+*> DGS             26     DDRGES
+*> DGX              5     DDRGSX
+*> DGV             26     DDRGEV
+*> DXV              2     DDRGVX
+*> DSG             21     DDRVSG
+*> DSB             15     DCHKSB
+*> DBB             15     DCHKBB
+*> DEC              -     DCHKEC
+*> DBL              -     DCHKBL
+*> DBK              -     DCHKBK
+*> DGL              -     DCHKGL
+*> DGK              -     DCHKGK
+*> GLM              8     DCKGLM
+*> GQR              8     DCKGQR
+*> GSV              8     DCKGSV
+*> CSD              3     DCKCSD
+*> LSE              8     DCKLSE
+*>
+*>-----------------------------------------------------------------------
+*>
+*> NEP input file:
+*>
+*> line 2: NN, INTEGER
+*>         Number of values of N.
+*> +*> line 3: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix dimension N. +*> +*> line 4: NPARMS, INTEGER +*> Number of values of the parameters NB, NBMIN, NX, NS, and +*> MAXB. +*> +*> line 5: NBVAL, INTEGER array, dimension (NPARMS) +*> The values for the blocksize NB. +*> +*> line 6: NBMIN, INTEGER array, dimension (NPARMS) +*> The values for the minimum blocksize NBMIN. +*> +*> line 7: NXVAL, INTEGER array, dimension (NPARMS) +*> The values for the crossover point NX. +*> +*> line 8: INMIN, INTEGER array, dimension (NPARMS) +*> LAHQR vs TTQRE crossover point, >= 11 +*> +*> line 9: INWIN, INTEGER array, dimension (NPARMS) +*> recommended deflation window size +*> +*> line 10: INIBL, INTEGER array, dimension (NPARMS) +*> nibble crossover point +*> +*> line 11: ISHFTS, INTEGER array, dimension (NPARMS) +*> number of simultaneous shifts) +*> +*> line 12: IACC22, INTEGER array, dimension (NPARMS) +*> select structured matrix multiply: 0, 1 or 2) +*> +*> line 13: THRESH +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. To have all of the test +*> ratios printed, use THRESH = 0.0 . +*> +*> line 14: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 14 was 2: +*> +*> line 15: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 15-EOF: The remaining lines occur in sets of 1 or 2 and allow +*> the user to specify the matrix types. Each line contains +*> a 3-character path name in columns 1-3, and the number +*> of matrix types must be the first nonblank item in columns +*> 4-80. If the number of matrix types is at least 1 but is +*> less than the maximum number of possible types, a second +*> line will be read to get the numbers of the matrix types to +*> be used. For example, +*> NEP 21 +*> requests all of the matrix types for the nonsymmetric +*> eigenvalue problem, while +*> NEP 4 +*> 9 10 11 12 +*> requests only matrices of type 9, 10, 11, and 12. +*> +*> The valid 3-character path names are 'NEP' or 'SHS' for the +*> nonsymmetric eigenvalue routines. +*> +*>----------------------------------------------------------------------- +*> +*> SEP or DSG input file: +*> +*> line 2: NN, INTEGER +*> Number of values of N. +*> +*> line 3: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix dimension N. +*> +*> line 4: NPARMS, INTEGER +*> Number of values of the parameters NB, NBMIN, and NX. +*> +*> line 5: NBVAL, INTEGER array, dimension (NPARMS) +*> The values for the blocksize NB. +*> +*> line 6: NBMIN, INTEGER array, dimension (NPARMS) +*> The values for the minimum blocksize NBMIN. +*> +*> line 7: NXVAL, INTEGER array, dimension (NPARMS) +*> The values for the crossover point NX. +*> +*> line 8: THRESH +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 9: TSTCHK, LOGICAL +*> Flag indicating whether or not to test the LAPACK routines. +*> +*> line 10: TSTDRV, LOGICAL +*> Flag indicating whether or not to test the driver routines. 
+*> +*> line 11: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 12: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 12 was 2: +*> +*> line 13: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 13-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path names are 'SEP' or 'SST' for the +*> symmetric eigenvalue routines and driver routines, and +*> 'DSG' for the routines for the symmetric generalized +*> eigenvalue problem. +*> +*>----------------------------------------------------------------------- +*> +*> SVD input file: +*> +*> line 2: NN, INTEGER +*> Number of values of M and N. +*> +*> line 3: MVAL, INTEGER array, dimension (NN) +*> The values for the matrix row dimension M. +*> +*> line 4: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix column dimension N. +*> +*> line 5: NPARMS, INTEGER +*> Number of values of the parameter NB, NBMIN, NX, and NRHS. +*> +*> line 6: NBVAL, INTEGER array, dimension (NPARMS) +*> The values for the blocksize NB. +*> +*> line 7: NBMIN, INTEGER array, dimension (NPARMS) +*> The values for the minimum blocksize NBMIN. +*> +*> line 8: NXVAL, INTEGER array, dimension (NPARMS) +*> The values for the crossover point NX. +*> +*> line 9: NSVAL, INTEGER array, dimension (NPARMS) +*> The values for the number of right hand sides NRHS. +*> +*> line 10: THRESH +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 11: TSTCHK, LOGICAL +*> Flag indicating whether or not to test the LAPACK routines. +*> +*> line 12: TSTDRV, LOGICAL +*> Flag indicating whether or not to test the driver routines. +*> +*> line 13: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 14: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 14 was 2: +*> +*> line 15: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 15-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path names are 'SVD' or 'SBD' for both the +*> SVD routines and the SVD driver routines. +*> +*>----------------------------------------------------------------------- +*> +*> DEV and DES data files: +*> +*> line 1: 'DEV' or 'DES' in columns 1 to 3. +*> +*> line 2: NSIZES, INTEGER +*> Number of sizes of matrices to use. Should be at least 0 +*> and at most 20. If NSIZES = 0, no testing is done +*> (although the remaining 3 lines are still read). +*> +*> line 3: NN, INTEGER array, dimension(NSIZES) +*> Dimensions of matrices to be tested. 
+*> +*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> These integer parameters determine how blocking is done +*> (see ILAENV for details) +*> NB : block size +*> NBMIN : minimum block size +*> NX : minimum dimension for blocking +*> NS : number of shifts in xHSEQR +*> NBCOL : minimum column dimension for blocking +*> +*> line 5: THRESH, REAL +*> The test threshold against which computed residuals are +*> compared. Should generally be in the range from 10. to 20. +*> If it is 0., all test case data will be printed. +*> +*> line 6: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits. +*> +*> line 7: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 7 was 2: +*> +*> line 8: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 9 and following: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'DEV' to test SGEEV, or +*> 'DES' to test SGEES. +*> +*>----------------------------------------------------------------------- +*> +*> The DVX data has two parts. The first part is identical to DEV, +*> and the second part consists of test matrices with precomputed +*> solutions. +*> +*> line 1: 'DVX' in columns 1-3. +*> +*> line 2: NSIZES, INTEGER +*> If NSIZES = 0, no testing of randomly generated examples +*> is done, but any precomputed examples are tested. +*> +*> line 3: NN, INTEGER array, dimension(NSIZES) +*> +*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> +*> line 5: THRESH, REAL +*> +*> line 6: TSTERR, LOGICAL +*> +*> line 7: NEWSD, INTEGER +*> +*> If line 7 was 2: +*> +*> line 8: INTEGER array, dimension (4) +*> +*> lines 9 and following: The first line contains 'DVX' in columns 1-3 +*> followed by the number of matrix types, possibly with +*> a second line to specify certain matrix types. +*> If the number of matrix types = 0, no testing of randomly +*> generated examples is done, but any precomputed examples +*> are tested. +*> +*> remaining lines : Each matrix is stored on 1+2*N lines, where N is +*> its dimension. The first line contains the dimension (a +*> single integer). The next N lines contain the matrix, one +*> row per line. The last N lines correspond to each +*> eigenvalue. Each of these last N lines contains 4 real +*> values: the real part of the eigenvalue, the imaginary +*> part of the eigenvalue, the reciprocal condition number of +*> the eigenvalues, and the reciprocal condition number of the +*> eigenvector. The end of data is indicated by dimension N=0. +*> Even if no data is to be tested, there must be at least one +*> line containing N=0. +*> +*>----------------------------------------------------------------------- +*> +*> The DSX data is like DVX. The first part is identical to DEV, and the +*> second part consists of test matrices with precomputed solutions. +*> +*> line 1: 'DSX' in columns 1-3. +*> +*> line 2: NSIZES, INTEGER +*> If NSIZES = 0, no testing of randomly generated examples +*> is done, but any precomputed examples are tested. 
+*> +*> line 3: NN, INTEGER array, dimension(NSIZES) +*> +*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> +*> line 5: THRESH, REAL +*> +*> line 6: TSTERR, LOGICAL +*> +*> line 7: NEWSD, INTEGER +*> +*> If line 7 was 2: +*> +*> line 8: INTEGER array, dimension (4) +*> +*> lines 9 and following: The first line contains 'DSX' in columns 1-3 +*> followed by the number of matrix types, possibly with +*> a second line to specify certain matrix types. +*> If the number of matrix types = 0, no testing of randomly +*> generated examples is done, but any precomputed examples +*> are tested. +*> +*> remaining lines : Each matrix is stored on 3+N lines, where N is its +*> dimension. The first line contains the dimension N and the +*> dimension M of an invariant subspace. The second line +*> contains M integers, identifying the eigenvalues in the +*> invariant subspace (by their position in a list of +*> eigenvalues ordered by increasing real part). The next N +*> lines contain the matrix. The last line contains the +*> reciprocal condition number for the average of the selected +*> eigenvalues, and the reciprocal condition number for the +*> corresponding right invariant subspace. The end of data is +*> indicated by a line containing N=0 and M=0. Even if no data +*> is to be tested, there must be at least one line containing +*> N=0 and M=0. +*> +*>----------------------------------------------------------------------- +*> +*> DGG input file: +*> +*> line 2: NN, INTEGER +*> Number of values of N. +*> +*> line 3: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix dimension N. +*> +*> line 4: NPARMS, INTEGER +*> Number of values of the parameters NB, NBMIN, NS, MAXB, and +*> NBCOL. +*> +*> line 5: NBVAL, INTEGER array, dimension (NPARMS) +*> The values for the blocksize NB. +*> +*> line 6: NBMIN, INTEGER array, dimension (NPARMS) +*> The values for NBMIN, the minimum row dimension for blocks. +*> +*> line 7: NSVAL, INTEGER array, dimension (NPARMS) +*> The values for the number of shifts. +*> +*> line 8: MXBVAL, INTEGER array, dimension (NPARMS) +*> The values for MAXB, used in determining minimum blocksize. +*> +*> line 9: IACC22, INTEGER array, dimension (NPARMS) +*> select structured matrix multiply: 1 or 2) +*> +*> line 10: NBCOL, INTEGER array, dimension (NPARMS) +*> The values for NBCOL, the minimum column dimension for +*> blocks. +*> +*> line 11: THRESH +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 12: TSTCHK, LOGICAL +*> Flag indicating whether or not to test the LAPACK routines. +*> +*> line 13: TSTDRV, LOGICAL +*> Flag indicating whether or not to test the driver routines. +*> +*> line 14: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 15: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 15 was 2: +*> +*> line 16: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 17-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'DGG' for the generalized +*> eigenvalue problem routines and driver routines. 
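+*>
+*> As an illustration only (the values below are hypothetical and are
+*> not taken from the LAPACK distribution), a minimal DGG input set
+*> following the line layout above could read:
+*>
+*> DGG
+*> 2
+*> 10 50
+*> 1
+*> 1
+*> 1
+*> 2
+*> 1
+*> 1
+*> 1
+*> 20.0
+*> T
+*> T
+*> T
+*> 0
+*> DGG 26
+*>
+*> Here NN = 2 with N = 10 and 50, a single parameter set (NB = NBMIN =
+*> 1, NS = 2, MAXB = IACC22 = NBCOL = 1), THRESH = 20.0, all testing
+*> enabled, NEWSD = 0 (so no seed line is read), and all 26 matrix
+*> types requested.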
+*>
+*>-----------------------------------------------------------------------
+*>
+*> DGS and DGV input files:
+*>
+*> line 1: 'DGS' or 'DGV' in columns 1 to 3.
+*>
+*> line 2: NN, INTEGER
+*>         Number of values of N.
+*>
+*> line 3: NVAL, INTEGER array, dimension(NN)
+*>         Dimensions of matrices to be tested.
+*>
+*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs
+*>         These integer parameters determine how blocking is done
+*>         (see ILAENV for details)
+*>         NB     : block size
+*>         NBMIN  : minimum block size
+*>         NX     : minimum dimension for blocking
+*>         NS     : number of shifts in xHGEQR
+*>         NBCOL  : minimum column dimension for blocking
+*>
+*> line 5: THRESH, REAL
+*>         The test threshold against which computed residuals are
+*>         compared. Should generally be in the range from 10. to 20.
+*>         If it is 0., all test case data will be printed.
+*>
+*> line 6: TSTERR, LOGICAL
+*>         Flag indicating whether or not to test the error exits.
+*>
+*> line 7: NEWSD, INTEGER
+*>         A code indicating how to set the random number seed.
+*>         = 0:  Set the seed to a default value before each run
+*>         = 1:  Initialize the seed to a default value only before the
+*>               first run
+*>         = 2:  Like 1, but use the seed values on the next line
+*>
+*> If line 7 was 2:
+*>
+*> line 8: INTEGER array, dimension (4)
+*>         Four integer values for the random number seed.
+*>
+*> lines 8-EOF:  Lines specifying matrix types, as for NEP.
+*>         The 3-character path name is 'DGS' or 'DGV' for the
+*>         generalized eigenvalue problem routines and driver routines.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> DXV input files:
+*>
+*> line 1: 'DXV' in columns 1 to 3.
+*>
+*> line 2: N, INTEGER
+*>         Value of N.
+*>
+*> line 3: NB, NBMIN, NX, NS, NBCOL, INTEGERs
+*>         These integer parameters determine how blocking is done
+*>         (see ILAENV for details)
+*>         NB     : block size
+*>         NBMIN  : minimum block size
+*>         NX     : minimum dimension for blocking
+*>         NS     : number of shifts in xHGEQR
+*>         NBCOL  : minimum column dimension for blocking
+*>
+*> line 4: THRESH, REAL
+*>         The test threshold against which computed residuals are
+*>         compared. Should generally be in the range from 10. to 20.
+*>         Information will be printed about each test for which the
+*>         test ratio is greater than or equal to the threshold.
+*>
+*> line 5: TSTERR, LOGICAL
+*>         Flag indicating whether or not to test the error exits for
+*>         the LAPACK routines and driver routines.
+*>
+*> line 6: NEWSD, INTEGER
+*>         A code indicating how to set the random number seed.
+*>         = 0:  Set the seed to a default value before each run
+*>         = 1:  Initialize the seed to a default value only before the
+*>               first run
+*>         = 2:  Like 1, but use the seed values on the next line
+*>
+*> If line 6 was 2:
+*>
+*> line 7: INTEGER array, dimension (4)
+*>         Four integer values for the random number seed.
+*>
+*> If line 2 was 0:
+*>
+*> line 7-EOF: Precomputed examples are tested.
+*>
+*> remaining lines : Each example is stored on 3+2*N lines, where N is
+*>         its dimension. The first line contains the dimension (a
+*>         single integer). The next N lines contain the matrix A, one
+*>         row per line. The next N lines contain the matrix B. The
+*>         next line contains the reciprocals of the eigenvalue
+*>         condition numbers. The last line contains the reciprocals of
+*>         the eigenvector condition numbers. The end of data is
+*>         indicated by dimension N=0. Even if no data is to be tested,
+*>         there must be at least one line containing N=0.
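+*>
+*> For illustration only, a DXV file that skips the random tests
+*> (line 2 is 0) and supplies a single precomputed 2-by-2 example could
+*> look like the sketch below; the matrix entries and the reciprocal
+*> condition numbers are placeholders chosen to show the layout, not a
+*> meaningful verification case:
+*>
+*> DXV
+*> 0
+*> 1 1 1 2 1
+*> 20.0
+*> T
+*> 0
+*> 2
+*> 1.0 0.0
+*> 0.0 2.0
+*> 1.0 0.0
+*> 0.0 1.0
+*> 1.0 1.0
+*> 1.0 1.0
+*> 0
+*>
+*> The example occupies 3+2*N = 7 lines (dimension, two rows of A, two
+*> rows of B, eigenvalue condition reciprocals, eigenvector condition
+*> reciprocals), and the final 0 terminates the data.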
+*> +*>----------------------------------------------------------------------- +*> +*> DGX input files: +*> +*> line 1: 'DGX' in columns 1 to 3. +*> +*> line 2: N, INTEGER +*> Value of N. +*> +*> line 3: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> These integer parameters determine how blocking is done +*> (see ILAENV for details) +*> NB : block size +*> NBMIN : minimum block size +*> NX : minimum dimension for blocking +*> NS : number of shifts in xHGEQR +*> NBCOL : minimum column dimension for blocking +*> +*> line 4: THRESH, REAL +*> The test threshold against which computed residuals are +*> compared. Should generally be in the range from 10. to 20. +*> Information will be printed about each test for which the +*> test ratio is greater than or equal to the threshold. +*> +*> line 5: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 6: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 6 was 2: +*> +*> line 7: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> If line 2 was 0: +*> +*> line 7-EOF: Precomputed examples are tested. +*> +*> remaining lines : Each example is stored on 3+2*N lines, where N is +*> its dimension. The first line contains the dimension (a +*> single integer). The next line contains an integer k such +*> that only the last k eigenvalues will be selected and appear +*> in the leading diagonal blocks of $A$ and $B$. The next N +*> lines contain the matrix A, one row per line. The next N +*> lines contain the matrix B. The last line contains the +*> reciprocal of the eigenvalue cluster condition number and the +*> reciprocal of the deflating subspace (associated with the +*> selected eigencluster) condition number. The end of data is +*> indicated by dimension N=0. Even if no data is to be tested, +*> there must be at least one line containing N=0. +*> +*>----------------------------------------------------------------------- +*> +*> DSB input file: +*> +*> line 2: NN, INTEGER +*> Number of values of N. +*> +*> line 3: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix dimension N. +*> +*> line 4: NK, INTEGER +*> Number of values of K. +*> +*> line 5: KVAL, INTEGER array, dimension (NK) +*> The values for the matrix dimension K. +*> +*> line 6: THRESH +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 7: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 7 was 2: +*> +*> line 8: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 8-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'DSB'. +*> +*>----------------------------------------------------------------------- +*> +*> DBB input file: +*> +*> line 2: NN, INTEGER +*> Number of values of M and N. +*> +*> line 3: MVAL, INTEGER array, dimension (NN) +*> The values for the matrix row dimension M. 
+*>
+*> line 4: NVAL, INTEGER array, dimension (NN)
+*>         The values for the matrix column dimension N.
+*>
+*> line 5: NK, INTEGER
+*>         Number of values of K.
+*>
+*> line 6: KVAL, INTEGER array, dimension (NK)
+*>         The values for the matrix bandwidth K.
+*>
+*> line 7: NPARMS, INTEGER
+*>         Number of values of the parameter NRHS.
+*>
+*> line 8: NSVAL, INTEGER array, dimension (NPARMS)
+*>         The values for the number of right hand sides NRHS.
+*>
+*> line 9: THRESH
+*>         Threshold value for the test ratios.  Information will be
+*>         printed about each test for which the test ratio is greater
+*>         than or equal to the threshold.
+*>
+*> line 10: NEWSD, INTEGER
+*>         A code indicating how to set the random number seed.
+*>         = 0:  Set the seed to a default value before each run
+*>         = 1:  Initialize the seed to a default value only before the
+*>               first run
+*>         = 2:  Like 1, but use the seed values on the next line
+*>
+*> If line 10 was 2:
+*>
+*> line 11: INTEGER array, dimension (4)
+*>         Four integer values for the random number seed.
+*>
+*> lines 11-EOF:  Lines specifying matrix types, as for SVD.
+*>         The 3-character path name is 'DBB'.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> DEC input file:
+*>
+*> line 2: THRESH, REAL
+*>         Threshold value for the test ratios.  Information will be
+*>         printed about each test for which the test ratio is greater
+*>         than or equal to the threshold.
+*>
+*> lines 3-EOF:
+*>
+*> Input for testing the eigencondition routines consists of a set of
+*> specially constructed test cases and their solutions.  The data
+*> format is not intended to be modified by the user.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> DBL and DBK input files:
+*>
+*> line 1: 'DBL' in columns 1-3 to test DGEBAL, or 'DBK' in
+*>         columns 1-3 to test DGEBAK.
+*>
+*> The remaining lines consist of specially constructed test cases.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> DGL and DGK input files:
+*>
+*> line 1: 'DGL' in columns 1-3 to test DGGBAL, or 'DGK' in
+*>         columns 1-3 to test DGGBAK.
+*>
+*> The remaining lines consist of specially constructed test cases.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> GLM data file:
+*>
+*> line 1: 'GLM' in columns 1 to 3.
+*>
+*> line 2: NN, INTEGER
+*>         Number of values of M, P, and N.
+*>
+*> line 3: MVAL, INTEGER array, dimension(NN)
+*>         Values of M (row dimension).
+*>
+*> line 4: PVAL, INTEGER array, dimension(NN)
+*>         Values of P (row dimension).
+*>
+*> line 5: NVAL, INTEGER array, dimension(NN)
+*>         Values of N (column dimension), note M <= N <= M+P.
+*>
+*> line 6: THRESH, REAL
+*>         Threshold value for the test ratios.  Information will be
+*>         printed about each test for which the test ratio is greater
+*>         than or equal to the threshold.
+*>
+*> line 7: TSTERR, LOGICAL
+*>         Flag indicating whether or not to test the error exits for
+*>         the LAPACK routines and driver routines.
+*>
+*> line 8: NEWSD, INTEGER
+*>         A code indicating how to set the random number seed.
+*>         = 0:  Set the seed to a default value before each run
+*>         = 1:  Initialize the seed to a default value only before the
+*>               first run
+*>         = 2:  Like 1, but use the seed values on the next line
+*>
+*> If line 8 was 2:
+*>
+*> line 9: INTEGER array, dimension (4)
+*>         Four integer values for the random number seed.
+*>
+*> lines 9-EOF:  Lines specifying matrix types, as for NEP.
+*> The 3-character path name is 'GLM' for the generalized +*> linear regression model routines. +*> +*>----------------------------------------------------------------------- +*> +*> GQR data file: +*> +*> line 1: 'GQR' in columns 1 to 3. +*> +*> line 2: NN, INTEGER +*> Number of values of M, P, and N. +*> +*> line 3: MVAL, INTEGER array, dimension(NN) +*> Values of M. +*> +*> line 4: PVAL, INTEGER array, dimension(NN) +*> Values of P. +*> +*> line 5: NVAL, INTEGER array, dimension(NN) +*> Values of N. +*> +*> line 6: THRESH, REAL +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 7: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 8: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 8 was 2: +*> +*> line 9: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 9-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'GQR' for the generalized +*> QR and RQ routines. +*> +*>----------------------------------------------------------------------- +*> +*> GSV data file: +*> +*> line 1: 'GSV' in columns 1 to 3. +*> +*> line 2: NN, INTEGER +*> Number of values of M, P, and N. +*> +*> line 3: MVAL, INTEGER array, dimension(NN) +*> Values of M (row dimension). +*> +*> line 4: PVAL, INTEGER array, dimension(NN) +*> Values of P (row dimension). +*> +*> line 5: NVAL, INTEGER array, dimension(NN) +*> Values of N (column dimension). +*> +*> line 6: THRESH, REAL +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 7: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 8: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 8 was 2: +*> +*> line 9: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 9-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'GSV' for the generalized +*> SVD routines. +*> +*>----------------------------------------------------------------------- +*> +*> CSD data file: +*> +*> line 1: 'CSD' in columns 1 to 3. +*> +*> line 2: NM, INTEGER +*> Number of values of M, P, and N. +*> +*> line 3: MVAL, INTEGER array, dimension(NM) +*> Values of M (row and column dimension of orthogonal matrix). +*> +*> line 4: PVAL, INTEGER array, dimension(NM) +*> Values of P (row dimension of top-left block). +*> +*> line 5: NVAL, INTEGER array, dimension(NM) +*> Values of N (column dimension of top-left block). +*> +*> line 6: THRESH, REAL +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. 
+*>
+*> line 7: TSTERR, LOGICAL
+*>         Flag indicating whether or not to test the error exits for
+*>         the LAPACK routines and driver routines.
+*>
+*> line 8: NEWSD, INTEGER
+*>         A code indicating how to set the random number seed.
+*>         = 0:  Set the seed to a default value before each run
+*>         = 1:  Initialize the seed to a default value only before the
+*>               first run
+*>         = 2:  Like 1, but use the seed values on the next line
+*>
+*> If line 8 was 2:
+*>
+*> line 9: INTEGER array, dimension (4)
+*>         Four integer values for the random number seed.
+*>
+*> lines 9-EOF:  Lines specifying matrix types, as for NEP.
+*>         The 3-character path name is 'CSD' for the CSD routine.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> LSE data file:
+*>
+*> line 1: 'LSE' in columns 1 to 3.
+*>
+*> line 2: NN, INTEGER
+*>         Number of values of M, P, and N.
+*>
+*> line 3: MVAL, INTEGER array, dimension(NN)
+*>         Values of M.
+*>
+*> line 4: PVAL, INTEGER array, dimension(NN)
+*>         Values of P.
+*>
+*> line 5: NVAL, INTEGER array, dimension(NN)
+*>         Values of N, note P <= N <= P+M.
+*>
+*> line 6: THRESH, REAL
+*>         Threshold value for the test ratios.  Information will be
+*>         printed about each test for which the test ratio is greater
+*>         than or equal to the threshold.
+*>
+*> line 7: TSTERR, LOGICAL
+*>         Flag indicating whether or not to test the error exits for
+*>         the LAPACK routines and driver routines.
+*>
+*> line 8: NEWSD, INTEGER
+*>         A code indicating how to set the random number seed.
+*>         = 0:  Set the seed to a default value before each run
+*>         = 1:  Initialize the seed to a default value only before the
+*>               first run
+*>         = 2:  Like 1, but use the seed values on the next line
+*>
+*> If line 8 was 2:
+*>
+*> line 9: INTEGER array, dimension (4)
+*>         Four integer values for the random number seed.
+*>
+*> lines 9-EOF:  Lines specifying matrix types, as for NEP.
+*>         The 3-character path name is 'LSE' for the constrained
+*>         linear least squares routines.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> NMAX is currently set to 132 and must be at least 12 for some of the
+*> precomputed examples, and LWORK = NMAX*(5*NMAX+5)+1 in the parameter
+*> statements below.  For SVD, we assume NRHS may be as big as N.  The
+*> parameter NEED is set to 14 to allow for 14 N-by-N matrices for DGG.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup double_eig
+*
+*  =====================================================================
+      PROGRAM DCHKEE
+*
+#if defined(_OPENMP)
+      use omp_lib
+#endif
+*
+*  -- LAPACK test routine (version 3.7.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee, --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2016
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NMAX
+      PARAMETER          ( NMAX = 132 )
+      INTEGER            NCMAX
+      PARAMETER          ( NCMAX = 20 )
+      INTEGER            NEED
+      PARAMETER          ( NEED = 14 )
+      INTEGER            LWORK
+      PARAMETER          ( LWORK = NMAX*( 5*NMAX+5 )+1 )
+      INTEGER            LIWORK
+      PARAMETER          ( LIWORK = NMAX*( 5*NMAX+20 ) )
+      INTEGER            MAXIN
+      PARAMETER          ( MAXIN = 20 )
+      INTEGER            MAXT
+      PARAMETER          ( MAXT = 30 )
+      INTEGER            NIN, NOUT
+      PARAMETER          ( NIN = 5, NOUT = 6 )
+*     ..
+*     .. Local Scalars ..
+ LOGICAL CSD, DBB, DGG, DSB, FATAL, GLM, GQR, GSV, LSE, + $ NEP, DBK, DBL, SEP, DES, DEV, DGK, DGL, DGS, + $ DGV, DGX, DSX, SVD, DVX, DXV, TSTCHK, TSTDIF, + $ TSTDRV, TSTERR + CHARACTER C1 + CHARACTER*3 C3, PATH + CHARACTER*32 VNAME + CHARACTER*10 INTSTR + CHARACTER*80 LINE + INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, + $ NK, NN, NPARMS, NRHS, NTYPES, + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS + DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN +* .. +* .. Local Arrays .. + LOGICAL DOTYPE( MAXT ), LOGWRK( NMAX ) + INTEGER IOLDSD( 4 ), ISEED( 4 ), IWORK( LIWORK ), + $ KVAL( MAXIN ), MVAL( MAXIN ), MXBVAL( MAXIN ), + $ NBCOL( MAXIN ), NBMIN( MAXIN ), NBVAL( MAXIN ), + $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), + $ PVAL( MAXIN ) + INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), + $ ISHFTS( MAXIN ), IACC22( MAXIN ) + DOUBLE PRECISION D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ), + $ TAUB( NMAX ), X( 5*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: WORK + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, C +* .. +* .. External Functions .. + LOGICAL LSAMEN + DOUBLE PRECISION DLAMCH, DSECND + EXTERNAL LSAMEN, DLAMCH, DSECND +* .. +* .. External Subroutines .. + EXTERNAL ALAREQ, DCHKBB, DCHKBD, DCHKBK, DCHKBL, DCHKEC, + $ DCHKGG, DCHKGK, DCHKGL, DCHKHS, DCHKSB, DCHKST, + $ DCKCSD, DCKGLM, DCKGQR, DCKGSV, DCKLSE, DDRGES, + $ DDRGEV, DDRGSX, DDRGVX, DDRVBD, DDRVES, DDRVEV, + $ DDRVSG, DDRVST, DDRVSX, DDRVVX, DERRBD, + $ DERRED, DERRGG, DERRHS, DERRST, ILAVER, XLAENV, + $ DDRGES3, DDRGEV3, + $ DCHKST2STG, DDRVST2STG, DCHKSB2STG, DDRVSG2STG +* .. +* .. Intrinsic Functions .. + INTRINSIC LEN, MIN +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, MAXB, NPROC, NSHIFT, NUNIT, SELDIM, + $ SELOPT +* .. +* .. Arrays in Common .. + LOGICAL SELVAL( 20 ) + INTEGER IPARMS( 100 ) + DOUBLE PRECISION SELWI( 20 ), SELWR( 20 ) +* .. +* .. Common blocks .. + COMMON / CENVIR / NPROC, NSHIFT, MAXB + COMMON / INFOC / INFOT, NUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT + COMMON / SSLCT / SELOPT, SELDIM, SELVAL, SELWR, SELWI + COMMON / CLAENV / IPARMS +* .. +* .. Data statements .. + DATA INTSTR / '0123456789' / + DATA IOLDSD / 0, 0, 0, 1 / +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. +* .. Executable Statements .. +* + A = 0.0 + B = 0.0 + C = 0.0 + D = 0.0 + S1 = DSECND( ) + FATAL = .FALSE. + NUNIT = NOUT +* +* Return to here to read multiple sets of data +* + 10 CONTINUE +* +* Read the first line and set the 3-character test path +* + READ( NIN, FMT = '(A80)', END = 380 )LINE + PATH = LINE( 1: 3 ) + NEP = LSAMEN( 3, PATH, 'NEP' ) .OR. LSAMEN( 3, PATH, 'DHS' ) + SEP = LSAMEN( 3, PATH, 'SEP' ) .OR. LSAMEN( 3, PATH, 'DST' ) .OR. + $ LSAMEN( 3, PATH, 'DSG' ) .OR. LSAMEN( 3, PATH, 'SE2' ) + SVD = LSAMEN( 3, PATH, 'SVD' ) .OR. 
LSAMEN( 3, PATH, 'DBD' ) + DEV = LSAMEN( 3, PATH, 'DEV' ) + DES = LSAMEN( 3, PATH, 'DES' ) + DVX = LSAMEN( 3, PATH, 'DVX' ) + DSX = LSAMEN( 3, PATH, 'DSX' ) + DGG = LSAMEN( 3, PATH, 'DGG' ) + DGS = LSAMEN( 3, PATH, 'DGS' ) + DGX = LSAMEN( 3, PATH, 'DGX' ) + DGV = LSAMEN( 3, PATH, 'DGV' ) + DXV = LSAMEN( 3, PATH, 'DXV' ) + DSB = LSAMEN( 3, PATH, 'DSB' ) + DBB = LSAMEN( 3, PATH, 'DBB' ) + GLM = LSAMEN( 3, PATH, 'GLM' ) + GQR = LSAMEN( 3, PATH, 'GQR' ) .OR. LSAMEN( 3, PATH, 'GRQ' ) + GSV = LSAMEN( 3, PATH, 'GSV' ) + CSD = LSAMEN( 3, PATH, 'CSD' ) + LSE = LSAMEN( 3, PATH, 'LSE' ) + DBL = LSAMEN( 3, PATH, 'DBL' ) + DBK = LSAMEN( 3, PATH, 'DBK' ) + DGL = LSAMEN( 3, PATH, 'DGL' ) + DGK = LSAMEN( 3, PATH, 'DGK' ) +* +* Report values of parameters. +* + IF( PATH.EQ.' ' ) THEN + GO TO 10 + ELSE IF( NEP ) THEN + WRITE( NOUT, FMT = 9987 ) + ELSE IF( SEP ) THEN + WRITE( NOUT, FMT = 9986 ) + ELSE IF( SVD ) THEN + WRITE( NOUT, FMT = 9985 ) + ELSE IF( DEV ) THEN + WRITE( NOUT, FMT = 9979 ) + ELSE IF( DES ) THEN + WRITE( NOUT, FMT = 9978 ) + ELSE IF( DVX ) THEN + WRITE( NOUT, FMT = 9977 ) + ELSE IF( DSX ) THEN + WRITE( NOUT, FMT = 9976 ) + ELSE IF( DGG ) THEN + WRITE( NOUT, FMT = 9975 ) + ELSE IF( DGS ) THEN + WRITE( NOUT, FMT = 9964 ) + ELSE IF( DGX ) THEN + WRITE( NOUT, FMT = 9965 ) + ELSE IF( DGV ) THEN + WRITE( NOUT, FMT = 9963 ) + ELSE IF( DXV ) THEN + WRITE( NOUT, FMT = 9962 ) + ELSE IF( DSB ) THEN + WRITE( NOUT, FMT = 9974 ) + ELSE IF( DBB ) THEN + WRITE( NOUT, FMT = 9967 ) + ELSE IF( GLM ) THEN + WRITE( NOUT, FMT = 9971 ) + ELSE IF( GQR ) THEN + WRITE( NOUT, FMT = 9970 ) + ELSE IF( GSV ) THEN + WRITE( NOUT, FMT = 9969 ) + ELSE IF( CSD ) THEN + WRITE( NOUT, FMT = 9960 ) + ELSE IF( LSE ) THEN + WRITE( NOUT, FMT = 9968 ) + ELSE IF( DBL ) THEN +* +* DGEBAL: Balancing +* + CALL DCHKBL( NIN, NOUT ) + GO TO 10 + ELSE IF( DBK ) THEN +* +* DGEBAK: Back transformation +* + CALL DCHKBK( NIN, NOUT ) + GO TO 10 + ELSE IF( DGL ) THEN +* +* DGGBAL: Balancing +* + CALL DCHKGL( NIN, NOUT ) + GO TO 10 + ELSE IF( DGK ) THEN +* +* DGGBAK: Back transformation +* + CALL DCHKGK( NIN, NOUT ) + GO TO 10 + ELSE IF( LSAMEN( 3, PATH, 'DEC' ) ) THEN +* +* DEC: Eigencondition estimation +* + READ( NIN, FMT = * )THRESH + CALL XLAENV( 1, 1 ) + CALL XLAENV( 12, 11 ) + CALL XLAENV( 13, 2 ) + CALL XLAENV( 14, 0 ) + CALL XLAENV( 15, 2 ) + CALL XLAENV( 16, 2 ) + TSTERR = .TRUE. + CALL DCHKEC( THRESH, TSTERR, NIN, NOUT ) + GO TO 10 + ELSE + WRITE( NOUT, FMT = 9992 )PATH + GO TO 10 + END IF + CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) + WRITE( NOUT, FMT = 9972 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH + WRITE( NOUT, FMT = 9984 ) +* +* Read the number of values of M, P, and N. +* + READ( NIN, FMT = * )NN + IF( NN.LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' NN ', NN, 1 + NN = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9988 )' NN ', NN, MAXIN + NN = 0 + FATAL = .TRUE. + END IF +* +* Read the values of M +* + IF( .NOT.( DGX .OR. DXV ) ) THEN + READ( NIN, FMT = * )( MVAL( I ), I = 1, NN ) + IF( SVD ) THEN + VNAME = ' M ' + ELSE + VNAME = ' N ' + END IF + DO 20 I = 1, NN + IF( MVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )VNAME, MVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( MVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )VNAME, MVAL( I ), NMAX + FATAL = .TRUE. + END IF + 20 CONTINUE + WRITE( NOUT, FMT = 9983 )'M: ', ( MVAL( I ), I = 1, NN ) + END IF +* +* Read the values of P +* + IF( GLM .OR. GQR .OR. GSV .OR. CSD .OR. 
LSE ) THEN + READ( NIN, FMT = * )( PVAL( I ), I = 1, NN ) + DO 30 I = 1, NN + IF( PVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' P ', PVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( PVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' P ', PVAL( I ), NMAX + FATAL = .TRUE. + END IF + 30 CONTINUE + WRITE( NOUT, FMT = 9983 )'P: ', ( PVAL( I ), I = 1, NN ) + END IF +* +* Read the values of N +* + IF( SVD .OR. DBB .OR. GLM .OR. GQR .OR. GSV .OR. CSD .OR. + $ LSE ) THEN + READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) + DO 40 I = 1, NN + IF( NVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' N ', NVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' N ', NVAL( I ), NMAX + FATAL = .TRUE. + END IF + 40 CONTINUE + ELSE + DO 50 I = 1, NN + NVAL( I ) = MVAL( I ) + 50 CONTINUE + END IF + IF( .NOT.( DGX .OR. DXV ) ) THEN + WRITE( NOUT, FMT = 9983 )'N: ', ( NVAL( I ), I = 1, NN ) + ELSE + WRITE( NOUT, FMT = 9983 )'N: ', NN + END IF +* +* Read the number of values of K, followed by the values of K +* + IF( DSB .OR. DBB ) THEN + READ( NIN, FMT = * )NK + READ( NIN, FMT = * )( KVAL( I ), I = 1, NK ) + DO 60 I = 1, NK + IF( KVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' K ', KVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( KVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' K ', KVAL( I ), NMAX + FATAL = .TRUE. + END IF + 60 CONTINUE + WRITE( NOUT, FMT = 9983 )'K: ', ( KVAL( I ), I = 1, NK ) + END IF +* + IF( DEV .OR. DES .OR. DVX .OR. DSX ) THEN +* +* For the nonsymmetric QR driver routines, only one set of +* parameters is allowed. +* + READ( NIN, FMT = * )NBVAL( 1 ), NBMIN( 1 ), NXVAL( 1 ), + $ INMIN( 1 ), INWIN( 1 ), INIBL(1), ISHFTS(1), IACC22(1) + IF( NBVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( 1 ), 1 + FATAL = .TRUE. + ELSE IF( NBMIN( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( 1 ), 1 + FATAL = .TRUE. + ELSE IF( NXVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( 1 ), 1 + FATAL = .TRUE. + ELSE IF( INMIN( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' INMIN ', INMIN( 1 ), 1 + FATAL = .TRUE. + ELSE IF( INWIN( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' INWIN ', INWIN( 1 ), 1 + FATAL = .TRUE. + ELSE IF( INIBL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' INIBL ', INIBL( 1 ), 1 + FATAL = .TRUE. + ELSE IF( ISHFTS( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' ISHFTS ', ISHFTS( 1 ), 1 + FATAL = .TRUE. + ELSE IF( IACC22( 1 ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' IACC22 ', IACC22( 1 ), 0 + FATAL = .TRUE. + END IF + CALL XLAENV( 1, NBVAL( 1 ) ) + CALL XLAENV( 2, NBMIN( 1 ) ) + CALL XLAENV( 3, NXVAL( 1 ) ) + CALL XLAENV(12, MAX( 11, INMIN( 1 ) ) ) + CALL XLAENV(13, INWIN( 1 ) ) + CALL XLAENV(14, INIBL( 1 ) ) + CALL XLAENV(15, ISHFTS( 1 ) ) + CALL XLAENV(16, IACC22( 1 ) ) + WRITE( NOUT, FMT = 9983 )'NB: ', NBVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'NBMIN:', NBMIN( 1 ) + WRITE( NOUT, FMT = 9983 )'NX: ', NXVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'INMIN: ', INMIN( 1 ) + WRITE( NOUT, FMT = 9983 )'INWIN: ', INWIN( 1 ) + WRITE( NOUT, FMT = 9983 )'INIBL: ', INIBL( 1 ) + WRITE( NOUT, FMT = 9983 )'ISHFTS: ', ISHFTS( 1 ) + WRITE( NOUT, FMT = 9983 )'IACC22: ', IACC22( 1 ) +* + ELSEIF( DGS .OR. DGX .OR. DGV .OR. DXV ) THEN +* +* For the nonsymmetric generalized driver routines, only one set +* of parameters is allowed. +* + READ( NIN, FMT = * )NBVAL( 1 ), NBMIN( 1 ), NXVAL( 1 ), + $ NSVAL( 1 ), MXBVAL( 1 ) + IF( NBVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( 1 ), 1 + FATAL = .TRUE. 
+ ELSE IF( NBMIN( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( 1 ), 1 + FATAL = .TRUE. + ELSE IF( NXVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( 1 ), 1 + FATAL = .TRUE. + ELSE IF( NSVAL( 1 ).LT.2 ) THEN + WRITE( NOUT, FMT = 9989 )' NS ', NSVAL( 1 ), 2 + FATAL = .TRUE. + ELSE IF( MXBVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' MAXB ', MXBVAL( 1 ), 1 + FATAL = .TRUE. + END IF + CALL XLAENV( 1, NBVAL( 1 ) ) + CALL XLAENV( 2, NBMIN( 1 ) ) + CALL XLAENV( 3, NXVAL( 1 ) ) + CALL XLAENV( 4, NSVAL( 1 ) ) + CALL XLAENV( 8, MXBVAL( 1 ) ) + WRITE( NOUT, FMT = 9983 )'NB: ', NBVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'NBMIN:', NBMIN( 1 ) + WRITE( NOUT, FMT = 9983 )'NX: ', NXVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'NS: ', NSVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'MAXB: ', MXBVAL( 1 ) +* + ELSE IF( .NOT.DSB .AND. .NOT.GLM .AND. .NOT.GQR .AND. .NOT. + $ GSV .AND. .NOT.CSD .AND. .NOT.LSE ) THEN +* +* For the other paths, the number of parameters can be varied +* from the input file. Read the number of parameter values. +* + READ( NIN, FMT = * )NPARMS + IF( NPARMS.LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )'NPARMS', NPARMS, 1 + NPARMS = 0 + FATAL = .TRUE. + ELSE IF( NPARMS.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9988 )'NPARMS', NPARMS, MAXIN + NPARMS = 0 + FATAL = .TRUE. + END IF +* +* Read the values of NB +* + IF( .NOT.DBB ) THEN + READ( NIN, FMT = * )( NBVAL( I ), I = 1, NPARMS ) + DO 70 I = 1, NPARMS + IF( NBVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NBVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' NB ', NBVAL( I ), NMAX + FATAL = .TRUE. + END IF + 70 CONTINUE + WRITE( NOUT, FMT = 9983 )'NB: ', + $ ( NBVAL( I ), I = 1, NPARMS ) + END IF +* +* Read the values of NBMIN +* + IF( NEP .OR. SEP .OR. SVD .OR. DGG ) THEN + READ( NIN, FMT = * )( NBMIN( I ), I = 1, NPARMS ) + DO 80 I = 1, NPARMS + IF( NBMIN( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( I ), 0 + FATAL = .TRUE. + ELSE IF( NBMIN( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )'NBMIN ', NBMIN( I ), NMAX + FATAL = .TRUE. + END IF + 80 CONTINUE + WRITE( NOUT, FMT = 9983 )'NBMIN:', + $ ( NBMIN( I ), I = 1, NPARMS ) + ELSE + DO 90 I = 1, NPARMS + NBMIN( I ) = 1 + 90 CONTINUE + END IF +* +* Read the values of NX +* + IF( NEP .OR. SEP .OR. SVD ) THEN + READ( NIN, FMT = * )( NXVAL( I ), I = 1, NPARMS ) + DO 100 I = 1, NPARMS + IF( NXVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NXVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' NX ', NXVAL( I ), NMAX + FATAL = .TRUE. + END IF + 100 CONTINUE + WRITE( NOUT, FMT = 9983 )'NX: ', + $ ( NXVAL( I ), I = 1, NPARMS ) + ELSE + DO 110 I = 1, NPARMS + NXVAL( I ) = 1 + 110 CONTINUE + END IF +* +* Read the values of NSHIFT (if DGG) or NRHS (if SVD +* or DBB). +* + IF( SVD .OR. DBB .OR. DGG ) THEN + READ( NIN, FMT = * )( NSVAL( I ), I = 1, NPARMS ) + DO 120 I = 1, NPARMS + IF( NSVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' NS ', NSVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NSVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' NS ', NSVAL( I ), NMAX + FATAL = .TRUE. + END IF + 120 CONTINUE + WRITE( NOUT, FMT = 9983 )'NS: ', + $ ( NSVAL( I ), I = 1, NPARMS ) + ELSE + DO 130 I = 1, NPARMS + NSVAL( I ) = 1 + 130 CONTINUE + END IF +* +* Read the values for MAXB. +* + IF( DGG ) THEN + READ( NIN, FMT = * )( MXBVAL( I ), I = 1, NPARMS ) + DO 140 I = 1, NPARMS + IF( MXBVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' MAXB ', MXBVAL( I ), 0 + FATAL = .TRUE. 
+ ELSE IF( MXBVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' MAXB ', MXBVAL( I ), NMAX + FATAL = .TRUE. + END IF + 140 CONTINUE + WRITE( NOUT, FMT = 9983 )'MAXB: ', + $ ( MXBVAL( I ), I = 1, NPARMS ) + ELSE + DO 150 I = 1, NPARMS + MXBVAL( I ) = 1 + 150 CONTINUE + END IF +* +* Read the values for INMIN. +* + IF( NEP ) THEN + READ( NIN, FMT = * )( INMIN( I ), I = 1, NPARMS ) + DO 540 I = 1, NPARMS + IF( INMIN( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' INMIN ', INMIN( I ), 0 + FATAL = .TRUE. + END IF + 540 CONTINUE + WRITE( NOUT, FMT = 9983 )'INMIN: ', + $ ( INMIN( I ), I = 1, NPARMS ) + ELSE + DO 550 I = 1, NPARMS + INMIN( I ) = 1 + 550 CONTINUE + END IF +* +* Read the values for INWIN. +* + IF( NEP ) THEN + READ( NIN, FMT = * )( INWIN( I ), I = 1, NPARMS ) + DO 560 I = 1, NPARMS + IF( INWIN( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' INWIN ', INWIN( I ), 0 + FATAL = .TRUE. + END IF + 560 CONTINUE + WRITE( NOUT, FMT = 9983 )'INWIN: ', + $ ( INWIN( I ), I = 1, NPARMS ) + ELSE + DO 570 I = 1, NPARMS + INWIN( I ) = 1 + 570 CONTINUE + END IF +* +* Read the values for INIBL. +* + IF( NEP ) THEN + READ( NIN, FMT = * )( INIBL( I ), I = 1, NPARMS ) + DO 580 I = 1, NPARMS + IF( INIBL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' INIBL ', INIBL( I ), 0 + FATAL = .TRUE. + END IF + 580 CONTINUE + WRITE( NOUT, FMT = 9983 )'INIBL: ', + $ ( INIBL( I ), I = 1, NPARMS ) + ELSE + DO 590 I = 1, NPARMS + INIBL( I ) = 1 + 590 CONTINUE + END IF +* +* Read the values for ISHFTS. +* + IF( NEP ) THEN + READ( NIN, FMT = * )( ISHFTS( I ), I = 1, NPARMS ) + DO 600 I = 1, NPARMS + IF( ISHFTS( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' ISHFTS ', ISHFTS( I ), 0 + FATAL = .TRUE. + END IF + 600 CONTINUE + WRITE( NOUT, FMT = 9983 )'ISHFTS: ', + $ ( ISHFTS( I ), I = 1, NPARMS ) + ELSE + DO 610 I = 1, NPARMS + ISHFTS( I ) = 1 + 610 CONTINUE + END IF +* +* Read the values for IACC22. +* + IF( NEP .OR. DGG ) THEN + READ( NIN, FMT = * )( IACC22( I ), I = 1, NPARMS ) + DO 620 I = 1, NPARMS + IF( IACC22( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' IACC22 ', IACC22( I ), 0 + FATAL = .TRUE. + END IF + 620 CONTINUE + WRITE( NOUT, FMT = 9983 )'IACC22: ', + $ ( IACC22( I ), I = 1, NPARMS ) + ELSE + DO 630 I = 1, NPARMS + IACC22( I ) = 1 + 630 CONTINUE + END IF +* +* Read the values for NBCOL. +* + IF( DGG ) THEN + READ( NIN, FMT = * )( NBCOL( I ), I = 1, NPARMS ) + DO 160 I = 1, NPARMS + IF( NBCOL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )'NBCOL ', NBCOL( I ), 0 + FATAL = .TRUE. + ELSE IF( NBCOL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )'NBCOL ', NBCOL( I ), NMAX + FATAL = .TRUE. + END IF + 160 CONTINUE + WRITE( NOUT, FMT = 9983 )'NBCOL:', + $ ( NBCOL( I ), I = 1, NPARMS ) + ELSE + DO 170 I = 1, NPARMS + NBCOL( I ) = 1 + 170 CONTINUE + END IF + END IF +* +* Calculate and print the machine dependent constants. +* + WRITE( NOUT, FMT = * ) + EPS = DLAMCH( 'Underflow threshold' ) + WRITE( NOUT, FMT = 9981 )'underflow', EPS + EPS = DLAMCH( 'Overflow threshold' ) + WRITE( NOUT, FMT = 9981 )'overflow ', EPS + EPS = DLAMCH( 'Epsilon' ) + WRITE( NOUT, FMT = 9981 )'precision', EPS +* +* Read the threshold value for the test ratios. +* + READ( NIN, FMT = * )THRESH + WRITE( NOUT, FMT = 9982 )THRESH + IF( SEP .OR. SVD .OR. DGG ) THEN +* +* Read the flag that indicates whether to test LAPACK routines. +* + READ( NIN, FMT = * )TSTCHK +* +* Read the flag that indicates whether to test driver routines. +* + READ( NIN, FMT = * )TSTDRV + END IF +* +* Read the flag that indicates whether to test the error exits. 
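+* (Unlike TSTCHK and TSTDRV just above, which are read only when +* the path is SEP, SVD or DGG, TSTERR is read for every path.)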
+* + READ( NIN, FMT = * )TSTERR +* +* Read the code describing how to set the random number seed. +* + READ( NIN, FMT = * )NEWSD +* +* If NEWSD = 2, read another line with 4 integers for the seed. +* + IF( NEWSD.EQ.2 ) + $ READ( NIN, FMT = * )( IOLDSD( I ), I = 1, 4 ) +* + DO 180 I = 1, 4 + ISEED( I ) = IOLDSD( I ) + 180 CONTINUE +* + IF( FATAL ) THEN + WRITE( NOUT, FMT = 9999 ) + STOP + END IF +* +* Read the input lines indicating the test path and its parameters. +* The first three characters indicate the test path, and the number +* of test matrix types must be the first nonblank item in columns +* 4-80. +* + 190 CONTINUE +* + IF( .NOT.( DGX .OR. DXV ) ) THEN +* + 200 CONTINUE + READ( NIN, FMT = '(A80)', END = 380 )LINE + C3 = LINE( 1: 3 ) + LENP = LEN( LINE ) + I = 3 + ITMP = 0 + I1 = 0 + 210 CONTINUE + I = I + 1 + IF( I.GT.LENP ) THEN + IF( I1.GT.0 ) THEN + GO TO 240 + ELSE + NTYPES = MAXT + GO TO 240 + END IF + END IF + IF( LINE( I: I ).NE.' ' .AND. LINE( I: I ).NE.',' ) THEN + I1 = I + C1 = LINE( I1: I1 ) +* +* Check that a valid integer was read +* + DO 220 K = 1, 10 + IF( C1.EQ.INTSTR( K: K ) ) THEN + IC = K - 1 + GO TO 230 + END IF + 220 CONTINUE + WRITE( NOUT, FMT = 9991 )I, LINE + GO TO 200 + 230 CONTINUE + ITMP = 10*ITMP + IC + GO TO 210 + ELSE IF( I1.GT.0 ) THEN + GO TO 240 + ELSE + GO TO 210 + END IF + 240 CONTINUE + NTYPES = ITMP +* +* Skip the tests if NTYPES is <= 0. +* + IF( .NOT.( DEV .OR. DES .OR. DVX .OR. DSX .OR. DGV .OR. + $ DGS ) .AND. NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + GO TO 200 + END IF +* + ELSE + IF( DXV ) + $ C3 = 'DXV' + IF( DGX ) + $ C3 = 'DGX' + END IF +* +* Reset the random number seed. +* + IF( NEWSD.EQ.0 ) THEN + DO 250 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 250 CONTINUE + END IF +* + IF( LSAMEN( 3, C3, 'DHS' ) .OR. LSAMEN( 3, C3, 'NEP' ) ) THEN +* +* ------------------------------------- +* NEP: Nonsymmetric Eigenvalue Problem +* ------------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NX = crossover point +* NS = number of shifts +* MAXB = minimum submatrix size +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 1, 1 ) + IF( TSTERR ) + $ CALL DERRHS( 'DHSEQR', NOUT ) + DO 270 I = 1, NPARMS + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 3, NXVAL( I ) ) + CALL XLAENV(12, MAX( 11, INMIN( I ) ) ) + CALL XLAENV(13, INWIN( I ) ) + CALL XLAENV(14, INIBL( I ) ) + CALL XLAENV(15, ISHFTS( I ) ) + CALL XLAENV(16, IACC22( I ) ) +* + IF( NEWSD.EQ.0 ) THEN + DO 260 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 260 CONTINUE + END IF + WRITE( NOUT, FMT = 9961 )C3, NBVAL( I ), NBMIN( I ), + $ NXVAL( I ), MAX( 11, INMIN(I)), + $ INWIN( I ), INIBL( I ), ISHFTS( I ), IACC22( I ) + CALL DCHKHS( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 5 ), NMAX, A( 1, 6 ), + $ A( 1, 7 ), D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), + $ D( 1, 4 ), D( 1, 5 ), D( 1, 6 ), A( 1, 8 ), + $ A( 1, 9 ), A( 1, 10 ), A( 1, 11 ), A( 1, 12 ), + $ D( 1, 7 ), WORK, LWORK, IWORK, LOGWRK, RESULT, + $ INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DCHKHS', INFO + 270 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'DST' ) .OR. LSAMEN( 3, C3, 'SEP' ) + $ .OR. 
LSAMEN( 3, C3, 'SE2' ) ) THEN +* +* ---------------------------------- +* SEP: Symmetric Eigenvalue Problem +* ---------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NX = crossover point +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 1, 1 ) + CALL XLAENV( 9, 25 ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL DERRST( 'DST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF + DO 290 I = 1, NPARMS + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 3, NXVAL( I ) ) +* + IF( NEWSD.EQ.0 ) THEN + DO 280 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 280 CONTINUE + END IF + WRITE( NOUT, FMT = 9997 )C3, NBVAL( I ), NBMIN( I ), + $ NXVAL( I ) + IF( TSTCHK ) THEN + IF( LSAMEN( 3, C3, 'SE2' ) ) THEN + CALL DCHKST2STG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), D( 1, 1 ), + $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), + $ D( 1, 6 ), D( 1, 7 ), D( 1, 8 ), D( 1, 9 ), + $ D( 1, 10 ), D( 1, 11 ), A( 1, 3 ), NMAX, + $ A( 1, 4 ), A( 1, 5 ), D( 1, 12 ), A( 1, 6 ), + $ WORK, LWORK, IWORK, LIWORK, RESULT, INFO ) + ELSE + CALL DCHKST( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), D( 1, 1 ), + $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), + $ D( 1, 6 ), D( 1, 7 ), D( 1, 8 ), D( 1, 9 ), + $ D( 1, 10 ), D( 1, 11 ), A( 1, 3 ), NMAX, + $ A( 1, 4 ), A( 1, 5 ), D( 1, 12 ), A( 1, 6 ), + $ WORK, LWORK, IWORK, LIWORK, RESULT, INFO ) + ENDIF + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DCHKST', INFO + END IF + IF( TSTDRV ) THEN + IF( LSAMEN( 3, C3, 'SE2' ) ) THEN + CALL DDRVST2STG( NN, NVAL, 18, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 1 ), NMAX, D( 1, 3 ), D( 1, 4 ), + $ D( 1, 5 ), D( 1, 6 ), D( 1, 8 ), D( 1, 9 ), + $ D( 1, 10 ), D( 1, 11 ), A( 1, 2 ), NMAX, + $ A( 1, 3 ), D( 1, 12 ), A( 1, 4 ), WORK, + $ LWORK, IWORK, LIWORK, RESULT, INFO ) + ELSE + CALL DDRVST( NN, NVAL, 18, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, D( 1, 3 ), D( 1, 4 ), + $ D( 1, 5 ), D( 1, 6 ), D( 1, 8 ), D( 1, 9 ), + $ D( 1, 10 ), D( 1, 11 ), A( 1, 2 ), NMAX, + $ A( 1, 3 ), D( 1, 12 ), A( 1, 4 ), WORK, + $ LWORK, IWORK, LIWORK, RESULT, INFO ) + ENDIF + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DDRVST', INFO + END IF + 290 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'DSG' ) ) THEN +* +* ---------------------------------------------- +* DSG: Symmetric Generalized Eigenvalue Problem +* ---------------------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NX = crossover point +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 9, 25 ) + DO 310 I = 1, NPARMS + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 3, NXVAL( I ) ) +* + IF( NEWSD.EQ.0 ) THEN + DO 300 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 300 CONTINUE + END IF + WRITE( NOUT, FMT = 9997 )C3, NBVAL( I ), NBMIN( I ), + $ NXVAL( I ) + IF( TSTCHK ) THEN +* CALL DDRVSG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, +* $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, +* $ D( 1, 3 ), A( 1, 3 ), NMAX, A( 1, 4 ), +* $ A( 1, 5 ), A( 1, 6 ), A( 1, 7 ), WORK, +* $ LWORK, IWORK, LIWORK, RESULT, INFO ) + CALL DDRVSG2STG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, + $ D( 1, 3 ), D( 1, 3 ), A( 1, 3 ), NMAX, + $ A( 
1, 4 ), A( 1, 5 ), A( 1, 6 ), + $ A( 1, 7 ), WORK, LWORK, IWORK, LIWORK, + $ RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DDRVSG', INFO + END IF + 310 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'DBD' ) .OR. LSAMEN( 3, C3, 'SVD' ) ) THEN +* +* ---------------------------------- +* SVD: Singular Value Decomposition +* ---------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NX = crossover point +* NRHS = number of right hand sides +* + MAXTYP = 16 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 1, 1 ) + CALL XLAENV( 9, 25 ) +* +* Test the error exits +* + IF( TSTERR .AND. TSTCHK ) + $ CALL DERRBD( 'DBD', NOUT ) + IF( TSTERR .AND. TSTDRV ) + $ CALL DERRED( 'DBD', NOUT ) +* + DO 330 I = 1, NPARMS + NRHS = NSVAL( I ) + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 3, NXVAL( I ) ) + IF( NEWSD.EQ.0 ) THEN + DO 320 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 320 CONTINUE + END IF + WRITE( NOUT, FMT = 9995 )C3, NBVAL( I ), NBMIN( I ), + $ NXVAL( I ), NRHS + IF( TSTCHK ) THEN + CALL DCHKBD( NN, MVAL, NVAL, MAXTYP, DOTYPE, NRHS, ISEED, + $ THRESH, A( 1, 1 ), NMAX, D( 1, 1 ), + $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), A( 1, 2 ), + $ NMAX, A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), NMAX, + $ A( 1, 6 ), NMAX, A( 1, 7 ), A( 1, 8 ), WORK, + $ LWORK, IWORK, NOUT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DCHKBD', INFO + END IF + IF( TSTDRV ) + $ CALL DDRVBD( NN, MVAL, NVAL, MAXTYP, DOTYPE, ISEED, + $ THRESH, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, + $ A( 1, 3 ), NMAX, A( 1, 4 ), A( 1, 5 ), + $ A( 1, 6 ), D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), + $ WORK, LWORK, IWORK, NOUT, INFO ) + 330 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'DEV' ) ) THEN +* +* -------------------------------------------- +* DEV: Nonsymmetric Eigenvalue Problem Driver +* DGEEV (eigenvalues and eigenvectors) +* -------------------------------------------- +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL DERRED( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL DDRVEV( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), D( 1, 1 ), + $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), A( 1, 3 ), + $ NMAX, A( 1, 4 ), NMAX, A( 1, 5 ), NMAX, RESULT, + $ WORK, LWORK, IWORK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DGEEV', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'DES' ) ) THEN +* +* -------------------------------------------- +* DES: Nonsymmetric Eigenvalue Problem Driver +* DGEES (Schur form) +* -------------------------------------------- +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL DERRED( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL DDRVES( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), + $ A( 1, 4 ), NMAX, RESULT, WORK, LWORK, IWORK, + $ LOGWRK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DGEES', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'DVX' ) ) THEN +* +* -------------------------------------------------------------- +* DVX: Nonsymmetric Eigenvalue Problem Expert Driver +* DGEEVX (eigenvalues, eigenvectors and condition numbers) +* 
-------------------------------------------------------------- +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LT.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL DERRED( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL DDRVVX( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NIN, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), D( 1, 1 ), + $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), A( 1, 3 ), + $ NMAX, A( 1, 4 ), NMAX, A( 1, 5 ), NMAX, + $ D( 1, 5 ), D( 1, 6 ), D( 1, 7 ), D( 1, 8 ), + $ D( 1, 9 ), D( 1, 10 ), D( 1, 11 ), D( 1, 12 ), + $ RESULT, WORK, LWORK, IWORK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DGEEVX', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'DSX' ) ) THEN +* +* --------------------------------------------------- +* DSX: Nonsymmetric Eigenvalue Problem Expert Driver +* DGEESX (Schur form and condition numbers) +* --------------------------------------------------- +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LT.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL DERRED( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL DDRVSX( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NIN, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), + $ D( 1, 5 ), D( 1, 6 ), A( 1, 4 ), NMAX, + $ A( 1, 5 ), RESULT, WORK, LWORK, IWORK, LOGWRK, + $ INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DGEESX', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'DGG' ) ) THEN +* +* ------------------------------------------------- +* DGG: Generalized Nonsymmetric Eigenvalue Problem +* ------------------------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NS = number of shifts +* MAXB = minimum submatrix size +* IACC22: structured matrix multiply +* NBCOL = minimum column dimension for blocks +* + MAXTYP = 26 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV(1,1) + IF( TSTCHK .AND. TSTERR ) + $ CALL DERRGG( C3, NOUT ) + DO 350 I = 1, NPARMS + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 4, NSVAL( I ) ) + CALL XLAENV( 8, MXBVAL( I ) ) + CALL XLAENV( 16, IACC22( I ) ) + CALL XLAENV( 5, NBCOL( I ) ) +* + IF( NEWSD.EQ.0 ) THEN + DO 340 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 340 CONTINUE + END IF + WRITE( NOUT, FMT = 9996 )C3, NBVAL( I ), NBMIN( I ), + $ NSVAL( I ), MXBVAL( I ), IACC22( I ), NBCOL( I ) + TSTDIF = .FALSE. 
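+* (TSTDIF and THRSHN are fixed here rather than read from the +* input file; presumably .FALSE. disables DCHKGG's optional +* difference tests and 10 is its customary secondary threshold.)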
+ THRSHN = 10.D0 + IF( TSTCHK ) THEN + CALL DCHKGG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, + $ TSTDIF, THRSHN, NOUT, A( 1, 1 ), NMAX, + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ A( 1, 6 ), A( 1, 7 ), A( 1, 8 ), A( 1, 9 ), + $ NMAX, A( 1, 10 ), A( 1, 11 ), A( 1, 12 ), + $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), + $ D( 1, 5 ), D( 1, 6 ), A( 1, 13 ), + $ A( 1, 14 ), WORK, LWORK, LOGWRK, RESULT, + $ INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DCHKGG', INFO + END IF + 350 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'DGS' ) ) THEN +* +* ------------------------------------------------- +* DGS: Generalized Nonsymmetric Eigenvalue Problem +* DGGES (Schur form) +* ------------------------------------------------- +* + MAXTYP = 26 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL DERRGG( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL DDRGES( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), + $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), WORK, LWORK, + $ RESULT, LOGWRK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DDRGES', INFO +* +* Blocked version +* + CALL XLAENV(16, 2) + CALL DDRGES3( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), + $ D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), WORK, LWORK, + $ RESULT, LOGWRK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DDRGES3', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( DGX ) THEN +* +* ------------------------------------------------- +* DGX: Generalized Nonsymmetric Eigenvalue Problem +* DGGESX (Schur form and condition numbers) +* ------------------------------------------------- +* + MAXTYP = 5 + NTYPES = MAXTYP + IF( NN.LT.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL DERRGG( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 5, 2 ) + CALL DDRGSX( NN, NCMAX, THRESH, NIN, NOUT, A( 1, 1 ), NMAX, + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ A( 1, 6 ), D( 1, 1 ), D( 1, 2 ), D( 1, 3 ), + $ C( 1, 1 ), NCMAX*NCMAX, A( 1, 12 ), WORK, + $ LWORK, IWORK, LIWORK, LOGWRK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DDRGSX', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'DGV' ) ) THEN +* +* ------------------------------------------------- +* DGV: Generalized Nonsymmetric Eigenvalue Problem +* DGGEV (Eigenvalue/vector form) +* ------------------------------------------------- +* + MAXTYP = 26 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL DERRGG( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL DDRGEV( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), + $ A( 1, 9 ), NMAX, D( 1, 1 ), D( 1, 2 ), + $ D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), D( 1, 6 ), + $ WORK, LWORK, RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DDRGEV', INFO +* +* Blocked version +* + CALL DDRGEV3( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), + $ A( 1, 9 ), NMAX, D( 1, 1 ), D( 1, 2 ), + $ D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), D( 1, 6 ), + $ WORK, LWORK, RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DDRGEV3', INFO + 
END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( DXV ) THEN +* +* ------------------------------------------------- +* DXV: Generalized Nonsymmetric Eigenvalue Problem +* DGGEVX (eigenvalue/vector with condition numbers) +* ------------------------------------------------- +* + MAXTYP = 2 + NTYPES = MAXTYP + IF( NN.LT.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL DERRGG( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL DDRGVX( NN, THRESH, NIN, NOUT, A( 1, 1 ), NMAX, + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), D( 1, 1 ), + $ D( 1, 2 ), D( 1, 3 ), A( 1, 5 ), A( 1, 6 ), + $ IWORK( 1 ), IWORK( 2 ), D( 1, 4 ), D( 1, 5 ), + $ D( 1, 6 ), D( 1, 7 ), D( 1, 8 ), D( 1, 9 ), + $ WORK, LWORK, IWORK( 3 ), LIWORK-2, RESULT, + $ LOGWRK, INFO ) +* + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DDRGVX', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'DSB' ) ) THEN +* +* ------------------------------ +* DSB: Symmetric Band Reduction +* ------------------------------ +* + MAXTYP = 15 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + IF( TSTERR ) + $ CALL DERRST( 'DSB', NOUT ) +* CALL DCHKSB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, +* $ NOUT, A( 1, 1 ), NMAX, D( 1, 1 ), D( 1, 2 ), +* $ A( 1, 2 ), NMAX, WORK, LWORK, RESULT, INFO ) + CALL DCHKSB2STG( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, + $ THRESH, NOUT, A( 1, 1 ), NMAX, D( 1, 1 ), + $ D( 1, 2 ), D( 1, 3 ), D( 1, 4 ), D( 1, 5 ), + $ A( 1, 2 ), NMAX, WORK, LWORK, RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DCHKSB', INFO +* + ELSE IF( LSAMEN( 3, C3, 'DBB' ) ) THEN +* +* ------------------------------ +* DBB: General Band Reduction +* ------------------------------ +* + MAXTYP = 15 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + DO 370 I = 1, NPARMS + NRHS = NSVAL( I ) +* + IF( NEWSD.EQ.0 ) THEN + DO 360 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 360 CONTINUE + END IF + WRITE( NOUT, FMT = 9966 )C3, NRHS + CALL DCHKBB( NN, MVAL, NVAL, NK, KVAL, MAXTYP, DOTYPE, NRHS, + $ ISEED, THRESH, NOUT, A( 1, 1 ), NMAX, + $ A( 1, 2 ), 2*NMAX, D( 1, 1 ), D( 1, 2 ), + $ A( 1, 4 ), NMAX, A( 1, 5 ), NMAX, A( 1, 6 ), + $ NMAX, A( 1, 7 ), WORK, LWORK, RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DCHKBB', INFO + 370 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'GLM' ) ) THEN +* +* ----------------------------------------- +* GLM: Generalized Linear Regression Model +* ----------------------------------------- +* + CALL XLAENV( 1, 1 ) + IF( TSTERR ) + $ CALL DERRGG( 'GLM', NOUT ) + CALL DCKGLM( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), X, + $ WORK, D( 1, 1 ), NIN, NOUT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DCKGLM', INFO +* + ELSE IF( LSAMEN( 3, C3, 'GQR' ) ) THEN +* +* ------------------------------------------ +* GQR: Generalized QR and RQ factorizations +* ------------------------------------------ +* + CALL XLAENV( 1, 1 ) + IF( TSTERR ) + $ CALL DERRGG( 'GQR', NOUT ) + CALL DCKGQR( NN, MVAL, NN, PVAL, NN, NVAL, NTYPES, ISEED, + $ THRESH, NMAX, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), TAUA, B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ B( 1, 4 ), B( 1, 5 ), TAUB, WORK, D( 1, 1 ), NIN, + $ NOUT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DCKGQR', INFO +* + ELSE IF( LSAMEN( 3, C3, 'GSV' ) ) THEN +* +* ---------------------------------------------- +* GSV: Generalized Singular Value 
Decomposition +* ---------------------------------------------- +* + CALL XLAENV(1,1) + IF( TSTERR ) + $ CALL DERRGG( 'GSV', NOUT ) + CALL DCKGSV( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ A( 1, 3 ), B( 1, 3 ), A( 1, 4 ), TAUA, TAUB, + $ B( 1, 4 ), IWORK, WORK, D( 1, 1 ), NIN, NOUT, + $ INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DCKGSV', INFO +* + ELSE IF( LSAMEN( 3, C3, 'CSD' ) ) THEN +* +* ---------------------------------------------- +* CSD: CS Decomposition +* ---------------------------------------------- +* + CALL XLAENV(1,1) + IF( TSTERR ) + $ CALL DERRGG( 'CSD', NOUT ) + CALL DCKCSD( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), + $ A( 1, 5 ), A( 1, 6 ), A( 1, 7 ), IWORK, WORK, + $ D( 1, 1 ), NIN, NOUT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DCKCSD', INFO +* + ELSE IF( LSAMEN( 3, C3, 'LSE' ) ) THEN +* +* -------------------------------------- +* LSE: Constrained Linear Least Squares +* -------------------------------------- +* + CALL XLAENV( 1, 1 ) + IF( TSTERR ) + $ CALL DERRGG( 'LSE', NOUT ) + CALL DCKLSE( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), X, + $ WORK, D( 1, 1 ), NIN, NOUT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'DCKLSE', INFO +* + ELSE + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9992 )C3 + END IF + IF( .NOT.( DGX .OR. DXV ) ) + $ GO TO 190 + 380 CONTINUE + WRITE( NOUT, FMT = 9994 ) + S2 = DSECND( ) + WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) +* + 9999 FORMAT( / ' Execution not attempted due to input errors' ) + 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) + 9996 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NS =', I4, + $ ', MAXB =', I4, ', IACC22 =', I4, ', NBCOL =', I4 ) + 9995 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4, + $ ', NRHS =', I4 ) + 9994 FORMAT( / / ' End of tests' ) + 9993 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) + 9992 FORMAT( 1X, A3, ': Unrecognized path name' ) + 9991 FORMAT( / / ' *** Invalid integer value in column ', I2, + $ ' of input', ' line:', / A79 ) + 9990 FORMAT( / / 1X, A3, ' routines were not tested' ) + 9989 FORMAT( ' Invalid input value: ', A, '=', I6, '; must be >=', + $ I6 ) + 9988 FORMAT( ' Invalid input value: ', A, '=', I6, '; must be <=', + $ I6 ) + 9987 FORMAT( ' Tests of the Nonsymmetric Eigenvalue Problem routines' ) + 9986 FORMAT( ' Tests of the Symmetric Eigenvalue Problem routines' ) + 9985 FORMAT( ' Tests of the Singular Value Decomposition routines' ) + 9984 FORMAT( / ' The following parameter values will be used:' ) + 9983 FORMAT( 4X, A, 10I6, / 10X, 10I6 ) + 9982 FORMAT( / ' Routines pass computational tests if test ratio is ', + $ 'less than', F8.2, / ) + 9981 FORMAT( ' Relative machine ', A, ' is taken to be', D16.6 ) + 9980 FORMAT( ' *** Error code from ', A, ' = ', I4 ) + 9979 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Driver', + $ / ' DGEEV (eigenvalues and eigenvectors)' ) + 9978 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Driver', + $ / ' DGEES (Schur form)' ) + 9977 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Expert', + $ ' Driver', / ' DGEEVX (eigenvalues, eigenvectors and', + $ ' condition numbers)' ) + 9976 FORMAT( /
' Tests of the Nonsymmetric Eigenvalue Problem Expert', + $ ' Driver', / ' DGEESX (Schur form and condition', + $ ' numbers)' ) + 9975 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem routines' ) + 9974 FORMAT( ' Tests of DSBTRD', / ' (reduction of a symmetric band ', + $ 'matrix to tridiagonal form)' ) + 9973 FORMAT( / 1X, 71( '-' ) ) + 9972 FORMAT( / ' LAPACK VERSION ', I1, '.', I1, '.', I1 ) + 9971 FORMAT( / ' Tests of the Generalized Linear Regression Model ', + $ 'routines' ) + 9970 FORMAT( / ' Tests of the Generalized QR and RQ routines' ) + 9969 FORMAT( / ' Tests of the Generalized Singular Value', + $ ' Decomposition routines' ) + 9968 FORMAT( / ' Tests of the Linear Least Squares routines' ) + 9967 FORMAT( ' Tests of DGBBRD', / ' (reduction of a general band ', + $ 'matrix to real bidiagonal form)' ) + 9966 FORMAT( / / 1X, A3, ': NRHS =', I4 ) + 9965 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem Expert Driver DGGESX' ) + 9964 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem Driver DGGES' ) + 9963 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem Driver DGGEV' ) + 9962 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem Expert Driver DGGEVX' ) + 9961 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4, + $ ', INMIN=', I4, + $ ', INWIN =', I4, ', INIBL =', I4, ', ISHFTS =', I4, + $ ', IACC22 =', I4) + 9960 FORMAT( / ' Tests of the CS Decomposition routines' ) +* +* End of DCHKEE +* + END From 9cf861e8faf21cbd623ef762127767d241a86088 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Feb 2021 18:51:03 +0100 Subject: [PATCH 129/681] Add rewritten cchkee.F from Reference-LAPACK PR335 --- lapack-netlib/TESTING/EIG/cchkee.F | 2553 ++++++++++++++++++++++++++++ 1 file changed, 2553 insertions(+) create mode 100644 lapack-netlib/TESTING/EIG/cchkee.F diff --git a/lapack-netlib/TESTING/EIG/cchkee.F b/lapack-netlib/TESTING/EIG/cchkee.F new file mode 100644 index 000000000..0d3d7493c --- /dev/null +++ b/lapack-netlib/TESTING/EIG/cchkee.F @@ -0,0 +1,2553 @@ +*> \brief \b CCHKEE +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CCHKEE +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CCHKEE tests the COMPLEX LAPACK subroutines for the matrix +*> eigenvalue problem. 
The test paths in this version are +*> +*> NEP (Nonsymmetric Eigenvalue Problem): +*> Test CGEHRD, CUNGHR, CHSEQR, CTREVC, CHSEIN, and CUNMHR +*> +*> SEP (Hermitian Eigenvalue Problem): +*> Test CHETRD, CUNGTR, CSTEQR, CSTERF, CSTEIN, CSTEDC, +*> and drivers CHEEV(X), CHBEV(X), CHPEV(X), +*> CHEEVD, CHBEVD, CHPEVD +*> +*> SVD (Singular Value Decomposition): +*> Test CGEBRD, CUNGBR, and CBDSQR +*> and the drivers CGESVD, CGESDD +*> +*> CEV (Nonsymmetric Eigenvalue/eigenvector Driver): +*> Test CGEEV +*> +*> CES (Nonsymmetric Schur form Driver): +*> Test CGEES +*> +*> CVX (Nonsymmetric Eigenvalue/eigenvector Expert Driver): +*> Test CGEEVX +*> +*> CSX (Nonsymmetric Schur form Expert Driver): +*> Test CGEESX +*> +*> CGG (Generalized Nonsymmetric Eigenvalue Problem): +*> Test CGGHD3, CGGBAL, CGGBAK, CHGEQZ, and CTGEVC +*> +*> CGS (Generalized Nonsymmetric Schur form Driver): +*> Test CGGES +*> +*> CGV (Generalized Nonsymmetric Eigenvalue/eigenvector Driver): +*> Test CGGEV +*> +*> CGX (Generalized Nonsymmetric Schur form Expert Driver): +*> Test CGGESX +*> +*> CXV (Generalized Nonsymmetric Eigenvalue/eigenvector Expert Driver): +*> Test CGGEVX +*> +*> CSG (Hermitian Generalized Eigenvalue Problem): +*> Test CHEGST, CHEGV, CHEGVD, CHEGVX, CHPGST, CHPGV, CHPGVD, +*> CHPGVX, CHBGST, CHBGV, CHBGVD, and CHBGVX +*> +*> CHB (Hermitian Band Eigenvalue Problem): +*> Test CHBTRD +*> +*> CBB (Band Singular Value Decomposition): +*> Test CGBBRD +*> +*> CEC (Eigencondition estimation): +*> Test CTRSYL, CTREXC, CTRSNA, and CTRSEN +*> +*> CBL (Balancing a general matrix) +*> Test CGEBAL +*> +*> CBK (Back transformation on a balanced matrix) +*> Test CGEBAK +*> +*> CGL (Balancing a matrix pair) +*> Test CGGBAL +*> +*> CGK (Back transformation on a matrix pair) +*> Test CGGBAK +*> +*> GLM (Generalized Linear Regression Model): +*> Tests CGGGLM +*> +*> GQR (Generalized QR and RQ factorizations): +*> Tests CGGQRF and CGGRQF +*> +*> GSV (Generalized Singular Value Decomposition): +*> Tests CGGSVD, CGGSVP, CTGSJA, CLAGS2, CLAPLL, and CLAPMT +*> +*> CSD (CS decomposition): +*> Tests CUNCSD +*> +*> LSE (Constrained Linear Least Squares): +*> Tests CGGLSE +*> +*> Each test path has a different set of inputs, but the data sets for +*> the driver routines xEV, xES, xVX, and xSX can be concatenated in a +*> single input file. The first line of input should contain one of the +*> 3-character path names in columns 1-3. The number of remaining lines +*> depends on what is found on the first line. +*> +*> The number of matrix types used in testing is often controllable from +*> the input file. The number of matrix types for each path, and the +*> test routine that describes them, is as follows: +*> +*> Path name(s) Types Test routine +*> +*> CHS or NEP 21 CCHKHS +*> CST or SEP 21 CCHKST (routines) +*> 18 CDRVST (drivers) +*> CBD or SVD 16 CCHKBD (routines) +*> 5 CDRVBD (drivers) +*> CEV 21 CDRVEV +*> CES 21 CDRVES +*> CVX 21 CDRVVX +*> CSX 21 CDRVSX +*> CGG 26 CCHKGG (routines) +*> CGS 26 CDRGES +*> CGX 5 CDRGSX +*> CGV 26 CDRGEV +*> CXV 2 CDRGVX +*> CSG 21 CDRVSG +*> CHB 15 CCHKHB +*> CBB 15 CCHKBB +*> CEC - CCHKEC +*> CBL - CCHKBL +*> CBK - CCHKBK +*> CGL - CCHKGL +*> CGK - CCHKGK +*> GLM 8 CCKGLM +*> GQR 8 CCKGQR +*> GSV 8 CCKGSV +*> CSD 3 CCKCSD +*> LSE 8 CCKLSE +*> +*>----------------------------------------------------------------------- +*> +*> NEP input file: +*> +*> line 2: NN, INTEGER +*> Number of values of N. +*> +*> line 3: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix dimension N. 
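+*> For example (an illustrative sketch, not one of the distributed +*> input decks), lines 2 and 3 could read +*> 2 +*> 32 64 +*> to request runs with matrix dimensions 32 and 64.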
+*> +*> line 4: NPARMS, INTEGER +*> Number of values of the parameters NB, NBMIN, NX, INMIN, +*> INWIN, INIBL, ISHFTS, and IACC22. +*> +*> line 5: NBVAL, INTEGER array, dimension (NPARMS) +*> The values for the blocksize NB. +*> +*> line 6: NBMIN, INTEGER array, dimension (NPARMS) +*> The values for the minimum blocksize NBMIN. +*> +*> line 7: NXVAL, INTEGER array, dimension (NPARMS) +*> The values for the crossover point NX. +*> +*> line 8: INMIN, INTEGER array, dimension (NPARMS) +*> LAHQR vs TTQRE crossover point, >= 11 +*> +*> line 9: INWIN, INTEGER array, dimension (NPARMS) +*> recommended deflation window size +*> +*> line 10: INIBL, INTEGER array, dimension (NPARMS) +*> nibble crossover point +*> +*> line 11: ISHFTS, INTEGER array, dimension (NPARMS) +*> number of simultaneous shifts +*> +*> line 12: IACC22, INTEGER array, dimension (NPARMS) +*> select structured matrix multiply: 0, 1 or 2 +*> +*> line 13: THRESH +*> Threshold value for the test ratios.  Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold.  To have all of the test +*> ratios printed, use THRESH = 0.0 . +*> +*> line 14: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 14 was 2: +*> +*> line 15: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 15-EOF: The remaining lines occur in sets of 1 or 2 and allow +*> the user to specify the matrix types.  Each line contains +*> a 3-character path name in columns 1-3, and the number +*> of matrix types must be the first nonblank item in columns +*> 4-80.  If the number of matrix types is at least 1 but is +*> less than the maximum number of possible types, a second +*> line will be read to get the numbers of the matrix types to +*> be used.  For example, +*> NEP 21 +*> requests all of the matrix types for the nonsymmetric +*> eigenvalue problem, while +*> NEP 4 +*> 9 10 11 12 +*> requests only matrices of type 9, 10, 11, and 12. +*> +*> The valid 3-character path names are 'NEP' or 'CHS' for the +*> nonsymmetric eigenvalue routines. +*> +*>----------------------------------------------------------------------- +*> +*> SEP or CSG input file: +*> +*> line 2: NN, INTEGER +*> Number of values of N. +*> +*> line 3: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix dimension N. +*> +*> line 4: NPARMS, INTEGER +*> Number of values of the parameters NB, NBMIN, and NX. +*> +*> line 5: NBVAL, INTEGER array, dimension (NPARMS) +*> The values for the blocksize NB. +*> +*> line 6: NBMIN, INTEGER array, dimension (NPARMS) +*> The values for the minimum blocksize NBMIN. +*> +*> line 7: NXVAL, INTEGER array, dimension (NPARMS) +*> The values for the crossover point NX. +*> +*> line 8: THRESH +*> Threshold value for the test ratios.  Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 9: TSTCHK, LOGICAL +*> Flag indicating whether or not to test the LAPACK routines. +*> +*> line 10: TSTDRV, LOGICAL +*> Flag indicating whether or not to test the driver routines. +*> +*> line 11: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 12: NEWSD, INTEGER +*> A code indicating how to set the random number seed.
+*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 12 was 2: +*> +*> line 13: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 13-EOF: Lines specifying matrix types, as for NEP. +*> The valid 3-character path names are 'SEP' or 'CST' for the +*> Hermitian eigenvalue routines and driver routines, and +*> 'CSG' for the routines for the Hermitian generalized +*> eigenvalue problem. +*> +*>----------------------------------------------------------------------- +*> +*> SVD input file: +*> +*> line 2: NN, INTEGER +*> Number of values of M and N. +*> +*> line 3: MVAL, INTEGER array, dimension (NN) +*> The values for the matrix row dimension M. +*> +*> line 4: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix column dimension N. +*> +*> line 5: NPARMS, INTEGER +*> Number of values of the parameter NB, NBMIN, NX, and NRHS. +*> +*> line 6: NBVAL, INTEGER array, dimension (NPARMS) +*> The values for the blocksize NB. +*> +*> line 7: NBMIN, INTEGER array, dimension (NPARMS) +*> The values for the minimum blocksize NBMIN. +*> +*> line 8: NXVAL, INTEGER array, dimension (NPARMS) +*> The values for the crossover point NX. +*> +*> line 9: NSVAL, INTEGER array, dimension (NPARMS) +*> The values for the number of right hand sides NRHS. +*> +*> line 10: THRESH +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 11: TSTCHK, LOGICAL +*> Flag indicating whether or not to test the LAPACK routines. +*> +*> line 12: TSTDRV, LOGICAL +*> Flag indicating whether or not to test the driver routines. +*> +*> line 13: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 14: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 14 was 2: +*> +*> line 15: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 15-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path names are 'SVD' or 'CBD' for both the +*> SVD routines and the SVD driver routines. +*> +*>----------------------------------------------------------------------- +*> +*> CEV and CES data files: +*> +*> line 1: 'CEV' or 'CES' in columns 1 to 3. +*> +*> line 2: NSIZES, INTEGER +*> Number of sizes of matrices to use. Should be at least 0 +*> and at most 20. If NSIZES = 0, no testing is done +*> (although the remaining 3 lines are still read). +*> +*> line 3: NN, INTEGER array, dimension(NSIZES) +*> Dimensions of matrices to be tested. +*> +*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> These integer parameters determine how blocking is done +*> (see ILAENV for details) +*> NB : block size +*> NBMIN : minimum block size +*> NX : minimum dimension for blocking +*> NS : number of shifts in xHSEQR +*> NBCOL : minimum column dimension for blocking +*> +*> line 5: THRESH, REAL +*> The test threshold against which computed residuals are +*> compared. Should generally be in the range from 10. to 20. +*> If it is 0., all test case data will be printed. 
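+*> As an illustration only (values invented, not taken from the +*> distributed decks), lines 2 through 5 might read +*> 2 +*> 10 50 +*> 3 3 1 2 1 +*> 20.0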
+*> +*> line 6: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 6 was 2: +*> +*> line 7: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 8 and following: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'CEV' to test CGEEV, or +*> 'CES' to test CGEES. +*> +*>----------------------------------------------------------------------- +*> +*> The CVX data has two parts. The first part is identical to CEV, +*> and the second part consists of test matrices with precomputed +*> solutions. +*> +*> line 1: 'CVX' in columns 1-3. +*> +*> line 2: NSIZES, INTEGER +*> If NSIZES = 0, no testing of randomly generated examples +*> is done, but any precomputed examples are tested. +*> +*> line 3: NN, INTEGER array, dimension(NSIZES) +*> +*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> +*> line 5: THRESH, REAL +*> +*> line 6: NEWSD, INTEGER +*> +*> If line 6 was 2: +*> +*> line 7: INTEGER array, dimension (4) +*> +*> lines 8 and following: The first line contains 'CVX' in columns 1-3 +*> followed by the number of matrix types, possibly with +*> a second line to specify certain matrix types. +*> If the number of matrix types = 0, no testing of randomly +*> generated examples is done, but any precomputed examples +*> are tested. +*> +*> remaining lines : Each matrix is stored on 1+N+N**2 lines, where N is +*> its dimension. The first line contains the dimension N and +*> ISRT (two integers). ISRT indicates whether the last N lines +*> are sorted by increasing real part of the eigenvalue +*> (ISRT=0) or by increasing imaginary part (ISRT=1). The next +*> N**2 lines contain the matrix rowwise, one entry per line. +*> The last N lines correspond to each eigenvalue. Each of +*> these last N lines contains 4 real values: the real part of +*> the eigenvalues, the imaginary part of the eigenvalue, the +*> reciprocal condition number of the eigenvalues, and the +*> reciprocal condition number of the vector eigenvector. The +*> end of data is indicated by dimension N=0. Even if no data +*> is to be tested, there must be at least one line containing +*> N=0. +*> +*>----------------------------------------------------------------------- +*> +*> The CSX data is like CVX. The first part is identical to CEV, and the +*> second part consists of test matrices with precomputed solutions. +*> +*> line 1: 'CSX' in columns 1-3. +*> +*> line 2: NSIZES, INTEGER +*> If NSIZES = 0, no testing of randomly generated examples +*> is done, but any precomputed examples are tested. +*> +*> line 3: NN, INTEGER array, dimension(NSIZES) +*> +*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> +*> line 5: THRESH, REAL +*> +*> line 6: NEWSD, INTEGER +*> +*> If line 6 was 2: +*> +*> line 7: INTEGER array, dimension (4) +*> +*> lines 8 and following: The first line contains 'CSX' in columns 1-3 +*> followed by the number of matrix types, possibly with +*> a second line to specify certain matrix types. +*> If the number of matrix types = 0, no testing of randomly +*> generated examples is done, but any precomputed examples +*> are tested. +*> +*> remaining lines : Each matrix is stored on 3+N**2 lines, where N is +*> its dimension. The first line contains the dimension N, the +*> dimension M of an invariant subspace, and ISRT. 
The second +*> line contains M integers, identifying the eigenvalues in the +*> invariant subspace (by their position in a list of +*> eigenvalues ordered by increasing real part (if ISRT=0) or +*> by increasing imaginary part (if ISRT=1)). The next N**2 +*> lines contain the matrix rowwise. The last line contains the +*> reciprocal condition number for the average of the selected +*> eigenvalues, and the reciprocal condition number for the +*> corresponding right invariant subspace. The end of data is +*> indicated by a line containing N=0, M=0, and ISRT = 0. Even +*> if no data is to be tested, there must be at least one line +*> containing N=0, M=0 and ISRT=0. +*> +*>----------------------------------------------------------------------- +*> +*> CGG input file: +*> +*> line 2: NN, INTEGER +*> Number of values of N. +*> +*> line 3: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix dimension N. +*> +*> line 4: NPARMS, INTEGER +*> Number of values of the parameters NB, NBMIN, NBCOL, NS, and +*> MAXB. +*> +*> line 5: NBVAL, INTEGER array, dimension (NPARMS) +*> The values for the blocksize NB. +*> +*> line 6: NBMIN, INTEGER array, dimension (NPARMS) +*> The values for NBMIN, the minimum row dimension for blocks. +*> +*> line 7: NSVAL, INTEGER array, dimension (NPARMS) +*> The values for the number of shifts. +*> +*> line 8: MXBVAL, INTEGER array, dimension (NPARMS) +*> The values for MAXB, used in determining minimum blocksize. +*> +*> line 9: IACC22, INTEGER array, dimension (NPARMS) +*> select structured matrix multiply: 1 or 2 +*> +*> line 10: NBCOL, INTEGER array, dimension (NPARMS) +*> The values for NBCOL, the minimum column dimension for +*> blocks. +*> +*> line 11: THRESH +*> Threshold value for the test ratios.  Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 12: TSTCHK, LOGICAL +*> Flag indicating whether or not to test the LAPACK routines. +*> +*> line 13: TSTDRV, LOGICAL +*> Flag indicating whether or not to test the driver routines. +*> +*> line 14: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 15: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 15 was 2: +*> +*> line 16: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 17-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'CGG' for the generalized +*> eigenvalue problem routines and driver routines. +*> +*>----------------------------------------------------------------------- +*> +*> CGS and CGV input files: +*> +*> line 1: 'CGS' or 'CGV' in columns 1 to 3. +*> +*> line 2: NN, INTEGER +*> Number of values of N. +*> +*> line 3: NVAL, INTEGER array, dimension(NN) +*> Dimensions of matrices to be tested. +*> +*> line 4: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> These integer parameters determine how blocking is done +*> (see ILAENV for details) +*> NB : block size +*> NBMIN : minimum block size +*> NX : minimum dimension for blocking +*> NS : number of shifts in xHGEQR +*> NBCOL : minimum column dimension for blocking +*> +*> line 5: THRESH, REAL +*> The test threshold against which computed residuals are +*> compared.
Should generally be in the range from 10. to 20. +*> If it is 0., all test case data will be printed. +*> +*> line 6: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits. +*> +*> line 7: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 7 was 2: +*> +*> line 8: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 8-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'CGS' for the generalized +*> eigenvalue problem routines and driver routines. +*> +*>----------------------------------------------------------------------- +*> +*> CGX input file: +*> line 1: 'CGX' in columns 1 to 3. +*> +*> line 2: N, INTEGER +*> Value of N. +*> +*> line 3: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> These integer parameters determine how blocking is done +*> (see ILAENV for details) +*> NB : block size +*> NBMIN : minimum block size +*> NX : minimum dimension for blocking +*> NS : number of shifts in xHGEQR +*> NBCOL : minimum column dimension for blocking +*> +*> line 4: THRESH, REAL +*> The test threshold against which computed residuals are +*> compared. Should generally be in the range from 10. to 20. +*> Information will be printed about each test for which the +*> test ratio is greater than or equal to the threshold. +*> +*> line 5: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 6: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 6 was 2: +*> +*> line 7: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> If line 2 was 0: +*> +*> line 7-EOF: Precomputed examples are tested. +*> +*> remaining lines : Each example is stored on 3+2*N*N lines, where N is +*> its dimension. The first line contains the dimension (a +*> single integer). The next line contains an integer k such +*> that only the last k eigenvalues will be selected and appear +*> in the leading diagonal blocks of $A$ and $B$. The next N*N +*> lines contain the matrix A, one element per line. The next N*N +*> lines contain the matrix B. The last line contains the +*> reciprocal of the eigenvalue cluster condition number and the +*> reciprocal of the deflating subspace (associated with the +*> selected eigencluster) condition number. The end of data is +*> indicated by dimension N=0. Even if no data is to be tested, +*> there must be at least one line containing N=0. +*> +*>----------------------------------------------------------------------- +*> +*> CXV input files: +*> line 1: 'CXV' in columns 1 to 3. +*> +*> line 2: N, INTEGER +*> Value of N. +*> +*> line 3: NB, NBMIN, NX, NS, NBCOL, INTEGERs +*> These integer parameters determine how blocking is done +*> (see ILAENV for details) +*> NB : block size +*> NBMIN : minimum block size +*> NX : minimum dimension for blocking +*> NS : number of shifts in xHGEQR +*> NBCOL : minimum column dimension for blocking +*> +*> line 4: THRESH, REAL +*> The test threshold against which computed residuals are +*> compared.
Should generally be in the range from 10. to 20. +*> Information will be printed about each test for which the +*> test ratio is greater than or equal to the threshold. +*> +*> line 5: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 6: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 6 was 2: +*> +*> line 7: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> If line 2 was 0: +*> +*> line 7-EOF: Precomputed examples are tested. +*> +*> remaining lines : Each example is stored on 3+2*N*N lines, where N is +*> its dimension. The first line contains the dimension (a +*> single integer). The next N*N lines contain the matrix A, one +*> element per line. The next N*N lines contain the matrix B. +*> The next line contains the reciprocals of the eigenvalue +*> condition numbers. The last line contains the reciprocals of +*> the eigenvector condition numbers. The end of data is +*> indicated by dimension N=0. Even if no data is to be tested, +*> there must be at least one line containing N=0. +*> +*>----------------------------------------------------------------------- +*> +*> CHB input file: +*> +*> line 2: NN, INTEGER +*> Number of values of N. +*> +*> line 3: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix dimension N. +*> +*> line 4: NK, INTEGER +*> Number of values of K. +*> +*> line 5: KVAL, INTEGER array, dimension (NK) +*> The values for the matrix dimension K. +*> +*> line 6: THRESH +*> Threshold value for the test ratios.  Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 7: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 7 was 2: +*> +*> line 8: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 8-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'CHB'. +*> +*>----------------------------------------------------------------------- +*> +*> CBB input file: +*> +*> line 2: NN, INTEGER +*> Number of values of M and N. +*> +*> line 3: MVAL, INTEGER array, dimension (NN) +*> The values for the matrix row dimension M. +*> +*> line 4: NVAL, INTEGER array, dimension (NN) +*> The values for the matrix column dimension N. +*> +*> line 5: NK, INTEGER +*> Number of values of K. +*> +*> line 6: KVAL, INTEGER array, dimension (NK) +*> The values for the matrix bandwidth K. +*> +*> line 7: NPARMS, INTEGER +*> Number of values of the parameter NRHS. +*> +*> line 8: NSVAL, INTEGER array, dimension (NPARMS) +*> The values for the number of right hand sides NRHS. +*> +*> line 9: THRESH +*> Threshold value for the test ratios.  Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 10: NEWSD, INTEGER +*> A code indicating how to set the random number seed.
+*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 9 was 2: +*> +*> line 10: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 10-EOF: Lines specifying matrix types, as for SVD. +*> The 3-character path name is 'CBB'. +*> +*>----------------------------------------------------------------------- +*> +*> CEC input file: +*> +*> line 2: THRESH, REAL +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> lines 3-EOF: +*> +*> Input for testing the eigencondition routines consists of a set of +*> specially constructed test cases and their solutions. The data +*> format is not intended to be modified by the user. +*> +*>----------------------------------------------------------------------- +*> +*> CBL and CBK input files: +*> +*> line 1: 'CBL' in columns 1-3 to test CGEBAL, or 'CBK' in +*> columns 1-3 to test CGEBAK. +*> +*> The remaining lines consist of specially constructed test cases. +*> +*>----------------------------------------------------------------------- +*> +*> CGL and CGK input files: +*> +*> line 1: 'CGL' in columns 1-3 to test CGGBAL, or 'CGK' in +*> columns 1-3 to test CGGBAK. +*> +*> The remaining lines consist of specially constructed test cases. +*> +*>----------------------------------------------------------------------- +*> +*> GLM data file: +*> +*> line 1: 'GLM' in columns 1 to 3. +*> +*> line 2: NN, INTEGER +*> Number of values of M, P, and N. +*> +*> line 3: MVAL, INTEGER array, dimension(NN) +*> Values of M (row dimension). +*> +*> line 4: PVAL, INTEGER array, dimension(NN) +*> Values of P (row dimension). +*> +*> line 5: NVAL, INTEGER array, dimension(NN) +*> Values of N (column dimension), note M <= N <= M+P. +*> +*> line 6: THRESH, REAL +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 7: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 8: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 8 was 2: +*> +*> line 9: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 9-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'GLM' for the generalized +*> linear regression model routines. +*> +*>----------------------------------------------------------------------- +*> +*> GQR data file: +*> +*> line 1: 'GQR' in columns 1 to 3. +*> +*> line 2: NN, INTEGER +*> Number of values of M, P, and N. +*> +*> line 3: MVAL, INTEGER array, dimension(NN) +*> Values of M. +*> +*> line 4: PVAL, INTEGER array, dimension(NN) +*> Values of P. +*> +*> line 5: NVAL, INTEGER array, dimension(NN) +*> Values of N. +*> +*> line 6: THRESH, REAL +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. 
+*> +*> line 7: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 8: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 8 was 2: +*> +*> line 9: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 9-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'GQR' for the generalized +*> QR and RQ routines. +*> +*>----------------------------------------------------------------------- +*> +*> GSV data file: +*> +*> line 1: 'GSV' in columns 1 to 3. +*> +*> line 2: NN, INTEGER +*> Number of values of M, P, and N. +*> +*> line 3: MVAL, INTEGER array, dimension(NN) +*> Values of M (row dimension). +*> +*> line 4: PVAL, INTEGER array, dimension(NN) +*> Values of P (row dimension). +*> +*> line 5: NVAL, INTEGER array, dimension(NN) +*> Values of N (column dimension). +*> +*> line 6: THRESH, REAL +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 7: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 8: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 8 was 2: +*> +*> line 9: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 9-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'GSV' for the generalized +*> SVD routines. +*> +*>----------------------------------------------------------------------- +*> +*> CSD data file: +*> +*> line 1: 'CSD' in columns 1 to 3. +*> +*> line 2: NM, INTEGER +*> Number of values of M, P, and N. +*> +*> line 3: MVAL, INTEGER array, dimension(NM) +*> Values of M (row and column dimension of orthogonal matrix). +*> +*> line 4: PVAL, INTEGER array, dimension(NM) +*> Values of P (row dimension of top-left block). +*> +*> line 5: NVAL, INTEGER array, dimension(NM) +*> Values of N (column dimension of top-left block). +*> +*> line 6: THRESH, REAL +*> Threshold value for the test ratios. Information will be +*> printed about each test for which the test ratio is greater +*> than or equal to the threshold. +*> +*> line 7: TSTERR, LOGICAL +*> Flag indicating whether or not to test the error exits for +*> the LAPACK routines and driver routines. +*> +*> line 8: NEWSD, INTEGER +*> A code indicating how to set the random number seed. +*> = 0: Set the seed to a default value before each run +*> = 1: Initialize the seed to a default value only before the +*> first run +*> = 2: Like 1, but use the seed values on the next line +*> +*> If line 8 was 2: +*> +*> line 9: INTEGER array, dimension (4) +*> Four integer values for the random number seed. +*> +*> lines 9-EOF: Lines specifying matrix types, as for NEP. +*> The 3-character path name is 'CSD' for the CSD routine. 
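+*>
+*>     As a purely illustrative sketch (the values below are invented
+*>     for this note and are not taken from the LAPACK distribution),
+*>     a data file following the GQR layout described above could read:
+*>
+*>        GQR
+*>        2
+*>        10 20
+*>        10 20
+*>        10 20
+*>        20.0
+*>        T
+*>        0
+*>        GQR  8
+*>
+*>     i.e. two (M,P,N) triples, THRESH = 20.0, error exits tested,
+*>     default seeding, and a final line naming the path and requesting
+*>     8 matrix types.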
+*>
+*>-----------------------------------------------------------------------
+*>
+*> LSE data file:
+*>
+*> line 1: 'LSE' in columns 1 to 3.
+*>
+*> line 2: NN, INTEGER
+*>          Number of values of M, P, and N.
+*>
+*> line 3: MVAL, INTEGER array, dimension(NN)
+*>          Values of M.
+*>
+*> line 4: PVAL, INTEGER array, dimension(NN)
+*>          Values of P.
+*>
+*> line 5: NVAL, INTEGER array, dimension(NN)
+*>          Values of N, note P <= N <= P+M.
+*>
+*> line 6: THRESH, REAL
+*>          Threshold value for the test ratios. Information will be
+*>          printed about each test for which the test ratio is greater
+*>          than or equal to the threshold.
+*>
+*> line 7: TSTERR, LOGICAL
+*>          Flag indicating whether or not to test the error exits for
+*>          the LAPACK routines and driver routines.
+*>
+*> line 8: NEWSD, INTEGER
+*>          A code indicating how to set the random number seed.
+*>          = 0:  Set the seed to a default value before each run
+*>          = 1:  Initialize the seed to a default value only before the
+*>                first run
+*>          = 2:  Like 1, but use the seed values on the next line
+*>
+*> If line 8 was 2:
+*>
+*> line 9: INTEGER array, dimension (4)
+*>          Four integer values for the random number seed.
+*>
+*> lines 9-EOF: Lines specifying matrix types, as for NEP.
+*>          The 3-character path name is 'LSE' for the constrained
+*>          linear least squares routines.
+*>
+*>-----------------------------------------------------------------------
+*>
+*> NMAX is currently set to 132 and must be at least 12 for some of the
+*> precomputed examples, and LWORK = NMAX*(5*NMAX+20) in the parameter
+*> statements below. For SVD, we assume NRHS may be as big as N. The
+*> parameter NEED is set to 14 to allow for 14 N-by-N matrices for CGG.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date June 2016
+*
+*> \ingroup complex_eig
+*
+*  =====================================================================
+      PROGRAM CCHKEE
+*
+#if defined(_OPENMP)
+      use omp_lib
+#endif
+*
+*  -- LAPACK test routine (version 3.7.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee, --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2016
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NMAX
+      PARAMETER          ( NMAX = 132 )
+      INTEGER            NCMAX
+      PARAMETER          ( NCMAX = 20 )
+      INTEGER            NEED
+      PARAMETER          ( NEED = 14 )
+      INTEGER            LWORK
+      PARAMETER          ( LWORK = NMAX*( 5*NMAX+20 ) )
+      INTEGER            LIWORK
+      PARAMETER          ( LIWORK = NMAX*( NMAX+20 ) )
+      INTEGER            MAXIN
+      PARAMETER          ( MAXIN = 20 )
+      INTEGER            MAXT
+      PARAMETER          ( MAXT = 30 )
+      INTEGER            NIN, NOUT
+      PARAMETER          ( NIN = 5, NOUT = 6 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            CBB, CBK, CBL, CES, CEV, CGG, CGK, CGL, CGS,
+     $                   CGV, CGX, CHB, CSD, CSX, CVX, CXV, FATAL, GLM,
+     $                   GQR, GSV, LSE, NEP, SEP, SVD, TSTCHK, TSTDIF,
+     $                   TSTDRV, TSTERR
+      CHARACTER          C1
+      CHARACTER*3        C3, PATH
+      CHARACTER*32       VNAME
+      CHARACTER*10       INTSTR
+      CHARACTER*80       LINE
+      INTEGER            I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD,
+     $                   NK, NN, NPARMS, NRHS, NTYPES,
+     $                   VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS
+      REAL               EPS, S1, S2, THRESH, THRSHN
+*     ..
+*     .. Local Arrays ..
+ LOGICAL DOTYPE( MAXT ), LOGWRK( NMAX ) + INTEGER IOLDSD( 4 ), ISEED( 4 ), IWORK( LIWORK ), + $ KVAL( MAXIN ), MVAL( MAXIN ), MXBVAL( MAXIN ), + $ NBCOL( MAXIN ), NBMIN( MAXIN ), NBVAL( MAXIN ), + $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), + $ PVAL( MAXIN ) + INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), + $ ISHFTS( MAXIN ), IACC22( MAXIN ) + REAL ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), + $ RESULT( 500 ) + COMPLEX DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ), + $ X( 5*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S + COMPLEX, DIMENSION(:), ALLOCATABLE :: WORK + COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, C +* .. +* .. External Functions .. + LOGICAL LSAMEN + REAL SECOND, SLAMCH + EXTERNAL LSAMEN, SECOND, SLAMCH +* .. +* .. External Subroutines .. + EXTERNAL ALAREQ, CCHKBB, CCHKBD, CCHKBK, CCHKBL, CCHKEC, + $ CCHKGG, CCHKGK, CCHKGL, CCHKHB, CCHKHS, CCHKST, + $ CCKCSD, CCKGLM, CCKGQR, CCKGSV, CCKLSE, CDRGES, + $ CDRGEV, CDRGSX, CDRGVX, CDRVBD, CDRVES, CDRVEV, + $ CDRVSG, CDRVST, CDRVSX, CDRVVX, CERRBD, + $ CERRED, CERRGG, CERRHS, CERRST, ILAVER, XLAENV, + $ CDRGES3, CDRGEV3, + $ CCHKST2STG, CDRVST2STG, CCHKHB2STG +* .. +* .. Intrinsic Functions .. + INTRINSIC LEN, MIN +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, MAXB, NPROC, NSHIFT, NUNIT, SELDIM, + $ SELOPT +* .. +* .. Arrays in Common .. + LOGICAL SELVAL( 20 ) + INTEGER IPARMS( 100 ) + REAL SELWI( 20 ), SELWR( 20 ) +* .. +* .. Common blocks .. + COMMON / CENVIR / NPROC, NSHIFT, MAXB + COMMON / CLAENV / IPARMS + COMMON / INFOC / INFOT, NUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT + COMMON / SSLCT / SELOPT, SELDIM, SELVAL, SELWR, SELWI +* .. +* .. Data statements .. + DATA INTSTR / '0123456789' / + DATA IOLDSD / 0, 0, 0, 1 / +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. +* .. Executable Statements .. +* + A = 0.0 + B = 0.0 + C = 0.0 + DC = 0.0 + S1 = SECOND( ) + FATAL = .FALSE. + NUNIT = NOUT +* +* Return to here to read multiple sets of data +* + 10 CONTINUE +* +* Read the first line and set the 3-character test path +* + READ( NIN, FMT = '(A80)', END = 380 )LINE + PATH = LINE( 1: 3 ) + NEP = LSAMEN( 3, PATH, 'NEP' ) .OR. LSAMEN( 3, PATH, 'CHS' ) + SEP = LSAMEN( 3, PATH, 'SEP' ) .OR. LSAMEN( 3, PATH, 'CST' ) .OR. + $ LSAMEN( 3, PATH, 'CSG' ) .OR. LSAMEN( 3, PATH, 'SE2' ) + SVD = LSAMEN( 3, PATH, 'SVD' ) .OR. 
LSAMEN( 3, PATH, 'CBD' ) + CEV = LSAMEN( 3, PATH, 'CEV' ) + CES = LSAMEN( 3, PATH, 'CES' ) + CVX = LSAMEN( 3, PATH, 'CVX' ) + CSX = LSAMEN( 3, PATH, 'CSX' ) + CGG = LSAMEN( 3, PATH, 'CGG' ) + CGS = LSAMEN( 3, PATH, 'CGS' ) + CGX = LSAMEN( 3, PATH, 'CGX' ) + CGV = LSAMEN( 3, PATH, 'CGV' ) + CXV = LSAMEN( 3, PATH, 'CXV' ) + CHB = LSAMEN( 3, PATH, 'CHB' ) + CBB = LSAMEN( 3, PATH, 'CBB' ) + GLM = LSAMEN( 3, PATH, 'GLM' ) + GQR = LSAMEN( 3, PATH, 'GQR' ) .OR. LSAMEN( 3, PATH, 'GRQ' ) + GSV = LSAMEN( 3, PATH, 'GSV' ) + CSD = LSAMEN( 3, PATH, 'CSD' ) + LSE = LSAMEN( 3, PATH, 'LSE' ) + CBL = LSAMEN( 3, PATH, 'CBL' ) + CBK = LSAMEN( 3, PATH, 'CBK' ) + CGL = LSAMEN( 3, PATH, 'CGL' ) + CGK = LSAMEN( 3, PATH, 'CGK' ) +* +* Report values of parameters. +* + IF( PATH.EQ.' ' ) THEN + GO TO 10 + ELSE IF( NEP ) THEN + WRITE( NOUT, FMT = 9987 ) + ELSE IF( SEP ) THEN + WRITE( NOUT, FMT = 9986 ) + ELSE IF( SVD ) THEN + WRITE( NOUT, FMT = 9985 ) + ELSE IF( CEV ) THEN + WRITE( NOUT, FMT = 9979 ) + ELSE IF( CES ) THEN + WRITE( NOUT, FMT = 9978 ) + ELSE IF( CVX ) THEN + WRITE( NOUT, FMT = 9977 ) + ELSE IF( CSX ) THEN + WRITE( NOUT, FMT = 9976 ) + ELSE IF( CGG ) THEN + WRITE( NOUT, FMT = 9975 ) + ELSE IF( CGS ) THEN + WRITE( NOUT, FMT = 9964 ) + ELSE IF( CGX ) THEN + WRITE( NOUT, FMT = 9965 ) + ELSE IF( CGV ) THEN + WRITE( NOUT, FMT = 9963 ) + ELSE IF( CXV ) THEN + WRITE( NOUT, FMT = 9962 ) + ELSE IF( CHB ) THEN + WRITE( NOUT, FMT = 9974 ) + ELSE IF( CBB ) THEN + WRITE( NOUT, FMT = 9967 ) + ELSE IF( GLM ) THEN + WRITE( NOUT, FMT = 9971 ) + ELSE IF( GQR ) THEN + WRITE( NOUT, FMT = 9970 ) + ELSE IF( GSV ) THEN + WRITE( NOUT, FMT = 9969 ) + ELSE IF( CSD ) THEN + WRITE( NOUT, FMT = 9960 ) + ELSE IF( LSE ) THEN + WRITE( NOUT, FMT = 9968 ) + ELSE IF( CBL ) THEN +* +* CGEBAL: Balancing +* + CALL CCHKBL( NIN, NOUT ) + GO TO 380 + ELSE IF( CBK ) THEN +* +* CGEBAK: Back transformation +* + CALL CCHKBK( NIN, NOUT ) + GO TO 380 + ELSE IF( CGL ) THEN +* +* CGGBAL: Balancing +* + CALL CCHKGL( NIN, NOUT ) + GO TO 380 + ELSE IF( CGK ) THEN +* +* CGGBAK: Back transformation +* + CALL CCHKGK( NIN, NOUT ) + GO TO 380 + ELSE IF( LSAMEN( 3, PATH, 'CEC' ) ) THEN +* +* CEC: Eigencondition estimation +* + READ( NIN, FMT = * )THRESH + CALL XLAENV( 1, 1 ) + CALL XLAENV( 12, 1 ) + TSTERR = .TRUE. + CALL CCHKEC( THRESH, TSTERR, NIN, NOUT ) + GO TO 380 + ELSE + WRITE( NOUT, FMT = 9992 )PATH + GO TO 380 + END IF + CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) + WRITE( NOUT, FMT = 9972 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH + WRITE( NOUT, FMT = 9984 ) +* +* Read the number of values of M, P, and N. +* + READ( NIN, FMT = * )NN + IF( NN.LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' NN ', NN, 1 + NN = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9988 )' NN ', NN, MAXIN + NN = 0 + FATAL = .TRUE. + END IF +* +* Read the values of M +* + IF( .NOT.( CGX .OR. CXV ) ) THEN + READ( NIN, FMT = * )( MVAL( I ), I = 1, NN ) + IF( SVD ) THEN + VNAME = ' M ' + ELSE + VNAME = ' N ' + END IF + DO 20 I = 1, NN + IF( MVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )VNAME, MVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( MVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )VNAME, MVAL( I ), NMAX + FATAL = .TRUE. + END IF + 20 CONTINUE + WRITE( NOUT, FMT = 9983 )'M: ', ( MVAL( I ), I = 1, NN ) + END IF +* +* Read the values of P +* + IF( GLM .OR. GQR .OR. GSV .OR. CSD .OR. LSE ) THEN + READ( NIN, FMT = * )( PVAL( I ), I = 1, NN ) + DO 30 I = 1, NN + IF( PVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' P ', PVAL( I ), 0 + FATAL = .TRUE. 
+ ELSE IF( PVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' P ', PVAL( I ), NMAX + FATAL = .TRUE. + END IF + 30 CONTINUE + WRITE( NOUT, FMT = 9983 )'P: ', ( PVAL( I ), I = 1, NN ) + END IF +* +* Read the values of N +* + IF( SVD .OR. CBB .OR. GLM .OR. GQR .OR. GSV .OR. CSD .OR. + $ LSE ) THEN + READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) + DO 40 I = 1, NN + IF( NVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' N ', NVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' N ', NVAL( I ), NMAX + FATAL = .TRUE. + END IF + 40 CONTINUE + ELSE + DO 50 I = 1, NN + NVAL( I ) = MVAL( I ) + 50 CONTINUE + END IF + IF( .NOT.( CGX .OR. CXV ) ) THEN + WRITE( NOUT, FMT = 9983 )'N: ', ( NVAL( I ), I = 1, NN ) + ELSE + WRITE( NOUT, FMT = 9983 )'N: ', NN + END IF +* +* Read the number of values of K, followed by the values of K +* + IF( CHB .OR. CBB ) THEN + READ( NIN, FMT = * )NK + READ( NIN, FMT = * )( KVAL( I ), I = 1, NK ) + DO 60 I = 1, NK + IF( KVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' K ', KVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( KVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' K ', KVAL( I ), NMAX + FATAL = .TRUE. + END IF + 60 CONTINUE + WRITE( NOUT, FMT = 9983 )'K: ', ( KVAL( I ), I = 1, NK ) + END IF +* + IF( CEV .OR. CES .OR. CVX .OR. CSX ) THEN +* +* For the nonsymmetric QR driver routines, only one set of +* parameters is allowed. +* + READ( NIN, FMT = * )NBVAL( 1 ), NBMIN( 1 ), NXVAL( 1 ), + $ INMIN( 1 ), INWIN( 1 ), INIBL(1), ISHFTS(1), IACC22(1) + IF( NBVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( 1 ), 1 + FATAL = .TRUE. + ELSE IF( NBMIN( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( 1 ), 1 + FATAL = .TRUE. + ELSE IF( NXVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( 1 ), 1 + FATAL = .TRUE. + ELSE IF( INMIN( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' INMIN ', INMIN( 1 ), 1 + FATAL = .TRUE. + ELSE IF( INWIN( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' INWIN ', INWIN( 1 ), 1 + FATAL = .TRUE. + ELSE IF( INIBL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' INIBL ', INIBL( 1 ), 1 + FATAL = .TRUE. + ELSE IF( ISHFTS( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' ISHFTS ', ISHFTS( 1 ), 1 + FATAL = .TRUE. + ELSE IF( IACC22( 1 ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' IACC22 ', IACC22( 1 ), 0 + FATAL = .TRUE. + END IF + CALL XLAENV( 1, NBVAL( 1 ) ) + CALL XLAENV( 2, NBMIN( 1 ) ) + CALL XLAENV( 3, NXVAL( 1 ) ) + CALL XLAENV(12, MAX( 11, INMIN( 1 ) ) ) + CALL XLAENV(13, INWIN( 1 ) ) + CALL XLAENV(14, INIBL( 1 ) ) + CALL XLAENV(15, ISHFTS( 1 ) ) + CALL XLAENV(16, IACC22( 1 ) ) + WRITE( NOUT, FMT = 9983 )'NB: ', NBVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'NBMIN:', NBMIN( 1 ) + WRITE( NOUT, FMT = 9983 )'NX: ', NXVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'INMIN: ', INMIN( 1 ) + WRITE( NOUT, FMT = 9983 )'INWIN: ', INWIN( 1 ) + WRITE( NOUT, FMT = 9983 )'INIBL: ', INIBL( 1 ) + WRITE( NOUT, FMT = 9983 )'ISHFTS: ', ISHFTS( 1 ) + WRITE( NOUT, FMT = 9983 )'IACC22: ', IACC22( 1 ) +* + ELSE IF( CGS .OR. CGX .OR. CGV .OR. CXV ) THEN +* +* For the nonsymmetric generalized driver routines, only one set of +* parameters is allowed. +* + READ( NIN, FMT = * )NBVAL( 1 ), NBMIN( 1 ), NXVAL( 1 ), + $ NSVAL( 1 ), MXBVAL( 1 ) + IF( NBVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( 1 ), 1 + FATAL = .TRUE. + ELSE IF( NBMIN( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( 1 ), 1 + FATAL = .TRUE. + ELSE IF( NXVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( 1 ), 1 + FATAL = .TRUE. 
+ ELSE IF( NSVAL( 1 ).LT.2 ) THEN + WRITE( NOUT, FMT = 9989 )' NS ', NSVAL( 1 ), 2 + FATAL = .TRUE. + ELSE IF( MXBVAL( 1 ).LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )' MAXB ', MXBVAL( 1 ), 1 + FATAL = .TRUE. + END IF + CALL XLAENV( 1, NBVAL( 1 ) ) + CALL XLAENV( 2, NBMIN( 1 ) ) + CALL XLAENV( 3, NXVAL( 1 ) ) + CALL XLAENV( 4, NSVAL( 1 ) ) + CALL XLAENV( 8, MXBVAL( 1 ) ) + WRITE( NOUT, FMT = 9983 )'NB: ', NBVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'NBMIN:', NBMIN( 1 ) + WRITE( NOUT, FMT = 9983 )'NX: ', NXVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'NS: ', NSVAL( 1 ) + WRITE( NOUT, FMT = 9983 )'MAXB: ', MXBVAL( 1 ) + ELSE IF( .NOT.CHB .AND. .NOT.GLM .AND. .NOT.GQR .AND. .NOT. + $ GSV .AND. .NOT.CSD .AND. .NOT.LSE ) THEN +* +* For the other paths, the number of parameters can be varied +* from the input file. Read the number of parameter values. +* + READ( NIN, FMT = * )NPARMS + IF( NPARMS.LT.1 ) THEN + WRITE( NOUT, FMT = 9989 )'NPARMS', NPARMS, 1 + NPARMS = 0 + FATAL = .TRUE. + ELSE IF( NPARMS.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9988 )'NPARMS', NPARMS, MAXIN + NPARMS = 0 + FATAL = .TRUE. + END IF +* +* Read the values of NB +* + IF( .NOT.CBB ) THEN + READ( NIN, FMT = * )( NBVAL( I ), I = 1, NPARMS ) + DO 70 I = 1, NPARMS + IF( NBVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' NB ', NBVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NBVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' NB ', NBVAL( I ), NMAX + FATAL = .TRUE. + END IF + 70 CONTINUE + WRITE( NOUT, FMT = 9983 )'NB: ', + $ ( NBVAL( I ), I = 1, NPARMS ) + END IF +* +* Read the values of NBMIN +* + IF( NEP .OR. SEP .OR. SVD .OR. CGG ) THEN + READ( NIN, FMT = * )( NBMIN( I ), I = 1, NPARMS ) + DO 80 I = 1, NPARMS + IF( NBMIN( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )'NBMIN ', NBMIN( I ), 0 + FATAL = .TRUE. + ELSE IF( NBMIN( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )'NBMIN ', NBMIN( I ), NMAX + FATAL = .TRUE. + END IF + 80 CONTINUE + WRITE( NOUT, FMT = 9983 )'NBMIN:', + $ ( NBMIN( I ), I = 1, NPARMS ) + ELSE + DO 90 I = 1, NPARMS + NBMIN( I ) = 1 + 90 CONTINUE + END IF +* +* Read the values of NX +* + IF( NEP .OR. SEP .OR. SVD ) THEN + READ( NIN, FMT = * )( NXVAL( I ), I = 1, NPARMS ) + DO 100 I = 1, NPARMS + IF( NXVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' NX ', NXVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NXVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' NX ', NXVAL( I ), NMAX + FATAL = .TRUE. + END IF + 100 CONTINUE + WRITE( NOUT, FMT = 9983 )'NX: ', + $ ( NXVAL( I ), I = 1, NPARMS ) + ELSE + DO 110 I = 1, NPARMS + NXVAL( I ) = 1 + 110 CONTINUE + END IF +* +* Read the values of NSHIFT (if CGG) or NRHS (if SVD +* or CBB). +* + IF( SVD .OR. CBB .OR. CGG ) THEN + READ( NIN, FMT = * )( NSVAL( I ), I = 1, NPARMS ) + DO 120 I = 1, NPARMS + IF( NSVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' NS ', NSVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NSVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' NS ', NSVAL( I ), NMAX + FATAL = .TRUE. + END IF + 120 CONTINUE + WRITE( NOUT, FMT = 9983 )'NS: ', + $ ( NSVAL( I ), I = 1, NPARMS ) + ELSE + DO 130 I = 1, NPARMS + NSVAL( I ) = 1 + 130 CONTINUE + END IF +* +* Read the values for MAXB. +* + IF( CGG ) THEN + READ( NIN, FMT = * )( MXBVAL( I ), I = 1, NPARMS ) + DO 140 I = 1, NPARMS + IF( MXBVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' MAXB ', MXBVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( MXBVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )' MAXB ', MXBVAL( I ), NMAX + FATAL = .TRUE. 
+ END IF + 140 CONTINUE + WRITE( NOUT, FMT = 9983 )'MAXB: ', + $ ( MXBVAL( I ), I = 1, NPARMS ) + ELSE + DO 150 I = 1, NPARMS + MXBVAL( I ) = 1 + 150 CONTINUE + END IF +* +* Read the values for INMIN. +* + IF( NEP ) THEN + READ( NIN, FMT = * )( INMIN( I ), I = 1, NPARMS ) + DO 540 I = 1, NPARMS + IF( INMIN( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' INMIN ', INMIN( I ), 0 + FATAL = .TRUE. + END IF + 540 CONTINUE + WRITE( NOUT, FMT = 9983 )'INMIN: ', + $ ( INMIN( I ), I = 1, NPARMS ) + ELSE + DO 550 I = 1, NPARMS + INMIN( I ) = 1 + 550 CONTINUE + END IF +* +* Read the values for INWIN. +* + IF( NEP ) THEN + READ( NIN, FMT = * )( INWIN( I ), I = 1, NPARMS ) + DO 560 I = 1, NPARMS + IF( INWIN( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' INWIN ', INWIN( I ), 0 + FATAL = .TRUE. + END IF + 560 CONTINUE + WRITE( NOUT, FMT = 9983 )'INWIN: ', + $ ( INWIN( I ), I = 1, NPARMS ) + ELSE + DO 570 I = 1, NPARMS + INWIN( I ) = 1 + 570 CONTINUE + END IF +* +* Read the values for INIBL. +* + IF( NEP ) THEN + READ( NIN, FMT = * )( INIBL( I ), I = 1, NPARMS ) + DO 580 I = 1, NPARMS + IF( INIBL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' INIBL ', INIBL( I ), 0 + FATAL = .TRUE. + END IF + 580 CONTINUE + WRITE( NOUT, FMT = 9983 )'INIBL: ', + $ ( INIBL( I ), I = 1, NPARMS ) + ELSE + DO 590 I = 1, NPARMS + INIBL( I ) = 1 + 590 CONTINUE + END IF +* +* Read the values for ISHFTS. +* + IF( NEP ) THEN + READ( NIN, FMT = * )( ISHFTS( I ), I = 1, NPARMS ) + DO 600 I = 1, NPARMS + IF( ISHFTS( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' ISHFTS ', ISHFTS( I ), 0 + FATAL = .TRUE. + END IF + 600 CONTINUE + WRITE( NOUT, FMT = 9983 )'ISHFTS: ', + $ ( ISHFTS( I ), I = 1, NPARMS ) + ELSE + DO 610 I = 1, NPARMS + ISHFTS( I ) = 1 + 610 CONTINUE + END IF +* +* Read the values for IACC22. +* + IF( NEP .OR. CGG ) THEN + READ( NIN, FMT = * )( IACC22( I ), I = 1, NPARMS ) + DO 620 I = 1, NPARMS + IF( IACC22( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )' IACC22 ', IACC22( I ), 0 + FATAL = .TRUE. + END IF + 620 CONTINUE + WRITE( NOUT, FMT = 9983 )'IACC22: ', + $ ( IACC22( I ), I = 1, NPARMS ) + ELSE + DO 630 I = 1, NPARMS + IACC22( I ) = 1 + 630 CONTINUE + END IF +* +* Read the values for NBCOL. +* + IF( CGG ) THEN + READ( NIN, FMT = * )( NBCOL( I ), I = 1, NPARMS ) + DO 160 I = 1, NPARMS + IF( NBCOL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9989 )'NBCOL ', NBCOL( I ), 0 + FATAL = .TRUE. + ELSE IF( NBCOL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9988 )'NBCOL ', NBCOL( I ), NMAX + FATAL = .TRUE. + END IF + 160 CONTINUE + WRITE( NOUT, FMT = 9983 )'NBCOL:', + $ ( NBCOL( I ), I = 1, NPARMS ) + ELSE + DO 170 I = 1, NPARMS + NBCOL( I ) = 1 + 170 CONTINUE + END IF + END IF +* +* Calculate and print the machine dependent constants. +* + WRITE( NOUT, FMT = * ) + EPS = SLAMCH( 'Underflow threshold' ) + WRITE( NOUT, FMT = 9981 )'underflow', EPS + EPS = SLAMCH( 'Overflow threshold' ) + WRITE( NOUT, FMT = 9981 )'overflow ', EPS + EPS = SLAMCH( 'Epsilon' ) + WRITE( NOUT, FMT = 9981 )'precision', EPS +* +* Read the threshold value for the test ratios. +* + READ( NIN, FMT = * )THRESH + WRITE( NOUT, FMT = 9982 )THRESH + IF( SEP .OR. SVD .OR. CGG ) THEN +* +* Read the flag that indicates whether to test LAPACK routines. +* + READ( NIN, FMT = * )TSTCHK +* +* Read the flag that indicates whether to test driver routines. +* + READ( NIN, FMT = * )TSTDRV + END IF +* +* Read the flag that indicates whether to test the error exits. +* + READ( NIN, FMT = * )TSTERR +* +* Read the code describing how to set the random number seed. 
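+*     (NEWSD = 0 restores the saved seed IOLDSD before every parameter
+*     set below, so each set sees the same random matrices; NEWSD = 1
+*     seeds only once, and NEWSD = 2 additionally takes the four seed
+*     values from the next input line.)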
+* + READ( NIN, FMT = * )NEWSD +* +* If NEWSD = 2, read another line with 4 integers for the seed. +* + IF( NEWSD.EQ.2 ) + $ READ( NIN, FMT = * )( IOLDSD( I ), I = 1, 4 ) +* + DO 180 I = 1, 4 + ISEED( I ) = IOLDSD( I ) + 180 CONTINUE +* + IF( FATAL ) THEN + WRITE( NOUT, FMT = 9999 ) + STOP + END IF +* +* Read the input lines indicating the test path and its parameters. +* The first three characters indicate the test path, and the number +* of test matrix types must be the first nonblank item in columns +* 4-80. +* + 190 CONTINUE +* + IF( .NOT.( CGX .OR. CXV ) ) THEN +* + 200 CONTINUE + READ( NIN, FMT = '(A80)', END = 380 )LINE + C3 = LINE( 1: 3 ) + LENP = LEN( LINE ) + I = 3 + ITMP = 0 + I1 = 0 + 210 CONTINUE + I = I + 1 + IF( I.GT.LENP ) THEN + IF( I1.GT.0 ) THEN + GO TO 240 + ELSE + NTYPES = MAXT + GO TO 240 + END IF + END IF + IF( LINE( I: I ).NE.' ' .AND. LINE( I: I ).NE.',' ) THEN + I1 = I + C1 = LINE( I1: I1 ) +* +* Check that a valid integer was read +* + DO 220 K = 1, 10 + IF( C1.EQ.INTSTR( K: K ) ) THEN + IC = K - 1 + GO TO 230 + END IF + 220 CONTINUE + WRITE( NOUT, FMT = 9991 )I, LINE + GO TO 200 + 230 CONTINUE + ITMP = 10*ITMP + IC + GO TO 210 + ELSE IF( I1.GT.0 ) THEN + GO TO 240 + ELSE + GO TO 210 + END IF + 240 CONTINUE + NTYPES = ITMP +* +* Skip the tests if NTYPES is <= 0. +* + IF( .NOT.( CEV .OR. CES .OR. CVX .OR. CSX .OR. CGV .OR. + $ CGS ) .AND. NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + GO TO 200 + END IF +* + ELSE + IF( CGX ) + $ C3 = 'CGX' + IF( CXV ) + $ C3 = 'CXV' + END IF +* +* Reset the random number seed. +* + IF( NEWSD.EQ.0 ) THEN + DO 250 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 250 CONTINUE + END IF +* + IF( LSAMEN( 3, C3, 'CHS' ) .OR. LSAMEN( 3, C3, 'NEP' ) ) THEN +* +* ------------------------------------- +* NEP: Nonsymmetric Eigenvalue Problem +* ------------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NX = crossover point +* NS = number of shifts +* MAXB = minimum submatrix size +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 1, 1 ) + IF( TSTERR ) + $ CALL CERRHS( 'CHSEQR', NOUT ) + DO 270 I = 1, NPARMS + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 3, NXVAL( I ) ) + CALL XLAENV(12, MAX( 11, INMIN( I ) ) ) + CALL XLAENV(13, INWIN( I ) ) + CALL XLAENV(14, INIBL( I ) ) + CALL XLAENV(15, ISHFTS( I ) ) + CALL XLAENV(16, IACC22( I ) ) +* + IF( NEWSD.EQ.0 ) THEN + DO 260 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 260 CONTINUE + END IF + WRITE( NOUT, FMT = 9961 )C3, NBVAL( I ), NBMIN( I ), + $ NXVAL( I ), MAX( 11, INMIN(I)), + $ INWIN( I ), INIBL( I ), ISHFTS( I ), IACC22( I ) + CALL CCHKHS( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 5 ), NMAX, A( 1, 6 ), + $ A( 1, 7 ), DC( 1, 1 ), DC( 1, 2 ), A( 1, 8 ), + $ A( 1, 9 ), A( 1, 10 ), A( 1, 11 ), A( 1, 12 ), + $ DC( 1, 3 ), WORK, LWORK, RWORK, IWORK, LOGWRK, + $ RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CCHKHS', INFO + 270 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'CST' ) .OR. LSAMEN( 3, C3, 'SEP' ) + $ .OR. 
LSAMEN( 3, C3, 'SE2' ) ) THEN +* +* ---------------------------------- +* SEP: Symmetric Eigenvalue Problem +* ---------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NX = crossover point +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 1, 1 ) + CALL XLAENV( 9, 25 ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL CERRST( 'CST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF + DO 290 I = 1, NPARMS + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 3, NXVAL( I ) ) +* + IF( NEWSD.EQ.0 ) THEN + DO 280 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 280 CONTINUE + END IF + WRITE( NOUT, FMT = 9997 )C3, NBVAL( I ), NBMIN( I ), + $ NXVAL( I ) + IF( TSTCHK ) THEN + IF( LSAMEN( 3, C3, 'SE2' ) ) THEN + CALL CCHKST2STG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), + $ DR( 1, 1 ), DR( 1, 2 ), DR( 1, 3 ), + $ DR( 1, 4 ), DR( 1, 5 ), DR( 1, 6 ), + $ DR( 1, 7 ), DR( 1, 8 ), DR( 1, 9 ), + $ DR( 1, 10 ), DR( 1, 11 ), A( 1, 3 ), NMAX, + $ A( 1, 4 ), A( 1, 5 ), DC( 1, 1 ), A( 1, 6 ), + $ WORK, LWORK, RWORK, LWORK, IWORK, LIWORK, + $ RESULT, INFO ) + ELSE + CALL CCHKST( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), + $ DR( 1, 1 ), DR( 1, 2 ), DR( 1, 3 ), + $ DR( 1, 4 ), DR( 1, 5 ), DR( 1, 6 ), + $ DR( 1, 7 ), DR( 1, 8 ), DR( 1, 9 ), + $ DR( 1, 10 ), DR( 1, 11 ), A( 1, 3 ), NMAX, + $ A( 1, 4 ), A( 1, 5 ), DC( 1, 1 ), A( 1, 6 ), + $ WORK, LWORK, RWORK, LWORK, IWORK, LIWORK, + $ RESULT, INFO ) + ENDIF + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CCHKST', INFO + END IF + IF( TSTDRV ) THEN + IF( LSAMEN( 3, C3, 'SE2' ) ) THEN + CALL CDRVST2STG( NN, NVAL, 18, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 1 ), NMAX, DR( 1, 3 ), DR( 1, 4 ), + $ DR( 1, 5 ), DR( 1, 8 ), DR( 1, 9 ), + $ DR( 1, 10 ), A( 1, 2 ), NMAX, A( 1, 3 ), + $ DC( 1, 1 ), A( 1, 4 ), WORK, LWORK, RWORK, + $ LWORK, IWORK, LIWORK, RESULT, INFO ) + ELSE + CALL CDRVST( NN, NVAL, 18, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, DR( 1, 3 ), DR( 1, 4 ), + $ DR( 1, 5 ), DR( 1, 8 ), DR( 1, 9 ), + $ DR( 1, 10 ), A( 1, 2 ), NMAX, A( 1, 3 ), + $ DC( 1, 1 ), A( 1, 4 ), WORK, LWORK, RWORK, + $ LWORK, IWORK, LIWORK, RESULT, INFO ) + ENDIF + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CDRVST', INFO + END IF + 290 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'CSG' ) ) THEN +* +* ---------------------------------------------- +* CSG: Hermitian Generalized Eigenvalue Problem +* ---------------------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NX = crossover point +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 9, 25 ) + DO 310 I = 1, NPARMS + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 3, NXVAL( I ) ) +* + IF( NEWSD.EQ.0 ) THEN + DO 300 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 300 CONTINUE + END IF + WRITE( NOUT, FMT = 9997 )C3, NBVAL( I ), NBMIN( I ), + $ NXVAL( I ) + IF( TSTCHK ) THEN +* CALL CDRVSG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, +* $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, +* $ DR( 1, 3 ), A( 1, 3 ), NMAX, A( 1, 4 ), +* $ A( 1, 5 ), A( 1, 6 ), A( 1, 7 ), WORK, +* $ LWORK, RWORK, LWORK, IWORK, LIWORK, RESULT, +* $ INFO ) + CALL CDRVSG2STG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, + $ NOUT, A( 1, 
1 ), NMAX, A( 1, 2 ), NMAX, + $ DR( 1, 3 ), DR( 1, 4 ), A( 1, 3 ), NMAX, + $ A( 1, 4 ), A( 1, 5 ), A( 1, 6 ), + $ A( 1, 7 ), WORK, LWORK, RWORK, LWORK, + $ IWORK, LIWORK, RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CDRVSG', INFO + END IF + 310 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'CBD' ) .OR. LSAMEN( 3, C3, 'SVD' ) ) THEN +* +* ---------------------------------- +* SVD: Singular Value Decomposition +* ---------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NX = crossover point +* NRHS = number of right hand sides +* + MAXTYP = 16 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 9, 25 ) +* +* Test the error exits +* + CALL XLAENV( 1, 1 ) + IF( TSTERR .AND. TSTCHK ) + $ CALL CERRBD( 'CBD', NOUT ) + IF( TSTERR .AND. TSTDRV ) + $ CALL CERRED( 'CBD', NOUT ) +* + DO 330 I = 1, NPARMS + NRHS = NSVAL( I ) + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 3, NXVAL( I ) ) + IF( NEWSD.EQ.0 ) THEN + DO 320 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 320 CONTINUE + END IF + WRITE( NOUT, FMT = 9995 )C3, NBVAL( I ), NBMIN( I ), + $ NXVAL( I ), NRHS + IF( TSTCHK ) THEN + CALL CCHKBD( NN, MVAL, NVAL, MAXTYP, DOTYPE, NRHS, ISEED, + $ THRESH, A( 1, 1 ), NMAX, DR( 1, 1 ), + $ DR( 1, 2 ), DR( 1, 3 ), DR( 1, 4 ), + $ A( 1, 2 ), NMAX, A( 1, 3 ), A( 1, 4 ), + $ A( 1, 5 ), NMAX, A( 1, 6 ), NMAX, A( 1, 7 ), + $ A( 1, 8 ), WORK, LWORK, RWORK, NOUT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CCHKBD', INFO + END IF + IF( TSTDRV ) + $ CALL CDRVBD( NN, MVAL, NVAL, MAXTYP, DOTYPE, ISEED, + $ THRESH, A( 1, 1 ), NMAX, A( 1, 2 ), NMAX, + $ A( 1, 3 ), NMAX, A( 1, 4 ), A( 1, 5 ), + $ A( 1, 6 ), DR( 1, 1 ), DR( 1, 2 ), + $ DR( 1, 3 ), WORK, LWORK, RWORK, IWORK, NOUT, + $ INFO ) + 330 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'CEV' ) ) THEN +* +* -------------------------------------------- +* CEV: Nonsymmetric Eigenvalue Problem Driver +* CGEEV (eigenvalues and eigenvectors) +* -------------------------------------------- +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL CERRED( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL CDRVEV( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), DC( 1, 1 ), + $ DC( 1, 2 ), A( 1, 3 ), NMAX, A( 1, 4 ), NMAX, + $ A( 1, 5 ), NMAX, RESULT, WORK, LWORK, RWORK, + $ IWORK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CGEEV', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'CES' ) ) THEN +* +* -------------------------------------------- +* CES: Nonsymmetric Eigenvalue Problem Driver +* CGEES (Schur form) +* -------------------------------------------- +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL CERRED( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL CDRVES( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ DC( 1, 1 ), DC( 1, 2 ), A( 1, 4 ), NMAX, + $ RESULT, WORK, LWORK, RWORK, IWORK, LOGWRK, + $ INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CGEES', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'CVX' ) ) THEN +* +* -------------------------------------------------------------- +* CVX: Nonsymmetric Eigenvalue Problem Expert Driver +* CGEEVX (eigenvalues, 
eigenvectors and condition numbers) +* -------------------------------------------------------------- +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LT.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL CERRED( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL CDRVVX( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NIN, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), DC( 1, 1 ), + $ DC( 1, 2 ), A( 1, 3 ), NMAX, A( 1, 4 ), NMAX, + $ A( 1, 5 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), + $ DR( 1, 3 ), DR( 1, 4 ), DR( 1, 5 ), DR( 1, 6 ), + $ DR( 1, 7 ), DR( 1, 8 ), RESULT, WORK, LWORK, + $ RWORK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CGEEVX', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'CSX' ) ) THEN +* +* --------------------------------------------------- +* CSX: Nonsymmetric Eigenvalue Problem Expert Driver +* CGEESX (Schur form and condition numbers) +* --------------------------------------------------- +* + MAXTYP = 21 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LT.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL CERRED( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL CDRVSX( NN, NVAL, NTYPES, DOTYPE, ISEED, THRESH, NIN, + $ NOUT, A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ DC( 1, 1 ), DC( 1, 2 ), DC( 1, 3 ), A( 1, 4 ), + $ NMAX, A( 1, 5 ), RESULT, WORK, LWORK, RWORK, + $ LOGWRK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CGEESX', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'CGG' ) ) THEN +* +* ------------------------------------------------- +* CGG: Generalized Nonsymmetric Eigenvalue Problem +* ------------------------------------------------- +* Vary the parameters +* NB = block size +* NBMIN = minimum block size +* NS = number of shifts +* MAXB = minimum submatrix size +* IACC22: structured matrix multiply +* NBCOL = minimum column dimension for blocks +* + MAXTYP = 26 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV(1,1) + IF( TSTCHK .AND. TSTERR ) + $ CALL CERRGG( C3, NOUT ) + DO 350 I = 1, NPARMS + CALL XLAENV( 1, NBVAL( I ) ) + CALL XLAENV( 2, NBMIN( I ) ) + CALL XLAENV( 4, NSVAL( I ) ) + CALL XLAENV( 8, MXBVAL( I ) ) + CALL XLAENV( 16, IACC22( I ) ) + CALL XLAENV( 5, NBCOL( I ) ) +* + IF( NEWSD.EQ.0 ) THEN + DO 340 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 340 CONTINUE + END IF + WRITE( NOUT, FMT = 9996 )C3, NBVAL( I ), NBMIN( I ), + $ NSVAL( I ), MXBVAL( I ), IACC22( I ), NBCOL( I ) + TSTDIF = .FALSE. + THRSHN = 10. 
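+*        (TSTDIF and THRSHN are fixed here rather than read from the
+*        input file; THRSHN is a second test threshold passed to
+*        CCHKGG alongside THRESH.)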
+ IF( TSTCHK ) THEN + CALL CCHKGG( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, + $ TSTDIF, THRSHN, NOUT, A( 1, 1 ), NMAX, + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ A( 1, 6 ), A( 1, 7 ), A( 1, 8 ), A( 1, 9 ), + $ NMAX, A( 1, 10 ), A( 1, 11 ), A( 1, 12 ), + $ DC( 1, 1 ), DC( 1, 2 ), DC( 1, 3 ), + $ DC( 1, 4 ), A( 1, 13 ), A( 1, 14 ), WORK, + $ LWORK, RWORK, LOGWRK, RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CCHKGG', INFO + END IF + 350 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'CGS' ) ) THEN +* +* ------------------------------------------------- +* CGS: Generalized Nonsymmetric Eigenvalue Problem +* CGGES (Schur form) +* ------------------------------------------------- +* + MAXTYP = 26 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL CERRGG( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL CDRGES( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), + $ DC( 1, 1 ), DC( 1, 2 ), WORK, LWORK, RWORK, + $ RESULT, LOGWRK, INFO ) +* + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CDRGES', INFO +* +* Blocked version +* + CALL XLAENV(16,2) + CALL CDRGES3( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), + $ DC( 1, 1 ), DC( 1, 2 ), WORK, LWORK, RWORK, + $ RESULT, LOGWRK, INFO ) +* + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CDRGES3', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + + GO TO 10 +* + ELSE IF( CGX ) THEN +* +* ------------------------------------------------- +* CGX Generalized Nonsymmetric Eigenvalue Problem +* CGGESX (Schur form and condition numbers) +* ------------------------------------------------- +* + MAXTYP = 5 + NTYPES = MAXTYP + IF( NN.LT.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL CERRGG( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL XLAENV( 5, 2 ) + CALL CDRGSX( NN, NCMAX, THRESH, NIN, NOUT, A( 1, 1 ), NMAX, + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ A( 1, 6 ), DC( 1, 1 ), DC( 1, 2 ), C, + $ NCMAX*NCMAX, S, WORK, LWORK, RWORK, IWORK, + $ LIWORK, LOGWRK, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CDRGSX', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'CGV' ) ) THEN +* +* ------------------------------------------------- +* CGV: Generalized Nonsymmetric Eigenvalue Problem +* CGGEV (Eigenvalue/vector form) +* ------------------------------------------------- +* + MAXTYP = 26 + NTYPES = MIN( MAXTYP, NTYPES ) + IF( NTYPES.LE.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL CERRGG( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL CDRGEV( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), + $ A( 1, 9 ), NMAX, DC( 1, 1 ), DC( 1, 2 ), + $ DC( 1, 3 ), DC( 1, 4 ), WORK, LWORK, RWORK, + $ RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CDRGEV', INFO +* +* Blocked version +* + CALL XLAENV(16,2) + CALL CDRGEV3( NN, NVAL, MAXTYP, DOTYPE, ISEED, THRESH, NOUT, + $ A( 1, 1 ), NMAX, A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), A( 1, 7 ), NMAX, A( 1, 8 ), + $ A( 1, 9 ), NMAX, DC( 1, 1 ), DC( 1, 2 ), + $ DC( 1, 3 ), DC( 1, 4 ), WORK, LWORK, RWORK, + $ RESULT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CDRGEV3', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + 
ELSE IF( CXV ) THEN +* +* ------------------------------------------------- +* CXV: Generalized Nonsymmetric Eigenvalue Problem +* CGGEVX (eigenvalue/vector with condition numbers) +* ------------------------------------------------- +* + MAXTYP = 2 + NTYPES = MAXTYP + IF( NN.LT.0 ) THEN + WRITE( NOUT, FMT = 9990 )C3 + ELSE + IF( TSTERR ) + $ CALL CERRGG( C3, NOUT ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + CALL CDRGVX( NN, THRESH, NIN, NOUT, A( 1, 1 ), NMAX, + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), DC( 1, 1 ), + $ DC( 1, 2 ), A( 1, 5 ), A( 1, 6 ), IWORK( 1 ), + $ IWORK( 2 ), DR( 1, 1 ), DR( 1, 2 ), DR( 1, 3 ), + $ DR( 1, 4 ), DR( 1, 5 ), DR( 1, 6 ), WORK, + $ LWORK, RWORK, IWORK( 3 ), LIWORK-2, RESULT, + $ LOGWRK, INFO ) +* + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CDRGVX', INFO + END IF + WRITE( NOUT, FMT = 9973 ) + GO TO 10 +* + ELSE IF( LSAMEN( 3, C3, 'CHB' ) ) THEN +* +* ------------------------------ +* CHB: Hermitian Band Reduction +* ------------------------------ +* + MAXTYP = 15 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_NUM_THREADS() + CALL OMP_SET_NUM_THREADS(1) +#endif + CALL CERRST( 'CHB', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF +* CALL CCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, +* $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), +* $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, +* $ INFO ) + CALL CCHKHB2STG( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, + $ THRESH, NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), + $ DR( 1, 2 ), DR( 1, 3 ), DR( 1, 4 ), DR( 1, 5 ), + $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, + $ INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CCHKHB', INFO +* + ELSE IF( LSAMEN( 3, C3, 'CBB' ) ) THEN +* +* ------------------------------ +* CBB: General Band Reduction +* ------------------------------ +* + MAXTYP = 15 + NTYPES = MIN( MAXTYP, NTYPES ) + CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) + DO 370 I = 1, NPARMS + NRHS = NSVAL( I ) +* + IF( NEWSD.EQ.0 ) THEN + DO 360 K = 1, 4 + ISEED( K ) = IOLDSD( K ) + 360 CONTINUE + END IF + WRITE( NOUT, FMT = 9966 )C3, NRHS + CALL CCHKBB( NN, MVAL, NVAL, NK, KVAL, MAXTYP, DOTYPE, NRHS, + $ ISEED, THRESH, NOUT, A( 1, 1 ), NMAX, + $ A( 1, 2 ), 2*NMAX, DR( 1, 1 ), DR( 1, 2 ), + $ A( 1, 4 ), NMAX, A( 1, 5 ), NMAX, A( 1, 6 ), + $ NMAX, A( 1, 7 ), WORK, LWORK, RWORK, RESULT, + $ INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CCHKBB', INFO + 370 CONTINUE +* + ELSE IF( LSAMEN( 3, C3, 'GLM' ) ) THEN +* +* ----------------------------------------- +* GLM: Generalized Linear Regression Model +* ----------------------------------------- +* + CALL XLAENV( 1, 1 ) + IF( TSTERR ) + $ CALL CERRGG( 'GLM', NOUT ) + CALL CCKGLM( NN, NVAL, MVAL, PVAL, NTYPES, ISEED, THRESH, NMAX, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), X, + $ WORK, DR( 1, 1 ), NIN, NOUT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( NOUT, FMT = 9980 )'CCKGLM', INFO +* + ELSE IF( LSAMEN( 3, C3, 'GQR' ) ) THEN +* +* ------------------------------------------ +* GQR: Generalized QR and RQ factorizations +* ------------------------------------------ +* + CALL XLAENV( 1, 1 ) + IF( TSTERR ) + $ CALL CERRGG( 'GQR', NOUT ) + CALL CCKGQR( NN, MVAL, NN, PVAL, NN, NVAL, NTYPES, ISEED, + $ THRESH, NMAX, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ A( 1, 4 ), TAUA, B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ B( 1, 4 ), B( 1, 5 ), TAUB, WORK, DR( 1, 1 ), NIN, + $ NOUT, INFO ) + IF( INFO.NE.0 ) + $ WRITE( 
NOUT, FMT = 9980 )'CCKGQR', INFO
+*
+      ELSE IF( LSAMEN( 3, C3, 'GSV' ) ) THEN
+*
+*        ----------------------------------------------
+*        GSV: Generalized Singular Value Decomposition
+*        ----------------------------------------------
+*
+         CALL XLAENV(1,1)
+         IF( TSTERR )
+     $      CALL CERRGG( 'GSV', NOUT )
+         CALL CCKGSV( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX,
+     $                A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ),
+     $                A( 1, 3 ), B( 1, 3 ), A( 1, 4 ), ALPHA, BETA,
+     $                B( 1, 4 ), IWORK, WORK, DR( 1, 1 ), NIN, NOUT,
+     $                INFO )
+         IF( INFO.NE.0 )
+     $      WRITE( NOUT, FMT = 9980 )'CCKGSV', INFO
+*
+      ELSE IF( LSAMEN( 3, C3, 'CSD' ) ) THEN
+*
+*        ----------------------------------------------
+*        CSD: CS Decomposition
+*        ----------------------------------------------
+*
+         CALL XLAENV(1,1)
+         IF( TSTERR )
+     $      CALL CERRGG( 'CSD', NOUT )
+         CALL CCKCSD( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX,
+     $                A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), A( 1, 4 ),
+     $                A( 1, 5 ), A( 1, 6 ), RWORK, IWORK, WORK,
+     $                DR( 1, 1 ), NIN, NOUT, INFO )
+         IF( INFO.NE.0 )
+     $      WRITE( NOUT, FMT = 9980 )'CCKCSD', INFO
+*
+      ELSE IF( LSAMEN( 3, C3, 'LSE' ) ) THEN
+*
+*        --------------------------------------
+*        LSE: Constrained Linear Least Squares
+*        --------------------------------------
+*
+         CALL XLAENV( 1, 1 )
+         IF( TSTERR )
+     $      CALL CERRGG( 'LSE', NOUT )
+         CALL CCKLSE( NN, MVAL, PVAL, NVAL, NTYPES, ISEED, THRESH, NMAX,
+     $                A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), X,
+     $                WORK, DR( 1, 1 ), NIN, NOUT, INFO )
+         IF( INFO.NE.0 )
+     $      WRITE( NOUT, FMT = 9980 )'CCKLSE', INFO
+      ELSE
+         WRITE( NOUT, FMT = * )
+         WRITE( NOUT, FMT = * )
+         WRITE( NOUT, FMT = 9992 )C3
+      END IF
+      IF( .NOT.( CGX .OR. CXV ) )
+     $   GO TO 190
+  380 CONTINUE
+      WRITE( NOUT, FMT = 9994 )
+      S2 = SECOND( )
+      WRITE( NOUT, FMT = 9993 )S2 - S1
+*
+      DEALLOCATE (S, STAT = AllocateStatus)
+      DEALLOCATE (A, STAT = AllocateStatus)
+      DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (C, STAT = AllocateStatus)
+      DEALLOCATE (RWORK, STAT = AllocateStatus)
+      DEALLOCATE (WORK, STAT = AllocateStatus)
+*
+ 9999 FORMAT( / ' Execution not attempted due to input errors' )
+ 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 )
+ 9996 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NS =', I4,
+     $      ', MAXB =', I4, ', IACC22 =', I4, ', NBCOL =', I4 )
+ 9995 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4,
+     $      ', NRHS =', I4 )
+ 9994 FORMAT( / / ' End of tests' )
+ 9993 FORMAT( ' Total time used = ', F12.2, ' seconds', / )
+ 9992 FORMAT( 1X, A3, ': Unrecognized path name' )
+ 9991 FORMAT( / / ' *** Invalid integer value in column ', I2,
+     $      ' of input', ' line:', / A79 )
+ 9990 FORMAT( / / 1X, A3, ' routines were not tested' )
+ 9989 FORMAT( ' Invalid input value: ', A, '=', I6, '; must be >=',
+     $      I6 )
+ 9988 FORMAT( ' Invalid input value: ', A, '=', I6, '; must be <=',
+     $      I6 )
+ 9987 FORMAT( ' Tests of the Nonsymmetric Eigenvalue Problem routines' )
+ 9986 FORMAT( ' Tests of the Hermitian Eigenvalue Problem routines' )
+ 9985 FORMAT( ' Tests of the Singular Value Decomposition routines' )
+ 9984 FORMAT( / ' The following parameter values will be used:' )
+ 9983 FORMAT( 4X, A, 10I6, / 10X, 10I6 )
+ 9982 FORMAT( / ' Routines pass computational tests if test ratio is ',
+     $      'less than', F8.2, / )
+ 9981 FORMAT( ' Relative machine ', A, ' is taken to be', E16.6 )
+ 9980 FORMAT( ' *** Error code from ', A, ' = ', I4 )
+ 9979 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Driver',
+     $      / ' CGEEV (eigenvalues and eigenvectors)' )
+ 9978 FORMAT( / ' Tests of the
Nonsymmetric Eigenvalue Problem Driver', + $ / ' CGEES (Schur form)' ) + 9977 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Expert', + $ ' Driver', / ' CGEEVX (eigenvalues, eigenvectors and', + $ ' condition numbers)' ) + 9976 FORMAT( / ' Tests of the Nonsymmetric Eigenvalue Problem Expert', + $ ' Driver', / ' CGEESX (Schur form and condition', + $ ' numbers)' ) + 9975 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem routines' ) + 9974 FORMAT( ' Tests of CHBTRD', / ' (reduction of a Hermitian band ', + $ 'matrix to real tridiagonal form)' ) + 9973 FORMAT( / 1X, 71( '-' ) ) + 9972 FORMAT( / ' LAPACK VERSION ', I1, '.', I1, '.', I1 ) + 9971 FORMAT( / ' Tests of the Generalized Linear Regression Model ', + $ 'routines' ) + 9970 FORMAT( / ' Tests of the Generalized QR and RQ routines' ) + 9969 FORMAT( / ' Tests of the Generalized Singular Value', + $ ' Decomposition routines' ) + 9968 FORMAT( / ' Tests of the Linear Least Squares routines' ) + 9967 FORMAT( ' Tests of CGBBRD', / ' (reduction of a general band ', + $ 'matrix to real bidiagonal form)' ) + 9966 FORMAT( / / 1X, A3, ': NRHS =', I4 ) + 9965 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem Expert Driver CGGESX' ) + 9964 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem Driver CGGES' ) + 9963 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem Driver CGGEV' ) + 9962 FORMAT( / ' Tests of the Generalized Nonsymmetric Eigenvalue ', + $ 'Problem Expert Driver CGGEVX' ) + 9961 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4, + $ ', INMIN=', I4, + $ ', INWIN =', I4, ', INIBL =', I4, ', ISHFTS =', I4, + $ ', IACC22 =', I4) + 9960 FORMAT( / ' Tests of the CS Decomposition routines' ) +* +* End of CCHKEE +* + END From 90c1776c86339dfcd61ae07935f448a8b10346a4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Feb 2021 18:53:20 +0100 Subject: [PATCH 130/681] Adjust build rules for ?chkee.F --- lapack-netlib/TESTING/EIG/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index b3efebcd0..a292e4496 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -157,11 +157,11 @@ cleanobj: cleanexe: rm -f xeigtst* -schkee.o: schkee.f +schkee.o: schkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -dchkee.o: dchkee.f +dchkee.o: dchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -cchkee.o: cchkee.f +cchkee.o: cchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -zchkee.o: zchkee.f +zchkee.o: zchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< From 9564f688c490bf0dabfa8226d3643d749f7ffff5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Feb 2021 18:57:05 +0100 Subject: [PATCH 131/681] Adjust build rules for ?chkee.F --- lapack-netlib/TESTING/EIG/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index e877b1422..10c25a446 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -25,7 +25,7 @@ set(AEIGTST set(SCIGTST slafts.f slahd2.f slasum.f slatb9.f sstech.f sstect.f ssvdch.f ssvdct.f ssxt1.f) -set(SEIGTST schkee.f +set(SEIGTST schkee.F sbdt01.f sbdt02.f sbdt03.f sbdt04.f sbdt05.f schkbb.f schkbd.f schkbk.f schkbl.f schkec.f schkgg.f schkgk.f schkgl.f schkhs.f schksb.f schkst.f schkst2stg.f schksb2stg.f @@ -42,7 +42,7 @@ set(SEIGTST schkee.f 
sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f sstt22.f ssyt21.f ssyt22.f) -set(CEIGTST cchkee.f +set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f cchkgg.f cchkgk.f cchkgl.f cchkhb.f cchkhs.f cchkst.f cchkst2stg.f cchkhb2stg.f @@ -62,7 +62,7 @@ set(CEIGTST cchkee.f set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f dsvdch.f dsvdct.f dsxt1.f) -set(DEIGTST dchkee.f +set(DEIGTST dchkee.F dbdt01.f dbdt02.f dbdt03.f dbdt04.f dbdt05.f dchkbb.f dchkbd.f dchkbk.f dchkbl.f dchkec.f dchkgg.f dchkgk.f dchkgl.f dchkhs.f dchksb.f dchkst.f dchkst2stg.f dchksb2stg.f @@ -79,7 +79,7 @@ set(DEIGTST dchkee.f dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f dstt22.f dsyt21.f dsyt22.f) -set(ZEIGTST zchkee.f +set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f zchkgg.f zchkgk.f zchkgl.f zchkhb.f zchkhs.f zchkst.f zchkst2stg.f zchkhb2stg.f From 20f492c2984913b6b278be2ae6bbb057026bfc52 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 1 Mar 2021 21:00:10 +0100 Subject: [PATCH 132/681] Fix AMD AOCC compiler detection --- Makefile.system | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 848c38797..91a078565 100644 --- a/Makefile.system +++ b/Makefile.system @@ -904,8 +904,8 @@ CCOMMON_OPT += -DF_INTERFACE_FLANG FCOMMON_OPT += -Mrecursive -Kieee ifeq ($(OSNAME), Linux) ifeq ($(ARCH), x86_64) -FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`) -ifeq ($(FLANG_VENDOR),AOCC) +FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ") +ifeq ($(FLANG_VENDOR), AMD) FCOMMON_OPT += -fno-unroll-loops endif endif From 38dcf3454bf4d3a4b5b470791277904c025d7369 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 2 Mar 2021 17:50:55 +0100 Subject: [PATCH 133/681] Support timing Apple M1 --- benchmark/bench.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/benchmark/bench.h b/benchmark/bench.h index 1f9b8986c..83de8ab2b 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -74,6 +74,9 @@ static void *huge_malloc(BLASLONG size){ #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) struct timeval start, stop; +#elif defined(__APPLE__) + mach_timebase_info_data_t info; + uint64_t start = 0, stop = 0; #else struct timespec start = { 0, 0 }, stop = { 0, 0 }; #endif @@ -82,6 +85,9 @@ double getsec() { #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; +#elif defined(__APPLE__) + mach_timebase_info(&info); + return (double)(((stop - start) * info.numer)/info.denom) * 1.e-9; #else return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; #endif @@ -90,6 +96,8 @@ double getsec() void begin() { #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) gettimeofday( &start, (struct timezone *)0); +#elif defined(__APPLE__) + start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); #else clock_gettime(CLOCK_REALTIME, &start); #endif @@ -98,7 +106,9 @@ void begin() { void end() { #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) gettimeofday( &stop, (struct timezone *)0); +#elif defined(__APPLE__) + stop = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); #else clock_gettime(CLOCK_REALTIME, &stop); #endif -} \ No newline at end of file +} From 41646ed006b25167417a5b56ad37e20c9632851c Mon Sep 17 
00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 5 Mar 2021 16:22:36 -0600 Subject: [PATCH 134/681] Optimize s/dasum function for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. --- kernel/power/dasum.c | 20 +++- kernel/power/dasum_microk_power10.c | 152 +++++++++++++++++++++++++++ kernel/power/sasum.c | 20 +++- kernel/power/sasum_microk_power10.c | 153 ++++++++++++++++++++++++++++ 4 files changed, 343 insertions(+), 2 deletions(-) create mode 100644 kernel/power/dasum_microk_power10.c create mode 100644 kernel/power/sasum_microk_power10.c diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 999dc677a..0cdec3292 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "dasum_microk_power8.c" +#elif defined(POWER10) +#include "dasum_microk_power10.c" #endif #endif @@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + sumf += ABS(x[i]); + } + } + n1 = (n-i) & -16; + if ( n1 > 0 ) + { + sumf += dasum_kernel_16(n1, &x[i]); + i+=n1; + } +#else n1 = n & -16; if ( n1 > 0 ) { @@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sumf = dasum_kernel_16(n1, x); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/dasum_microk_power10.c b/kernel/power/dasum_microk_power10.c new file mode 100644 index 000000000..d1a21b4d1 --- /dev/null +++ b/kernel/power/dasum_microk_power10.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static double dasum_kernel_16 (long n, double *x) +{ + double sum; + __vector double t0; + __vector double t1; + __vector double t2; + __vector double t3; + + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + "lxvp 40, 0(%2) \n\t" + + + "xvabsdp %x3, 44 \n\t" + "xvabsdp %x4, 45 \n\t" + "lxvp 42, 32(%2) \n\t" + + + "xvabsdp %x5, 46 \n\t" + "xvabsdp %x6, 47 \n\t" + "lxvp 44, 64(%2) \n\t" + + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + + "lxvp 46, 96(%2) \n\t" + + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvadddp 36, 36, %x3 \n\t" + "xvadddp 37, 37, %x4 \n\t" + "addic. %1, %1, -16 \n\t" + "xvadddp 38, 38, %x5 \n\t" + "xvadddp 39, 39, %x6 \n\t" + + "bgt one%= \n" + + "two%=: \n\t" + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + "xvabsdp %x3, 44 \n\t" + "xvabsdp %x4, 45 \n\t" + "xvabsdp %x5, 46 \n\t" + "xvabsdp %x6, 47 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "xvadddp 36, 36, %x3 \n\t" + "xvadddp 37, 37, %x4 \n\t" + "xvadddp 38, 38, %x5 \n\t" + "xvadddp 39, 39, %x6 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + XXSWAPD_S(33,32) + "xsadddp %x0, 32, 33 \n" + + "#n=%1 x=%3=%2 sum=%0\n" + "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" + : + "=d" (sum), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "=wa" (t0), // 3 + "=wa" (t1), // 4 + "=wa" (t2), // 5 + "=wa" (t3) // 6 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); + + return sum; +} + + diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index 733137012..af692a7fa 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
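
For readers following the dasum.c change above and the sasum.c change that follows: both add the same pre-alignment logic in front of the new POWER10 kernels. A scalar-equivalent sketch of that logic (illustrative only; dasum_ref is an invented name, and the real vector kernel replaces the middle loop):

    #include <stdint.h>
    #include <math.h>

    /* Peel 0..3 leading doubles so the pointer handed to the 16-wide
     * vector kernel is 32-byte aligned, then finish the tail in scalar
     * code. The float version is identical except the peel count is
     * ((32 - (addr & 0x1F)) >> 2) & 0x7 and the block width is 32. */
    static double dasum_ref(long n, const double *x)
    {
        double sum = 0.0;
        long i = 0;
        if (n >= 16) {
            long align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
            for (; i < align; i++)
                sum += fabs(x[i]);
        }
        long n1 = (n - i) & -16;       /* largest multiple of 16 remaining */
        for (long j = 0; j < n1; j++)  /* stands in for dasum_kernel_16()  */
            sum += fabs(x[i + j]);
        i += n1;
        for (; i < n; i++)             /* scalar tail */
            sum += fabs(x[i]);
        return sum;
    }
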
#endif -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "sasum_microk_power8.c" +#elif defined(POWER10) +#include "sasum_microk_power10.c" #endif #endif @@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + sumf += ABS(x[i]); + } + } + n1 = (n-i) & -32; + if ( n1 > 0 ) + { + sumf += sasum_kernel_32(n1, &x[i]); + i+=n1; + } +#else n1 = n & -32; if ( n1 > 0 ) { @@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sumf = sasum_kernel_32(n1, x); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/sasum_microk_power10.c b/kernel/power/sasum_microk_power10.c new file mode 100644 index 000000000..ea12a4264 --- /dev/null +++ b/kernel/power/sasum_microk_power10.c @@ -0,0 +1,153 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define HAVE_KERNEL_32 1 + +static float sasum_kernel_32 (long n, float *x) +{ + float sum; + __vector float t0; + __vector float t1; + __vector float t2; + __vector float t3; + + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. 
%1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + "lxvp 40, 0(%2) \n\t" + + "xvabssp %x3, 44 \n\t" + "xvabssp %x4, 45 \n\t" + "lxvp 42, 32(%2) \n\t" + + "xvabssp %x5, 46 \n\t" + "xvabssp %x6, 47 \n\t" + "lxvp 44, 64(%2) \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + + "lxvp 46, 96(%2) \n\t" + + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvaddsp 36, 36, %x3 \n\t" + "xvaddsp 37, 37, %x4 \n\t" + "addic. %1, %1, -32 \n\t" + "xvaddsp 38, 38, %x5 \n\t" + "xvaddsp 39, 39, %x6 \n\t" + + "bgt one%= \n" + + "two%=: \n\t" + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + "xvabssp %x3, 44 \n\t" + "xvabssp %x4, 45 \n\t" + "xvabssp %x5, 46 \n\t" + "xvabssp %x6, 47 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "xvaddsp 36, 36, %x3 \n\t" + "xvaddsp 37, 37, %x4 \n\t" + "xvaddsp 38, 38, %x5 \n\t" + "xvaddsp 39, 39, %x6 \n\t" + + "xvaddsp 32, 32, 33 \n\t" + "xvaddsp 34, 34, 35 \n\t" + "xvaddsp 36, 36, 37 \n\t" + "xvaddsp 38, 38, 39 \n\t" + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 32, 32, 36 \n\t" + + "xxsldwi 33, 32, 32, 2 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xxsldwi 33, 32, 32, 1 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xscvspdp %x0, 32 \n" + + "#n=%1 x=%3=%2 sum=%0\n" + "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" + : + "=f" (sum), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "=wa" (t0), // 3 + "=wa" (t1), // 4 + "=wa" (t2), // 5 + "=wa" (t3) // 6 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); + + return sum; +} From a9f6f7ad390fea938c45a0e4b3b8feb2c1841edf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Mar 2021 14:35:49 +0100 Subject: [PATCH 135/681] Remove spurious AVX512 requirement and add AVX2/FMA3 guard --- kernel/x86_64/srot_microk_haswell-2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/x86_64/srot_microk_haswell-2.c b/kernel/x86_64/srot_microk_haswell-2.c index 8e245cc8f..b5545726e 100644 --- a/kernel/x86_64/srot_microk_haswell-2.c +++ b/kernel/x86_64/srot_microk_haswell-2.c @@ -1,5 +1,4 @@ -/* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#if defined(HAVE_FMA3) && defined(HAVE_AVX2) #define HAVE_SROT_KERNEL 1 From 09d47af2c0451b7d5868e9aeec200b565a6bf25f Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 10 Mar 2021 17:15:33 -0600 Subject: [PATCH 136/681] Optimize zscal function for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. --- kernel/power/zscal.c | 2 +- kernel/power/zscal_microk_power10.c | 195 ++++++++++++++++++++++++++++ 2 files changed, 196 insertions(+), 1 deletion(-) create mode 100644 kernel/power/zscal_microk_power10.c diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 31b3682b9..0068138e8 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
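
The zscal_microk_power10.c kernel added below splats alpha_r into one vector register and (-alpha_i, alpha_i) into another, multiplies each element once as loaded and once with its halves swapped, and adds the two products. Its per-element semantics as a plain-C reference (zscal_ref is an invented name; the real kernel handles 8 complex elements per iteration using lxvp pair loads):

    /* Scale n interleaved complex doubles in place by alpha = ar + i*ai. */
    static void zscal_ref(long n, double *x, double ar, double ai)
    {
        for (long k = 0; k < n; k++) {
            double xr = x[2*k], xi = x[2*k + 1];
            x[2*k]     = xr*ar - xi*ai;   /* xvmuldp by (ar, ar) ...    */
            x[2*k + 1] = xi*ar + xr*ai;   /* ... plus swap * (-ai, ai)  */
        }
    }
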
#endif #elif defined(POWER10) #if defined(DOUBLE) -#include "zscal_microk_power8.c" +#include "zscal_microk_power10.c" #else #include "cscal_microk_power10.c" #endif diff --git a/kernel/power/zscal_microk_power10.c b/kernel/power/zscal_microk_power10.c new file mode 100644 index 000000000..15b8323f4 --- /dev/null +++ b/kernel/power/zscal_microk_power10.c @@ -0,0 +1,195 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) +{ + __vector double t0; + __vector double t1; + __vector double t2; + __vector double t3; + __vector double t4; + __vector double t5; + + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xsnegdp 33, %x10 \n\t" // -alpha_i + XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r + XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addic. 
%1, %1, -8 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 34, 44, 32 \n\t" + "xvmuldp 35, 45, 32 \n\t" + "xvmuldp 36, 46, 32 \n\t" + "xvmuldp 37, 47, 32 \n\t" + + XXSWAPD_S(38,40) + XXSWAPD_S(39,41) + XXSWAPD_S(%x3,42) + XXSWAPD_S(%x4,43) + XXSWAPD_S(%x5,44) + XXSWAPD_S(%x6,45) + XXSWAPD_S(%x7,46) + XXSWAPD_S(%x8,47) + + "xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmuldp 39, 39, 33 \n\t" + + + "xvmuldp %x3, %x3, 33 \n\t" + "xvmuldp %x4, %x4, 33 \n\t" + + + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "xvmuldp %x5, %x5, 33 \n\t" + "xvmuldp %x6, %x6, 33 \n\t" + + + "xvmuldp %x7, %x7, 33 \n\t" + "xvmuldp %x8, %x8, 33 \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + + "xvadddp 48, 48, 38 \n\t" + "xvadddp 49, 49, 39 \n\t" + "xvadddp 50, 50, %x3 \n\t" + "xvadddp 51, 51, %x4 \n\t" + "stxv 49, 0(%2) \n\t" + "stxv 48, 16(%2) \n\t" + "stxv 51, 32(%2) \n\t" + "stxv 50, 48(%2) \n\t" + + + "xvadddp 34, 34, %x5 \n\t" + "xvadddp 35, 35, %x6 \n\t" + + + "xvadddp 36, 36, %x7 \n\t" + "xvadddp 37, 37, %x8 \n\t" + + "stxv 35, 64(%2) \n\t" + "stxv 34, 80(%2) \n\t" + "stxv 37, 96(%2) \n\t" + "stxv 36, 112(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -8 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 34, 44, 32 \n\t" + "xvmuldp 35, 45, 32 \n\t" + "xvmuldp 36, 46, 32 \n\t" + "xvmuldp 37, 47, 32 \n\t" + + XXSWAPD_S(38,40) + XXSWAPD_S(39,41) + XXSWAPD_S(%x3,42) + XXSWAPD_S(%x4,43) + XXSWAPD_S(%x5,44) + XXSWAPD_S(%x6,45) + XXSWAPD_S(%x7,46) + XXSWAPD_S(%x8,47) + + + "xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmuldp 39, 39, 33 \n\t" + "xvmuldp %x3, %x3, 33 \n\t" + "xvmuldp %x4, %x4, 33 \n\t" + "xvmuldp %x5, %x5, 33 \n\t" + "xvmuldp %x6, %x6, 33 \n\t" + "xvmuldp %x7, %x7, 33 \n\t" + "xvmuldp %x8, %x8, 33 \n\t" + + "xvadddp 48, 48, 38 \n\t" + "xvadddp 49, 49, 39 \n\t" + + "xvadddp 50, 50, %x3 \n\t" + "xvadddp 51, 51, %x4 \n\t" + "stxv 49, 0(%2) \n\t" + "stxv 48, 16(%2) \n\t" + "stxv 51, 32(%2) \n\t" + "stxv 50, 48(%2) \n\t" + + "xvadddp 34, 34, %x5 \n\t" + "xvadddp 35, 35, %x6 \n\t" + + + "xvadddp 36, 36, %x7 \n\t" + "xvadddp 37, 37, %x8 \n\t" + + "stxv 35, 64(%2) \n\t" + "stxv 34, 80(%2) \n\t" + "stxv 37, 96(%2) \n\t" + "stxv 36, 112(%2) \n\t" + + "#n=%1 x=%0=%2 alpha=(%9,%10) \n" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x), // 2 + "=wa" (t0), // 3 + "=wa" (t1), // 4 + "=wa" (t2), // 5 + "=wa" (t3), // 6 + "=wa" (t4), // 7 + "=wa" (t5) // 8 + : + "d" (alpha_r), // 9 + "d" (alpha_i) // 10 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} From 9579bd47e53fb65cca2f27e92340d1c08e745068 Mon Sep 17 00:00:00 2001 From: austinpagan Date: Wed, 10 Mar 2021 18:19:12 -0500 Subject: [PATCH 137/681] Modifying a couple paramaters in the "POWER10"-specific section of param.h, for performance enhancements for SGEMM and DGEMM. --- param.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/param.h b/param.h index 9ba25de6a..262f52c88 100644 --- a/param.h +++ b/param.h @@ -2455,13 +2455,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
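
For context on the P/Q changes below: in OpenBLAS, ?GEMM_DEFAULT_P and ?GEMM_DEFAULT_Q size the cache-blocking panels of the packed GEMM buffers, so P*Q*sizeof(element) approximates the working set of one packed A panel. A quick check of the new POWER10 values (the cache-fit reading is our gloss; the commit itself only cites measured performance):

    #include <stdio.h>

    int main(void)
    {
        long sp = 512, sq = 512;   /* new SGEMM P, Q */
        long dp = 384, dq = 512;   /* new DGEMM P, Q */
        printf("SGEMM packed A panel ~ %ld KiB\n",
               sp * sq * (long)sizeof(float)  / 1024);  /* 1024 KiB */
        printf("DGEMM packed A panel ~ %ld KiB\n",
               dp * dq * (long)sizeof(double) / 1024);  /* 1536 KiB */
        return 0;
    }
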
#define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 832 -#define DGEMM_DEFAULT_P 320 +#define SGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_P 384 #define CGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_P 256 -#define SGEMM_DEFAULT_Q 1026 -#define DGEMM_DEFAULT_Q 960 +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 512 #define CGEMM_DEFAULT_Q 1026 #define ZGEMM_DEFAULT_Q 1026 From 3c356b1a1f0d2a6b2209a6ca908212cfafe53971 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 11 Mar 2021 11:51:09 +0100 Subject: [PATCH 138/681] Support compilation with the NAG Fortran compiler --- Makefile.system | 18 ++++++++++++++++++ Makefile.x86_64 | 14 ++++++++++++++ f_check | 38 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 91a078565..ae703e4d9 100644 --- a/Makefile.system +++ b/Makefile.system @@ -899,6 +899,18 @@ endif # Fortran Compiler dependent settings # +ifeq ($(F_COMPILER), NAG) +FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe +ifdef INTERFACE64 +ifneq ($(INTERFACE64), 0) +FCOMMON_OPT += -i8 +endif +endif +ifeq ($(USE_OPENMP), 1) +FCOMMON_OPT += -openmp +endif +endif + ifeq ($(F_COMPILER), FLANG) CCOMMON_OPT += -DF_INTERFACE_FLANG FCOMMON_OPT += -Mrecursive -Kieee @@ -1207,6 +1219,8 @@ CCOMMON_OPT += -fPIC endif ifeq ($(F_COMPILER), SUN) FCOMMON_OPT += -pic +else ifeq ($(F_COMPILER), NAG) +FCOMMON_OPT += -PIC else FCOMMON_OPT += -fPIC endif @@ -1465,6 +1479,10 @@ LAPACK_FFLAGS := $(FFLAGS) LAPACK_FPFLAGS := $(FPFLAGS) endif +ifeq ($(F_COMPILER),NAG) +LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) +endif + LAPACK_CFLAGS = $(CFLAGS) LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H ifdef INTERFACE64 diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 175db823d..5406494c9 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -10,34 +10,46 @@ endif ifdef HAVE_SSE3 CCOMMON_OPT += -msse3 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -msse3 endif +endif ifdef HAVE_SSSE3 CCOMMON_OPT += -mssse3 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -mssse3 endif +endif ifdef HAVE_SSE4_1 CCOMMON_OPT += -msse4.1 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -msse4.1 endif +endif ifndef OLDGCC ifdef HAVE_AVX CCOMMON_OPT += -mavx +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -mavx endif endif +endif ifndef NO_AVX2 ifdef HAVE_AVX2 CCOMMON_OPT += -mavx2 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -mavx2 endif endif +endif ifeq ($(CORE), SKYLAKEX) ifndef DYNAMIC_ARCH ifndef NO_AVX512 CCOMMON_OPT += -march=skylake-avx512 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=skylake-avx512 +endif ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables FCOMMON_OPT += -fno-asynchronous-unwind-tables @@ -59,9 +71,11 @@ ifeq ($(C_COMPILER), GCC) # cooperlake support was added in 10.1 ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) CCOMMON_OPT += -march=cooperlake +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=cooperlake endif endif +endif ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables FCOMMON_OPT += -fno-asynchronous-unwind-tables diff --git a/f_check b/f_check index fe947bf66..54f542eaf 100644 --- a/f_check +++ b/f_check @@ -34,7 +34,7 @@ if ($compiler eq "") { "pathf90", "pathf95", "pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran", "flang", "egfortran", - "ifort"); + "ifort", "nagfor"); OUTER: foreach $lists (@lists) { @@ -64,6 +64,9 @@ if ($compiler eq "") { if (!$?) 
{ $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`; + if ($data eq "") { + $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`; + } if ($data =~ /zhoge_/) { $bu = "_"; } @@ -133,8 +136,16 @@ if ($compiler eq "") { $openmp = "-openmp"; } + if ($data =~ /NAG/) { + $vendor = NAG; + $openmp = "-openmp"; + } + # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; + if ($data eq "") { + $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`; + } if ($data =~ / zho_ge__/) { $need2bu = 1; } @@ -222,6 +233,12 @@ if ($compiler eq "") { $openmp = "-fopenmp"; } + if ($compiler =~ /nagfor/) { + $vendor = NAG; + $bu = "_"; + $openmp = "-openmp"; + } + if ($vendor eq "") { $nofortran = 1; $compiler = "gfortran"; @@ -275,14 +292,20 @@ if (!$?) { if ($?) { $link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; } + #For nagfor + if ($?) { + $link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`; + } $binary = "" if ($?); } - if ($binary eq "") { $link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`; } } +if ( $vendor == NAG) { + $link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`; + } $linker_L = ""; $linker_l = ""; $linker_a = ""; @@ -336,6 +359,7 @@ if ($link ne "") { if ( ($flags =~ /^\-l/) + && ($flags !~ /ibrary/) && ($flags !~ /gfortranbegin/) && ($flags !~ /frtbegin/) && ($flags !~ /pathfstart/) @@ -352,6 +376,16 @@ if ($link ne "") { $linker_l .= $flags . " "; } + if ( $flags =~ /quickfit.o/ && $vendor == NAG) { + $linker_l .= $flags . " "; + } + if ( $flags =~ /safefit.o/ && $vendor == NAG) { + $linker_l .= $flags . " "; + } + if ( $flags =~ /thsafe.o/ && $vendor == NAG) { + $linker_l .= $flags . " "; + } + $linker_a .= $flags . " " if $flags =~ /\.a$/; } From 041a26fd79e56d9807a9ecd9486bd139fd062d6c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 11 Mar 2021 11:52:29 +0100 Subject: [PATCH 139/681] Support compilation with nagfor --- ctest/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ctest/Makefile b/ctest/Makefile index 2a893cae8..15c83a907 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -212,6 +212,9 @@ ifeq ($(C_COMPILER), CLANG) CEXTRALIB = -lomp endif endif +ifeq ($(F_COMPILER), NAG) +CEXTRALIB = -lgomp +endif endif ifeq ($(BUILD_SINGLE),1) From 6ae7af78a38649c446e2b4cf310b48538f8a1db7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 11 Mar 2021 11:53:51 +0100 Subject: [PATCH 140/681] Support compilation with nagfor --- test/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/Makefile b/test/Makefile index 5f653414a..54fa60533 100644 --- a/test/Makefile +++ b/test/Makefile @@ -270,6 +270,9 @@ ifeq ($(C_COMPILER), CLANG) CEXTRALIB = -lomp endif endif +ifeq ($(F_COMPILER), NAG) +CEXTRALIB = -lgomp +endif endif ifeq ($(BUILD_SINGLE),1) From 697e64bbb6651eef92bc910f46b6e1e7b58709d1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 11 Mar 2021 23:03:58 +0100 Subject: [PATCH 141/681] Fix syntax --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index 54f542eaf..20281ea4d 100644 --- a/f_check +++ b/f_check @@ -303,7 +303,7 @@ if (!$?) 
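
Background for the f_check changes in this group of NAG patches: the script compiles a one-line Fortran file and inspects the compiler's output for the mangled symbol. nagfor compiles via C, so -S produces ftest.c rather than ftest.s (hence the new fallback), and its link flags are recovered with -dryrun instead of -v. What the probe is classifying, expressed as C prototypes (illustrative, not code from the script):

    /* Possible manglings of a Fortran routine ZHOGE / ZHO_GE that the
     * BU / need2bu logic distinguishes: */
    void zhoge(void);     /* no trailing underscore                      */
    void zhoge_(void);    /* one trailing underscore (gfortran and most) */
    void zho_ge__(void);  /* doubled underscore when the name itself
                             contains '_' (the ftest3.f probe)           */
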
{ } } -if ( $vendor == NAG) { +if ( $vendor eq "NAG") { $link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`; } $linker_L = ""; From 0934568d9cbc0e7a7d95c00fab807d95d1168bed Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 12 Mar 2021 12:42:05 +0100 Subject: [PATCH 142/681] Move includes under the ifdef for compilers w/o intrinsics support --- kernel/x86_64/sgemm_direct_skylakex.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c index aaadcf151..cc2ac5553 100644 --- a/kernel/x86_64/sgemm_direct_skylakex.c +++ b/kernel/x86_64/sgemm_direct_skylakex.c @@ -1,8 +1,11 @@ /* the direct sgemm code written by Arjan van der Ven */ + + +#if defined(SKYLAKEX) || defined (COOPERLAKE) + #include #include "common.h" -#if defined(SKYLAKEX) || defined (COOPERLAKE) /* * "Direct sgemm" code. This code operates directly on the inputs and outputs * of the sgemm call, avoiding the copies, memory realignments and threading, From 6726771645e32529d19c3c0ffc90f25784c2cc23 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 13 Mar 2021 20:16:18 +0100 Subject: [PATCH 143/681] Support compilation with NAG fortran --- Makefile.arm64 | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/Makefile.arm64 b/Makefile.arm64 index c3fe583e4..23362b4e5 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -1,28 +1,38 @@ ifneq ($(C_COMPILER), PGI) ifeq ($(CORE), ARMV8) CCOMMON_OPT += -march=armv8-a +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a endif +endif ifeq ($(CORE), CORTEXA53) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 endif +endif ifeq ($(CORE), CORTEXA57) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 endif +endif ifeq ($(CORE), CORTEXA72) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif +endif ifeq ($(CORE), CORTEXA73) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 endif +endif # Use a72 tunings because Neoverse-N1 is only available # in GCC>=9 @@ -30,51 +40,71 @@ ifeq ($(CORE), NEOVERSEN1) ifeq ($(GCCVERSIONGTEQ7), 1) ifeq ($(GCCVERSIONGTEQ9), 1) CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 +endif else CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif +endif else CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif endif +endif ifeq ($(CORE), THUNDERX) CCOMMON_OPT += -march=armv8-a -mtune=thunderx +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=thunderx endif +endif ifeq ($(CORE), FALKOR) CCOMMON_OPT += -march=armv8-a -mtune=falkor +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=falkor endif +endif ifeq ($(CORE), THUNDERX2T99) CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif +endif ifeq ($(CORE), THUNDERX3T110) ifeq ($(GCCVERSIONGTEQ10), 1) CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +endif else 
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif endif +endif ifeq ($(CORE), VORTEX) CCOMMON_OPT += -march=armv8.3-a +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.3-a endif +endif ifeq ($(GCCVERSIONGTEQ9), 1) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 endif endif endif +endif From 34753eaebb8b2ddbc256e9e996c1fb315396a2a0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Mar 2021 17:28:43 +0100 Subject: [PATCH 144/681] Include common.h (and indirectly param.h) rather than just param.h to have BLASLONG available w/o circular dependencies --- getarch_2nd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/getarch_2nd.c b/getarch_2nd.c index c390ef52c..53ecccf30 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -4,7 +4,7 @@ #else #include "config_kernel.h" #endif -#include "param.h" +#include "common.h" int main(int argc, char **argv) { From ecb4babcf45e402d6e75702446cedd7242fd2ef8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Mar 2021 17:36:51 +0100 Subject: [PATCH 145/681] remove inclusion of common.h again to avoid circular dependency --- param.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/param.h b/param.h index 508cbb2a5..a37743ef4 100644 --- a/param.h +++ b/param.h @@ -72,8 +72,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef PARAM_H #define PARAM_H -#include "common.h" - #define SBGEMM_DEFAULT_UNROLL_N 4 #define SBGEMM_DEFAULT_UNROLL_M 8 #define SBGEMM_DEFAULT_UNROLL_MN 32 From e9d453b623ee23005604f0526aabbc2ab4128d6d Mon Sep 17 00:00:00 2001 From: xoviat Date: Sun, 14 Mar 2021 16:34:02 -0500 Subject: [PATCH 146/681] disable openmp --- appveyor.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 6197e85ab..c9b2fa3a1 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -31,7 +31,6 @@ environment: matrix: - COMPILER: clang-cl WITH_FORTRAN: ON - USE_OPENMP: ON - COMPILER: clang-cl DYNAMIC_ARCH: ON WITH_FORTRAN: OFF From 186368ddc3775540c147b6300693ccc0bcac7597 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 16 Mar 2021 16:52:57 +0100 Subject: [PATCH 147/681] Fix compilation with CLANG --- driver/others/dynamic_power.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index b8e5840a3..d9c15b312 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -40,7 +40,14 @@ char *gotoblas_corename(void) { return corename[0]; } -#ifdef C_PGI +#if defined(__clang__) +static int __builtin_cpu_supports(char* arg) +{ + return 0; +} +#endif + +#if defined(C_PGI) || defined(__clang__) /* * NV HPC compilers do not yet implement __builtin_cpu_is(). * Fake a version here for use in the CPU detection code below. From 8cdf0825debb529c55d06a7da22de366b049c4f5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 16 Mar 2021 21:20:05 +0100 Subject: [PATCH 148/681] Add workaround for older gcc on ppc64be not supporting casts in defines --- param.h | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/param.h b/param.h index a37743ef4..c41f75ec9 100644 --- a/param.h +++ b/param.h @@ -72,6 +72,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
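
Looking back at the dynamic_power.c fix above: clang did not provide __builtin_cpu_supports() on POWER at the time, so the patch defines a stub that always returns 0, which keeps DYNAMIC_ARCH builds compiling and lets the dispatcher settle on its baseline core. A sketch of the effect at a call site (select_core and the feature-string check are illustrative, not from the patch):

    #if defined(__clang__)
    static int __builtin_cpu_supports(char *arg) { return 0; }
    #endif

    static const char *select_core(void)
    {
        if (__builtin_cpu_supports("arch_3_00"))   /* always false w/ stub */
            return "POWER9";
        return "POWER8";                           /* safe baseline        */
    }
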
#ifndef PARAM_H #define PARAM_H +#define LONGCAST (BLASLONG) +#if defined(__BYTE_ORDER__) +#if __GNUC__ < 9 +#undef LONGCAST +#define LONGCAST +#endif +#endif + #define SBGEMM_DEFAULT_UNROLL_N 4 #define SBGEMM_DEFAULT_UNROLL_M 8 #define SBGEMM_DEFAULT_UNROLL_MN 32 @@ -2088,7 +2096,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef PPCG4 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 1024 -#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2119,7 +2127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 2688 #define GEMM_DEFAULT_OFFSET_B 3072 -#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL #if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) #define SGEMM_DEFAULT_UNROLL_M 4 @@ -2168,7 +2176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A (32 * 0) #define GEMM_DEFAULT_OFFSET_B (32 * 0) -#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2204,7 +2212,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A (32 * 0) #define GEMM_DEFAULT_OFFSET_B (32 * 0) -#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2239,7 +2247,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER3) || defined(POWER4) || defined(POWER5) #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 -#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2312,7 +2320,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 384 #define GEMM_DEFAULT_OFFSET_B 1024 -#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2345,7 +2353,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 -#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL #if defined(__32BIT__) #warning using BINARY32==POWER6 #define SGEMM_DEFAULT_UNROLL_M 4 @@ -2398,7 +2406,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL #define SWITCH_RATIO 16 #define GEMM_PREFERED_SIZE 16 @@ -2437,7 +2445,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
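
Condensing the LONGCAST workaround introduced at the top of this diff: GEMM_DEFAULT_ALIGN had been spelled with a (BLASLONG) cast inside the #define, which the targeted old big-endian gcc builds reject, so the cast is factored into a macro that conditionally expands to nothing. The two expansions side by side (a flattened restatement, equivalent to the nested #if in the patch):

    typedef long BLASLONG;                /* stand-in; see common.h */

    #if defined(__BYTE_ORDER__) && (__GNUC__ < 9)
    #define LONGCAST                      /* old BE gcc: bare constant */
    #else
    #define LONGCAST (BLASLONG)           /* normal case: widen first  */
    #endif

    #define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
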
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL #define SWITCH_RATIO 16 #define GEMM_PREFERED_SIZE 16 From 7888b5127c4e6a6ac457224583d931c29b3ec88e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 17 Mar 2021 16:17:55 +0100 Subject: [PATCH 149/681] Update Changelog for 0.3.14 --- Changelog.txt | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index cbc7007ac..5662bc5c6 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,52 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.14 + 17-Mar-2021 + + common: + * Fixed a race condition on thread shutdown in non-OpenMP builds + * Fixed custom BUFFERSIZE option getting ignored in gmake builds + * Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms + * Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT + * Improved performance of OMATCOPY_RT across all platforms + * Changed perl scripts to use env instead of a hardcoded /usr/bin/perl + * Fixed potential misreading of the GCC compiler version in the build scripts + * Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477) + * Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335) + + RISCV: + * Fixed compilation on RISCV (missing entry in getarch) + + POWER: + * Fixed compilation for DYNAMIC_ARCH with clang and with old gcc versions + * Added support for compilation on FreeBSD/ppc64le + * Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL + * Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM + * Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10 + * Improved SCOPY and CCOPY performance on POWER10 + * Improved SGEMM and DGEMM performance on POWER10 + * Added support for compilation with the NVIDIA HPC compiler + + x86_64: + * Added an optimized bfloat16 GEMM kernel for Cooperlake + * Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus + * Improved the performance of SASUM,DASUM,SROT,DROT on AMD Ryzen cpus + * Added support for compilation with the NAG Fortran compiler + * Fixed recognition of the AMD AOCC compiler + * Fixed compilation for DYNAMIC_ARCH with clang on Windows + * Added support for running the BLAS/CBLAS tests on Windows + * Fixed signatures of the tls callback functions for Windows x64 + * Fixed various issues with fma intrinsics support handling + + ARM: + * Added support for embedded Cortex M targets via a new option EMBEDDED + + ARMV8: + * Fixed the THUNDERX2T99 and NEOVERSEN1 DNRM2/ZNRM2 kernels for inputs with Inf + * Added support for the DYNAMIC_LIST option + * Added support for compilation with the NVIDIA HPC compiler + * Added support for compiling with the NAG Fortran compiler + ==================================================================== Version 0.3.13 12-Dec-2020 From 2663e44724737bad34abdb6aff770d2c9c2fb09e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 17 Mar 2021 20:20:00 +0100 Subject: [PATCH 150/681] Update version to 0.3.14 for release --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4f34d5337..3107ef9a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) 
-set(OpenBLAS_PATCH_VERSION 13.dev) +set(OpenBLAS_PATCH_VERSION 14) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 86de5f768b7013c6c6788bc3e12df122b8c53196 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 17 Mar 2021 20:20:34 +0100 Subject: [PATCH 151/681] Update version to 0.3.14 for release --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index c68c20923..5a46bf6b0 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.13.dev +VERSION = 0.3.14 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From e10745445401dcfba3c3e15881006bc8bc14f4f7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 17 Mar 2021 21:14:05 +0100 Subject: [PATCH 152/681] Update version to 0.3.14.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3107ef9a9..d0313c842 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 14) +set(OpenBLAS_PATCH_VERSION 14.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From d90ca75a6c243357f430f4276d798e9b21d2a464 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 17 Mar 2021 21:14:42 +0100 Subject: [PATCH 153/681] Update version to 0.3.14.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 5a46bf6b0..38d0161a3 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.14 +VERSION = 0.3.14.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From 3fd6ccdf7610014c11f4f5e82c3f9ce16a0945ce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 18 Mar 2021 07:50:19 +0100 Subject: [PATCH 154/681] Include just the definition of BLASLONG rather than all of common.h --- getarch_2nd.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/getarch_2nd.c b/getarch_2nd.c index 53ecccf30..dd1f83089 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -4,7 +4,15 @@ #else #include "config_kernel.h" #endif -#include "common.h" +#if (defined(__WIN32__) || defined(__WIN64__) || defined(__CYGWIN32__) || defined(__CYGWIN64__) || defined(_WIN32) || defined(_WIN64)) && defined(__64BIT__) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#include "param.h" int main(int argc, char **argv) { From 7b294a99fde37f9657cdd7a261318e97e36fe351 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 18 Mar 2021 21:28:19 +0100 Subject: [PATCH 155/681] Move common.h back to the top of the file so that SKYLAKEX (from config.h) is defined in time --- kernel/x86_64/sgemm_direct_skylakex.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c index cc2ac5553..2588289d1 100644 --- a/kernel/x86_64/sgemm_direct_skylakex.c +++ b/kernel/x86_64/sgemm_direct_skylakex.c @@ -1,10 +1,10 @@ /* the direct sgemm code written by Arjan van der Ven */ - +#include "common.h" #if defined(SKYLAKEX) || defined (COOPERLAKE) #include -#include "common.h" + /* * "Direct sgemm" code. This code operates directly on the inputs and outputs @@ -472,7 +472,7 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG s } } #else -#include "common.h" + void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) {} #endif From 0f5e86a0d99d1432bf1f5919992f395147d4f72c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 18 Mar 2021 21:53:50 +0100 Subject: [PATCH 156/681] Remove premature entry for DOMATCOPY_RT --- kernel/x86_64/KERNEL | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 5da79cc3f..bea7036c2 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -491,4 +491,3 @@ SSUMKERNEL = ../arm/sum.c DSUMKERNEL = ../arm/sum.c SOMATCOPY_RT = omatcopy_rt.c -DOMATCOPY_RT = omatcopy_rt.c From d3555d2e505acf3b1a4f059d0a78177d8eb56a18 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 19 Mar 2021 11:44:31 +0100 Subject: [PATCH 157/681] Add workaround for LAPACK test failures with the NVIDIA HPC compiler --- kernel/power/KERNEL.POWER8 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index c2f4cd204..2b8e65948 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -242,8 +242,13 @@ ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c +ifeq ($(C_COMPILER), PGI) +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c +else CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c +endif # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c From ef85c2247419647212de39228433e292a8a18625 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 19 Mar 2021 11:46:25 +0100 Subject: [PATCH 158/681] Add workaround for LAPACK test failures with the NVIDIA HPC compiler --- 
kernel/power/KERNEL.POWER9 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 2bd2516de..b6b102b3e 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -166,8 +166,13 @@ ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c +ifeq ($(C_COMPILER), PGI) +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c +else CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c +endif # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c From 86c5a0013fdbd87832cdd5f0a3446aac0aa43804 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 19 Mar 2021 11:47:58 +0100 Subject: [PATCH 159/681] Add workaround for LAPACK testsuite failures with the NVIDIA HPC compiler --- kernel/power/KERNEL.POWER10 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 1cf7b0b7c..594b1a35a 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -169,8 +169,13 @@ ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c +ifeq ($(C_COMPILER), PGI) +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c +else CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c +endif # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c From 198adea9611e5b384166fea9ce607fb1cbcad792 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Fri, 19 Mar 2021 10:05:23 -0400 Subject: [PATCH 160/681] Changed default P/Q values for CGEMM and ZGEMM (Power10 only) --- param.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/param.h b/param.h index c41f75ec9..a35ce69bd 100644 --- a/param.h +++ b/param.h @@ -2466,13 +2466,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 384 -#define CGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_Q 512 #define DGEMM_DEFAULT_Q 512 -#define CGEMM_DEFAULT_Q 1026 -#define ZGEMM_DEFAULT_Q 1026 +#define CGEMM_DEFAULT_Q 384 +#define ZGEMM_DEFAULT_Q 384 #define SGEMM_DEFAULT_R 4096 #define DGEMM_DEFAULT_R 4096 From 292a0aed66dd049825af65b6dd75a26cfb423064 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 Mar 2021 06:55:14 +0100 Subject: [PATCH 161/681] Fix xcode12 build and add OSX/OpenMP --- .travis.yml | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index bde0e202d..47064672a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -224,12 +224,21 @@ matrix: before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - brew update - - brew install gcc@10 script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - + - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 FC=gfortran-10" + + - <<: *test-macos + osx_image: xcode12 + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update + script: + - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + env: + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" + # - <<: *test-macos # osx_image: xcode10 # env: From 70b89a6205d3c4568888c46559d88c642dd34bec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 Mar 2021 07:50:35 +0100 Subject: [PATCH 162/681] Add OSX build to Azure --- azure-pipelines.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 639cb3558..49e53cbda 100644 --- a/azure-pipelines.yml +++ 
b/azure-pipelines.yml @@ -68,4 +68,10 @@ jobs: dir openblas_utest.exe - +- job: OSX_OpenMP + pool: + vmImage: 'macOS-10.15' + steps: + - script: | + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 + From dbb33f412f7687d047153a9c2dd6bb0a7d2c11de Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 Mar 2021 08:30:48 +0100 Subject: [PATCH 163/681] Update azure-pipelines.yml --- azure-pipelines.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 49e53cbda..5040ae697 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -73,5 +73,6 @@ jobs: vmImage: 'macOS-10.15' steps: - script: | - make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 CC=clang + From e6664ec2c9cbc584faf3f4fe15cfe706767812d2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 Mar 2021 08:41:48 +0100 Subject: [PATCH 164/681] Update azure-pipelines.yml --- azure-pipelines.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5040ae697..2933fa358 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -73,6 +73,9 @@ jobs: vmImage: 'macOS-10.15' steps: - script: | - make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 CC=clang + brew update + brew install gcc@10 + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 CC=gcc + From 9dc0bfd617f94d5cb54ab52c428843999e8ea98e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 Mar 2021 08:54:30 +0100 Subject: [PATCH 165/681] Update azure-pipelines.yml --- azure-pipelines.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 2933fa358..cd3f7943f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -74,8 +74,7 @@ jobs: steps: - script: | brew update - brew install gcc@10 - make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 CC=gcc + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 CC=gcc-10 From e69b0b177101cd883768820fe639a4fb14466029 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 Mar 2021 10:34:24 +0100 Subject: [PATCH 166/681] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cd3f7943f..fdf184b22 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -74,7 +74,7 @@ jobs: steps: - script: | brew update - make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 CC=gcc-10 + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 CC=gcc-10 FC=gfortran-10 From 8fd694c18fe539c3dc2d5ff4965afed70ade4123 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 Mar 2021 10:36:29 +0100 Subject: [PATCH 167/681] Update .travis.yml --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 47064672a..2a221e3bd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -227,7 +227,7 @@ matrix: script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 FC=gfortran-10" + - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10" - <<: *test-macos osx_image: xcode12 From d57c681a6df3b40cc17747338a2b0f657cfc05fb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 26 Mar 2021 22:29:29 +0100 Subject: [PATCH 168/681] Fix compilation on older OSX versions --- benchmark/bench.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmark/bench.h b/benchmark/bench.h index 
83de8ab2b..c03d72bef 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -3,6 +3,8 @@ #include #ifdef __CYGWIN32__ #include +#elif defined(__APPLE__) +#include #endif #include "common.h" From d2bda3b56a06a30623a840408ee8874d54d1058c Mon Sep 17 00:00:00 2001 From: CodesWithWolves Date: Wed, 31 Mar 2021 15:38:07 -0400 Subject: [PATCH 169/681] Remove Unnecessary/Erroneous Reads In sgemm_tcopy_16.S COPY1x8 Macro There appears to have been some code leak when copying from the COPY2x8 macro above where we're reading 8 bytes into d4-d7 directly after reading 4 bytes into s4-s7. These 32 bytes in d4-7 are unused and can possibly overrun the boundary of allocated memory -- Valgrind detected this which is what dragged my attention to it for a 128,1 copy. Additionally, there is no need to update the addresses stored in A0-A7 as the only possible paths after running this macro will overwrite A0-7 if looping to the next 8 rows, or overwrite A0-3 if moving to 4 rows -- in which case A4-7 are unused. --- kernel/arm64/sgemm_tcopy_16.S | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S index 12b80bdca..46198b3a2 100644 --- a/kernel/arm64/sgemm_tcopy_16.S +++ b/kernel/arm64/sgemm_tcopy_16.S @@ -270,11 +270,6 @@ All rights reserved. ldr s1, [A02] ldr s2, [A03] ldr s3, [A04] - - add A01, A01, #4 - add A02, A02, #4 - add A03, A03, #4 - add A04, A04, #4 stp s0, s1, [B04] add B04, B04, #8 @@ -285,11 +280,6 @@ All rights reserved. ldr s5, [A06] ldr s6, [A07] ldr s7, [A08] - - ldr d4, [A05], #8 - ldr d5, [A06], #8 - ldr d6, [A07], #8 - ldr d7, [A08], #8 stp s4, s5, [B04] add B04, B04, #8 From 2dbcddd83d45d32191d8e409ac3eca5672128bca Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 31 Mar 2021 21:32:42 -0500 Subject: [PATCH 170/681] POWER10: Adding check for little endian This patch makes sure that recent POWER10 patches are used only for little endian. --- kernel/power/cdot.c | 4 ++-- kernel/power/cswap.c | 4 +++- kernel/power/dasum.c | 6 ++++-- kernel/power/drot.c | 6 ++++-- kernel/power/dscal.c | 8 +++++--- kernel/power/dswap.c | 6 ++++-- kernel/power/sasum.c | 6 ++++-- kernel/power/srot.c | 6 ++++-- kernel/power/sscal.c | 8 +++++--- kernel/power/sswap.c | 6 ++++-- kernel/power/zscal.c | 6 +++++- kernel/power/zswap.c | 4 +++- 12 files changed, 47 insertions(+), 23 deletions(-) diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c index c53fe0c02..b9e2d2ce5 100644 --- a/kernel/power/cdot.c +++ b/kernel/power/cdot.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #include "common.h" -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "cdot_microk_power10.c" #else #ifndef HAVE_KERNEL_8 @@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((inc_x == 1) && (inc_y == 1)) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) BLASLONG n1 = n & -16; #else BLASLONG n1 = n & -8; diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index 4d9b9ccd6..c2fde1c44 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
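
The remaining hunks in this commit all apply the same three-way include guard; condensed to its essentials (the cswap.c form, with the pre-existing POWER8/POWER9 branch elided):

    #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
    #include "cswap_microk_power10.c"   /* new LE-only fast path       */
    #elif defined(POWER10)
    #include "cswap_microk_power8.c"    /* BE POWER10: proven fallback */
    #endif

The POWER10 microkernels added in the preceding commits were presumably written and validated only for little-endian lane order, so big-endian POWER10 builds are routed back to the POWER8-era kernels (or the generic C paths) rather than risking wrong results.
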
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "cswap_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "cswap_microk_power10.c" +#elif defined(POWER10) +#include "cswap_microk_power8.c" #endif #endif diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 0cdec3292..7507621cf 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -49,8 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dasum_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "dasum_microk_power10.c" +#elif defined(POWER10) +#include "dasum_microk_power8.c" #endif #endif @@ -112,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/drot.c b/kernel/power/drot.c index 94d9d95a3..3229878e4 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -42,8 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "drot_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "drot_microk_power10.c" +#elif defined(POWER10) +#include "drot_microk_power8.c" #endif #endif @@ -117,7 +119,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 96c4e51bc..32c39a8f4 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -38,8 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dscal_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "dscal_microk_power10.c" +#elif defined(POWER10) +#include "dscal_microk_power8.c" #endif #endif @@ -102,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; @@ -136,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index 9e6229c6a..12476965b 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -38,8 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dswap_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "swap_microk_power10.c" +#elif defined(POWER10) +#include "dswap_microk_power8.c" #endif #endif @@ -117,7 +119,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index af692a7fa..991d27508 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -49,8 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sasum_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "sasum_microk_power10.c" +#elif defined(POWER10) +#include "sasum_microk_power8.c" #endif #endif @@ -112,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/srot.c b/kernel/power/srot.c index 3e4f93e2a..5a0d4b12e 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -42,8 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "srot_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "srot_microk_power10.c" +#elif defined(POWER10) +#include "srot_microk_power8.c" #endif #endif @@ -117,7 +119,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index 65572a8c1..9ae9ccab8 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -38,8 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sscal_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "sscal_microk_power10.c" +#elif defined(POWER10) +#include "sscal_microk_power8.c" #endif #endif @@ -104,7 +106,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; @@ -138,7 +140,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index dd249fd36..955ed02f0 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -38,8 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sswap_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "swap_microk_power10.c" +#elif defined(POWER10) +#include "sswap_microk_power8.c" #endif #endif @@ -117,7 +119,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 64 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 0068138e8..59ddc149f 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -43,12 +43,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #if defined(DOUBLE) #include "zscal_microk_power10.c" #else #include "cscal_microk_power10.c" #endif +#elif defined(POWER10) +#if defined(DOUBLE) +#include "zscal_microk_power8.c" +#endif #endif #endif diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 6cd3d9664..908802b71 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "zswap_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "cswap_microk_power10.c" +#elif defined(POWER10) +#include "zswap_microk_power8.c" #endif #endif From 081d5ae9717f41a5639d9a83c4fd10614c92bc0f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 3 Apr 2021 22:11:14 +0200 Subject: [PATCH 171/681] Fix typo and potentially undefined variables (copies fixes made in Reference-LAPACK PR 477 after the initial cherrypick) --- lapack-netlib/SRC/chgeqz.f | 7 +++++-- lapack-netlib/SRC/zhgeqz.f | 5 ++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/SRC/chgeqz.f b/lapack-netlib/SRC/chgeqz.f index 4725e7169..bcf5acd0b 100644 --- a/lapack-netlib/SRC/chgeqz.f +++ b/lapack-netlib/SRC/chgeqz.f @@ -319,14 +319,14 @@ REAL ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL, $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, - $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, + $ CTEMP3, ESHIFT, S, SHIFT, SIGNBC, $ U12, X, ABI12, Y * .. * .. External Functions .. COMPLEX CLADIV LOGICAL LSAME REAL CLANHS, SLAMCH - EXTERNAL CLADIV, LLSAME, CLANHS, SLAMCH + EXTERNAL CLADIV, LSAME, CLANHS, SLAMCH * .. * .. External Subroutines .. EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA @@ -351,6 +351,7 @@ ILSCHR = .TRUE. ISCHUR = 2 ELSE + ILSCHR = .TRUE. ISCHUR = 0 END IF * @@ -364,6 +365,7 @@ ILQ = .TRUE. ICOMPQ = 3 ELSE + ILQ = .TRUE. ICOMPQ = 0 END IF * @@ -377,6 +379,7 @@ ILZ = .TRUE. ICOMPZ = 3 ELSE + ILZ = .TRUE. ICOMPZ = 0 END IF * diff --git a/lapack-netlib/SRC/zhgeqz.f b/lapack-netlib/SRC/zhgeqz.f index b28ae47a4..960244727 100644 --- a/lapack-netlib/SRC/zhgeqz.f +++ b/lapack-netlib/SRC/zhgeqz.f @@ -319,7 +319,7 @@ DOUBLE PRECISION ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL, $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, - $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, + $ CTEMP3, ESHIFT, S, SHIFT, SIGNBC, $ U12, X, ABI12, Y * .. * .. External Functions .. @@ -352,6 +352,7 @@ ILSCHR = .TRUE. ISCHUR = 2 ELSE + ILSCHR = .TRUE. ISCHUR = 0 END IF * @@ -365,6 +366,7 @@ ILQ = .TRUE. ICOMPQ = 3 ELSE + ILQ = .TRUE. ICOMPQ = 0 END IF * @@ -378,6 +380,7 @@ ILZ = .TRUE. ICOMPZ = 3 ELSE + ILZ = .TRUE. 
ICOMPZ = 0 END IF * From d393f1923f1efa8ddef2079bab6e8c95def64d64 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 3 Apr 2021 22:18:15 +0200 Subject: [PATCH 172/681] Fix spillover of host-specific build flags into the shared part of DYNAMIC_ARCH builds with gmake for #3139 --- Makefile.x86_64 | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 5406494c9..7f5f8d0be 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -8,6 +8,20 @@ endif endif endif + +ifndef DYNAMIC_ARCH +define ADD_CPUFLAGS +1 +endef +else +ifdef TARGET_CORE +define ADD_CPUFLAGS +1 +endef +endif +endif + +ifdef ADD_CPUFLAGS ifdef HAVE_SSE3 CCOMMON_OPT += -msse3 ifneq ($(F_COMPILER), NAG) @@ -44,7 +58,6 @@ endif endif ifeq ($(CORE), SKYLAKEX) -ifndef DYNAMIC_ARCH ifndef NO_AVX512 CCOMMON_OPT += -march=skylake-avx512 ifneq ($(F_COMPILER), NAG) @@ -62,10 +75,8 @@ endif endif endif endif -endif ifeq ($(CORE), COOPERLAKE) -ifndef DYNAMIC_ARCH ifndef NO_AVX512 ifeq ($(C_COMPILER), GCC) # cooperlake support was added in 10.1 @@ -88,7 +99,6 @@ endif endif endif endif -endif ifdef HAVE_AVX2 ifndef NO_AVX2 @@ -120,6 +130,7 @@ endif endif endif +endif ifeq ($(OSNAME), Interix) From 1ae607beca84ff850e3c5e435a57002486a797d5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Apr 2021 12:31:22 +0200 Subject: [PATCH 173/681] Update Makefile.x86_64 --- Makefile.x86_64 | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 7f5f8d0be..f62ab9e5e 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -10,14 +10,10 @@ endif ifndef DYNAMIC_ARCH -define ADD_CPUFLAGS -1 -endef +ADD_CPUFLAGS = 1 else ifdef TARGET_CORE -define ADD_CPUFLAGS -1 -endef +ADD_CPUFLAGS = 1 endif endif From 5332cbae18174720aa9ea511f774d11975e7b4bb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Apr 2021 23:12:17 +0200 Subject: [PATCH 174/681] Avoid adding host-specific cpuflags to the common part of DYNAMIC_ARCH builds --- Makefile.x86 | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/Makefile.x86 b/Makefile.x86 index 0e27264d8..893379c33 100644 --- a/Makefile.x86 +++ b/Makefile.x86 @@ -1,10 +1,21 @@ # COMPILER_PREFIX = mingw32- +ifndef DYNAMIC_ARCH +ADD_CPUFLAGS = 1 +else +ifdef TARGET_CORE +ADD_CPUFLAGS = 1 +endif +endif + +ifdef ADD_CPUFLAGS ifdef HAVE_SSE CCOMMON_OPT += -msse +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -msse endif - +endif +endif ifeq ($(OSNAME), Interix) ARFLAGS = -m x86 From 725432efaabfcd2c0358a19a91a4ac9f629a2b44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E9=9B=A8=E5=9F=B9?= Date: Wed, 7 Apr 2021 00:10:41 +0800 Subject: [PATCH 175/681] pass NO_AVX512 macro def --- cmake/system.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index eee429113..d6c71b774 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -299,6 +299,10 @@ if (NO_AVX2) set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2") endif () +if (NO_AVX512) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") +endif () + if (USE_THREAD) # USE_SIMPLE_THREADED_LEVEL3 = 1 # NO_AFFINITY = 1 From 2dfb24730d5f6a312b91af4ae47d553e05b2cb17 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 6 Apr 2021 19:58:32 +0200 Subject: [PATCH 176/681] Use "old" compute(24) function with clang due to register limitations --- kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c 
b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c index f3d614242..2db8b2fea 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c @@ -501,7 +501,11 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f int32_t permil[16] = {0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3}; BLASLONG n_count = n; float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; +#if defined(__clang__) + for(;n_count>23;n_count-=24) COMPUTE(24) +#else for(;n_count>23;n_count-=24) COMPUTE_n24 +#endif for(;n_count>19;n_count-=20) COMPUTE(20) for(;n_count>15;n_count-=16) COMPUTE(16) for(;n_count>11;n_count-=12) COMPUTE(12) From 558724e99f09353ee31b2bbe5fd09beb5522dd53 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 9 Apr 2021 10:03:31 +0200 Subject: [PATCH 177/681] Fix implicit typing of new variable TWO --- lapack-netlib/SRC/slanv2.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/slanv2.f b/lapack-netlib/SRC/slanv2.f index e678305f2..375645b75 100644 --- a/lapack-netlib/SRC/slanv2.f +++ b/lapack-netlib/SRC/slanv2.f @@ -139,7 +139,7 @@ * ===================================================================== * * .. Parameters .. - REAL ZERO, HALF, ONE + REAL ZERO, HALF, ONE, TWO PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0, $ TWO = 2.0E+0 ) REAL MULTPL From e96f5e3c6500f2fe58c6e8d19d1e3db3e71fa893 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 9 Apr 2021 10:04:15 +0200 Subject: [PATCH 178/681] Fix implicit typing of new variable TWO --- lapack-netlib/SRC/dlanv2.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/dlanv2.f b/lapack-netlib/SRC/dlanv2.f index 61b016f16..1c277c6bb 100644 --- a/lapack-netlib/SRC/dlanv2.f +++ b/lapack-netlib/SRC/dlanv2.f @@ -139,7 +139,7 @@ * ===================================================================== * * .. Parameters .. - DOUBLE PRECISION ZERO, HALF, ONE + DOUBLE PRECISION ZERO, HALF, ONE, TWO PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0, $ TWO = 2.0D0 ) DOUBLE PRECISION MULTPL From 55bb9f639a9100addad028242b9ef1daf5f17a6a Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Sat, 10 Apr 2021 19:00:24 -0500 Subject: [PATCH 179/681] POWER10: Optimized zgemv This patch makes use of Matrix-Multiply Assist (MMA) feature introduced in POWER ISA v3.1 for zgemv_n and zgemv_t. --- kernel/power/KERNEL.POWER10 | 2 +- kernel/power/zgemv_n_power10.c | 1102 ++++++++++++++++++++++++++++++++ kernel/power/zgemv_t_4.c | 129 ++++ 3 files changed, 1232 insertions(+), 1 deletion(-) create mode 100644 kernel/power/zgemv_n_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 594b1a35a..873653f1e 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -186,7 +186,7 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = sgemv_n.c DGEMVNKERNEL = dgemv_n_power10.c CGEMVNKERNEL = cgemv_n.c -ZGEMVNKERNEL = zgemv_n_4.c +ZGEMVNKERNEL = zgemv_n_power10.c # SGEMVTKERNEL = sgemv_t.c DGEMVTKERNEL = dgemv_t_power10.c diff --git a/kernel/power/zgemv_n_power10.c b/kernel/power/zgemv_n_power10.c new file mode 100644 index 000000000..f5bb8d70e --- /dev/null +++ b/kernel/power/zgemv_n_power10.c @@ -0,0 +1,1102 @@ +/*************************************************************************** +Copyright (c) 2018, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include +#include +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) + +#define HAVE_KERNEL_4x4_VEC 1 +#define HAVE_KERNEL_4x2_VEC 1 +#define HAVE_KERNEL_4x1_VEC 1 +#define HAVE_KERNEL_ADDY 1 + +#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) +#include +#endif +#endif + +// +#define NBMAX 4096 + +#ifdef HAVE_KERNEL_4x4_VEC_ASM + +#elif HAVE_KERNEL_4x4_VEC +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define SAVE_RESULT(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][0] = result[0][0] - result[1][1]; \ + result[0][1] = result[0][1] + result[1][0]; \ + result[1][0] = result[2][0] - result[3][1]; \ + result[1][1] = result[2][1] + result[3][0]; \ + rowC = (v4sf_t *) &y[i2 + J]; \ + rowC[0] += result[0]; \ + rowC[1] += result[1]; +#else +#define SAVE_RESULT(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][0] = result[0][0] + result[1][1]; \ + result[0][1] = result[0][1] - result[1][0]; \ + result[1][0] = result[2][0] + result[3][1]; \ + result[1][1] = result[2][1] - result[3][0]; \ + rowC = (v4sf_t *) &y[i2 + J]; \ + rowC[0] += result[0]; \ + rowC[1] += result[1]; +#endif + +static void zgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector_quad acc0, acc1, acc2, acc3; + v4sf_t result[4]; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + + register __vector double vx0_r = {x[0], x[1]}; + register __vector double vx1_r = {x[2], x[3]}; + register __vector double vx2_r = {x[4], x[5]}; + register __vector double vx3_r = {x[6], x[7]}; + register __vector double vx4_r = {x[8], x[9]}; + register __vector double vx5_r = {x[10], x[11]}; + register __vector double vx6_r = {x[12], 
x[13]}; + register __vector double vx7_r = {x[14], x[15]}; + __vector_pair *Va0, *Va1, *Va2, *Va3; + __vector_pair *Va4, *Va5, *Va6, *Va7; + BLASLONG i = 0, i2 = 0; + v4sf_t *rowC; + BLASLONG tmp = (n / 8) * 8; + for (i = 0; i < tmp; i += 8) { + i2 = i*2; + Va0 = ((__vector_pair*)((void*)&a0[i2])); + Va1 = ((__vector_pair*)((void*)&a1[i2])); + Va2 = ((__vector_pair*)((void*)&a2[i2])); + Va3 = ((__vector_pair*)((void*)&a3[i2])); + Va4 = ((__vector_pair*)((void*)&a4[i2])); + Va5 = ((__vector_pair*)((void*)&a5[i2])); + Va6 = ((__vector_pair*)((void*)&a6[i2])); + Va7 = ((__vector_pair*)((void*)&a7[i2])); + + __builtin_mma_xvf64ger (&acc0, Va0[0], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc1, Va0[1], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc0, Va1[0], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc1, Va1[1], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc0, Va2[0], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc1, Va2[1], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc0, Va3[0], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc1, Va3[1], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc0, Va4[0], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc1, Va4[1], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc0, Va5[0], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc1, Va5[1], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc0, Va6[0], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc1, Va6[1], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc0, Va7[0], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc1, Va7[1], (vec_t ) vx7_r); + __builtin_mma_xvf64ger (&acc2, Va0[2], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc3, Va0[3], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc2, Va1[2], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc3, Va1[3], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc2, Va2[2], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc3, Va2[3], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc2, Va3[2], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc3, Va3[3], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc2, Va4[2], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc3, Va4[3], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc2, Va5[2], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc3, Va5[3], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc2, Va6[2], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc3, Va6[3], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc2, Va7[2], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc3, Va7[3], (vec_t ) vx7_r); + SAVE_RESULT(&acc0, 0); + SAVE_RESULT(&acc1, 4); + SAVE_RESULT(&acc2, 8); + SAVE_RESULT(&acc3, 12); + } + while (i < n) { + i2 = i*2; + Va0 = ((__vector_pair*)((void*)&a0[i2])); + Va1 = ((__vector_pair*)((void*)&a1[i2])); + Va2 = ((__vector_pair*)((void*)&a2[i2])); + Va3 = ((__vector_pair*)((void*)&a3[i2])); + Va4 = ((__vector_pair*)((void*)&a4[i2])); + Va5 = ((__vector_pair*)((void*)&a5[i2])); + Va6 = ((__vector_pair*)((void*)&a6[i2])); + Va7 = ((__vector_pair*)((void*)&a7[i2])); + + __builtin_mma_xvf64ger (&acc0, Va0[0], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc1, Va0[1], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc0, Va1[0], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc1, Va1[1], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc0, Va2[0], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc1, Va2[1], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc0, Va3[0], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc1, Va3[1], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc0, Va4[0], (vec_t ) vx4_r); + 
__builtin_mma_xvf64gerpp (&acc1, Va4[1], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc0, Va5[0], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc1, Va5[1], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc0, Va6[0], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc1, Va6[1], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc0, Va7[0], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc1, Va7[1], (vec_t ) vx7_r); + SAVE_RESULT(&acc0, 0); + SAVE_RESULT(&acc1, 4); + i += 4; + } +} +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + register __vector double vx1_r = {x[2], x[2]}; + register __vector double vx1_i = {-x[3], x[3]}; + register __vector double vx2_r = {x[4], x[4]}; + register __vector double vx2_i = {-x[5], x[5]}; + register __vector double vx3_r = {x[6], x[6]}; + register __vector double vx3_i = {-x[7], x[7]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; + register __vector double vx1_r = {x[2], -x[2]}; + register __vector double vx1_i = {x[3], x[3]}; + register __vector double vx2_r = {x[4], -x[4]}; + register __vector double vx2_i = {x[5], x[5]}; + register __vector double vx3_r = {x[6], -x[6]}; + register __vector double vx3_i = {x[7], x[7]}; +#endif + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + register __vector double *vptr_a1 = (__vector double *) a1; + register __vector double *vptr_a2 = (__vector double *) a2; + register __vector double *vptr_a3 = (__vector double *) a3; + + + register __vector double vy_0; + register __vector double va0; + register __vector double va1; + register __vector double va2; + register __vector double va3; + register __vector double vy_1; + register __vector double va0_1; + register __vector double va1_1; + register __vector double va2_1; + register __vector double va3_1; + register __vector double vy_2; + register __vector double va0_2; + register __vector double va1_2; + register __vector double va2_2; + register __vector double va3_2; + register __vector double vy_3; + register __vector double va0_3; + register __vector double va1_3; + register __vector double va2_3; + register __vector double va3_3; + + BLASLONG i = 0; + while (i < n) { + + vy_0 = vy[i]; + va0 = vptr_a0[i]; + va1 = vptr_a1[i]; + va2 = vptr_a2[i]; + va3 = vptr_a3[i]; + + vy_1 = vy[i + 1]; + va0_1 = vptr_a0[i + 1]; + va1_1 = vptr_a1[i + 1]; + va2_1 = vptr_a2[i + 1]; + va3_1 = vptr_a3[i + 1]; + + vy_2 = vy[i + 2]; + va0_2 = vptr_a0[i + 2]; + va1_2 = vptr_a1[i + 2]; + va2_2 = vptr_a2[i + 2]; + va3_2 = vptr_a3[i + 2]; + + vy_3 = vy[i + 3]; + va0_3 = vptr_a0[i + 3]; + va1_3 = vptr_a1[i + 3]; + va2_3 = vptr_a2[i + 3]; + va3_3 = vptr_a3[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + + vy_0 += va1*vx1_r; + vy_1 += va1_1*vx1_r; + vy_2 += va1_2*vx1_r; + vy_3 += va1_3*vx1_r; + + va0 = vec_xxpermdi(va0, va0, 2); + va0_1 = vec_xxpermdi(va0_1, va0_1, 2); + + + vy_0 += va2*vx2_r; + vy_1 += va2_1*vx2_r; + va0_2 = vec_xxpermdi(va0_2, va0_2, 2); + va0_3 = vec_xxpermdi(va0_3, va0_3, 2); + vy_2 += va2_2*vx2_r; + vy_3 += va2_3*vx2_r; + + va1 = vec_xxpermdi(va1, va1, 2); + va1_1 = 
vec_xxpermdi(va1_1, va1_1, 2); + + + vy_0 += va3*vx3_r; + vy_1 += va3_1*vx3_r; + + va1_2 = vec_xxpermdi(va1_2, va1_2, 2); + va1_3 = vec_xxpermdi(va1_3, va1_3, 2); + + vy_2 += va3_2*vx3_r; + vy_3 += va3_3*vx3_r; + + va2 = vec_xxpermdi(va2, va2, 2); + va2_1 = vec_xxpermdi(va2_1, va2_1, 2); + + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + + va2_2 = vec_xxpermdi(va2_2, va2_2, 2); + va2_3 = vec_xxpermdi(va2_3, va2_3, 2); + + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + va3 = vec_xxpermdi(va3, va3, 2); + va3_1 = vec_xxpermdi(va3_1, va3_1, 2); + + + vy_0 += va1*vx1_i; + vy_1 += va1_1*vx1_i; + + va3_2 = vec_xxpermdi(va3_2, va3_2, 2); + va3_3 = vec_xxpermdi(va3_3, va3_3, 2); + + vy_2 += va1_2*vx1_i; + vy_3 += va1_3*vx1_i; + + vy_0 += va2*vx2_i; + vy_1 += va2_1*vx2_i; + vy_2 += va2_2*vx2_i; + vy_3 += va2_3*vx2_i; + + vy_0 += va3*vx3_i; + vy_1 += va3_1*vx3_i; + vy_2 += va3_2*vx3_i; + vy_3 += va3_3*vx3_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + + i += 4; + + + } + +} +#else + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; + y[i] += a2[i] * x[4] - a2[i + 1] * x[5]; + y[i + 1] += a2[i] * x[5] + a2[i + 1] * x[4]; + y[i] += a3[i] * x[6] - a3[i + 1] * x[7]; + y[i + 1] += a3[i] * x[7] + a3[i + 1] * x[6]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; + y[i] += a2[i] * x[4] + a2[i + 1] * x[5]; + y[i + 1] += a2[i] * x[5] - a2[i + 1] * x[4]; + y[i] += a3[i] * x[6] + a3[i + 1] * x[7]; + y[i + 1] += a3[i] * x[7] - a3[i + 1] * x[6]; +#endif + } +} + +#endif + +#ifdef HAVE_KERNEL_4x2_VEC + +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + register __vector double vx1_r = {x[2], x[2]}; + register __vector double vx1_i = {-x[3], x[3]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; + register __vector double vx1_r = {x[2], -x[2]}; + register __vector double vx1_i = {x[3], x[3]}; +#endif + + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + register __vector double *vptr_a1 = (__vector double *) a1; + + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vy[i]; + register __vector double vy_1 = vy[i + 1]; + register __vector double vy_2 = vy[i + 2]; + register __vector double vy_3 = vy[i + 3]; + + register __vector double va0 = vptr_a0[i]; + register __vector double va0_1 = vptr_a0[i + 1]; + register __vector double va0_2 = vptr_a0[i + 2]; + register __vector double va0_3 = vptr_a0[i + 3]; + + register __vector double va1 = vptr_a1[i]; + register __vector double va1_1 = vptr_a1[i + 1]; + register __vector double va1_2 = vptr_a1[i + 2]; + register __vector double va1_3 = 
vptr_a1[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + va0 = vec_xxpermdi(va0, va0, 2); + va0_1 = vec_xxpermdi(va0_1, va0_1, 2); + va0_2 = vec_xxpermdi(va0_2, va0_2, 2); + va0_3 = vec_xxpermdi(va0_3, va0_3, 2); + + vy_0 += va1*vx1_r; + vy_1 += va1_1*vx1_r; + vy_2 += va1_2*vx1_r; + vy_3 += va1_3*vx1_r; + + va1 = vec_xxpermdi(va1, va1, 2); + va1_1 = vec_xxpermdi(va1_1, va1_1, 2); + va1_2 = vec_xxpermdi(va1_2, va1_2, 2); + va1_3 = vec_xxpermdi(va1_3, va1_3, 2); + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + vy_0 += va1*vx1_i; + vy_1 += va1_1*vx1_i; + vy_2 += va1_2*vx1_i; + vy_3 += va1_3*vx1_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + } +} +#else + +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; +#endif + } +} + +#endif + +#ifdef HAVE_KERNEL_4x1_VEC + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0; + a0 = ap; + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; +#endif + + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vy[i]; + register __vector double vy_1 = vy[i + 1]; + register __vector double vy_2 = vy[i + 2]; + register __vector double vy_3 = vy[i + 3]; + + register __vector double va0 = vptr_a0[i]; + register __vector double va0_1 = vptr_a0[i + 1]; + register __vector double va0_2 = vptr_a0[i + 2]; + register __vector double va0_3 = vptr_a0[i + 3]; + + register __vector double va0x = vec_xxpermdi(va0, va0, 2); + register __vector double va0x_1 = vec_xxpermdi(va0_1, va0_1, 2); + register __vector double va0x_2 = vec_xxpermdi(va0_2, va0_2, 2); + register __vector double va0x_3 = vec_xxpermdi(va0_3, va0_3, 2); + vy_0 += va0*vx0_r + va0x*vx0_i; + vy_1 += va0_1*vx0_r + va0x_1*vx0_i; + vy_2 += va0_2*vx0_r + va0x_2*vx0_i; + vy_3 += va0_3*vx0_r + va0x_3*vx0_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + } +} + +#else + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; +#endif + + } +} + +#endif + +#ifdef HAVE_KERNEL_ADDY + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + + 
+#if !defined(XCONJ) + + register __vector double valpha_r = {alpha_r, alpha_r}; + register __vector double valpha_i = {-alpha_i, alpha_i}; + +#else + register __vector double valpha_r = {alpha_r, -alpha_r}; + register __vector double valpha_i = {alpha_i, alpha_i}; +#endif + + register __vector double *vptr_src = (__vector double *) src; + if (inc_dest != 2) { + register __vector double *vptr_y = (__vector double *) dest; + //note that inc_dest is already 2x. so we should add it to double* + register __vector double *vptr_y1 = (__vector double *) (dest + inc_dest); + register __vector double *vptr_y2 = (__vector double *) (dest + 2 * inc_dest); + register __vector double *vptr_y3 = (__vector double *) (dest + 3 * inc_dest); + BLASLONG dest_t = 0; + BLASLONG add_dest = inc_dest << 1; //inc_dest is already multiplied by 2, so for vector 4 we just multiply 2 times + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vptr_y[dest_t]; + register __vector double vy_1 = vptr_y1[dest_t]; + register __vector double vy_2 = vptr_y2[dest_t]; + register __vector double vy_3 = vptr_y3[dest_t]; + + register __vector double vsrc = vptr_src[i]; + register __vector double vsrc_1 = vptr_src[i + 1]; + register __vector double vsrc_2 = vptr_src[i + 2]; + register __vector double vsrc_3 = vptr_src[i + 3]; + + vy_0 += vsrc*valpha_r; + vy_1 += vsrc_1*valpha_r; + vy_2 += vsrc_2*valpha_r; + vy_3 += vsrc_3*valpha_r; + + vsrc = vec_xxpermdi(vsrc, vsrc, 2); + vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2); + vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2); + vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2); + + vy_0 += vsrc*valpha_i; + vy_1 += vsrc_1*valpha_i; + vy_2 += vsrc_2*valpha_i; + vy_3 += vsrc_3*valpha_i; + + vptr_y[dest_t] = vy_0; + vptr_y1[dest_t ] = vy_1; + vptr_y2[dest_t] = vy_2; + vptr_y3[dest_t] = vy_3; + + dest_t += add_dest; + + } + + return; + } else { + register __vector double *vptr_y = (__vector double *) dest; + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vptr_y[i]; + register __vector double vy_1 = vptr_y[i + 1]; + register __vector double vy_2 = vptr_y[i + 2]; + register __vector double vy_3 = vptr_y[i + 3]; + + register __vector double vsrc = vptr_src[i]; + register __vector double vsrc_1 = vptr_src[i + 1]; + register __vector double vsrc_2 = vptr_src[i + 2]; + register __vector double vsrc_3 = vptr_src[i + 3]; + + vy_0 += vsrc*valpha_r; + vy_1 += vsrc_1*valpha_r; + vy_2 += vsrc_2*valpha_r; + vy_3 += vsrc_3*valpha_r; + + vsrc = vec_xxpermdi(vsrc, vsrc, 2); + vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2); + vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2); + vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2); + + vy_0 += vsrc*valpha_i; + vy_1 += vsrc_1*valpha_i; + vy_2 += vsrc_2*valpha_i; + vy_3 += vsrc_3*valpha_i; + + vptr_y[i] = vy_0; + vptr_y[i + 1 ] = vy_1; + vptr_y[i + 2] = vy_2; + vptr_y[i + 3] = vy_3; + + } + + return; + } + return; +} + +#else + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + + if (inc_dest != 2) { + + FLOAT temp_r; + FLOAT temp_i; + for (i = 0; i < n; i++) { +#if !defined(XCONJ) + temp_r = alpha_r * src[0] - alpha_i * src[1]; + temp_i = alpha_r * src[1] + alpha_i * src[0]; +#else + temp_r = alpha_r * src[0] + alpha_i * src[1]; + temp_i = -alpha_r * src[1] + alpha_i * src[0]; +#endif + + *dest += temp_r; + *(dest + 1) += temp_i; + + src += 2; + dest += inc_dest; + } + return; + } + + FLOAT temp_r0; + FLOAT temp_i0; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT temp_r2; + FLOAT temp_i2; + FLOAT temp_r3; + FLOAT 
temp_i3; + for (i = 0; i < n; i += 4) { +#if !defined(XCONJ) + temp_r0 = alpha_r * src[0] - alpha_i * src[1]; + temp_i0 = alpha_r * src[1] + alpha_i * src[0]; + temp_r1 = alpha_r * src[2] - alpha_i * src[3]; + temp_i1 = alpha_r * src[3] + alpha_i * src[2]; + temp_r2 = alpha_r * src[4] - alpha_i * src[5]; + temp_i2 = alpha_r * src[5] + alpha_i * src[4]; + temp_r3 = alpha_r * src[6] - alpha_i * src[7]; + temp_i3 = alpha_r * src[7] + alpha_i * src[6]; +#else + temp_r0 = alpha_r * src[0] + alpha_i * src[1]; + temp_i0 = -alpha_r * src[1] + alpha_i * src[0]; + temp_r1 = alpha_r * src[2] + alpha_i * src[3]; + temp_i1 = -alpha_r * src[3] + alpha_i * src[2]; + temp_r2 = alpha_r * src[4] + alpha_i * src[5]; + temp_i2 = -alpha_r * src[5] + alpha_i * src[4]; + temp_r3 = alpha_r * src[6] + alpha_i * src[7]; + temp_i3 = -alpha_r * src[7] + alpha_i * src[6]; +#endif + + dest[0] += temp_r0; + dest[1] += temp_i0; + dest[2] += temp_r1; + dest[3] += temp_i1; + dest[4] += temp_r2; + dest[5] += temp_i2; + dest[6] += temp_r3; + dest[7] += temp_i3; + + src += 8; + dest += 8; + } + return; +} +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT xbuffer[16] __attribute__((aligned(16))); + FLOAT *ybuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + ybuffer = buffer; + + inc_x *= 2; + inc_y *= 2; + lda *= 2; + + n1 = n / 8; + n2 = n % 8; + + m3 = m % 4; + m1 = m - (m % 4); + m2 = (m % NBMAX) - (m % 4); + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + a_ptr = a; + + x_ptr = x; + //zero_y(NB,ybuffer); + memset(ybuffer, 0, NB * 16); + + if (inc_x == 2) { + + for (i = 0; i < n1; i++) { + zgemv_kernel_4x8(NB, lda, a_ptr, x_ptr, ybuffer); + + a_ptr += lda << 3; + x_ptr += 16; + } + if (n2 & 4) { + zgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer); + + a_ptr += lda << 2; + x_ptr += 8; + } + + if (n2 & 2) { + zgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer); + x_ptr += 4; + a_ptr += 2 * lda; + + } + + if (n2 & 1) { + zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer); + x_ptr += 2; + a_ptr += lda; + + } + } else { + + for (i = 0; i < n1; i++) { + + xbuffer[0] = x_ptr[0]; + xbuffer[1] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + xbuffer[3] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[4] = x_ptr[0]; + xbuffer[5] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[6] = x_ptr[0]; + xbuffer[7] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[8] = x_ptr[0]; + xbuffer[9] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[10] = x_ptr[0]; + xbuffer[11] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[12] = x_ptr[0]; + xbuffer[13] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[14] = x_ptr[0]; + xbuffer[15] = x_ptr[1]; + x_ptr += inc_x; + zgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer); + + a_ptr += lda << 3; + } + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + xbuffer[1] = x_ptr[1]; + x_ptr += inc_x; + zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); + a_ptr += lda; + + } + + } + + add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i); + a += 2 * NB; + y_ptr += NB * inc_y; + } + + if (m3 == 0) return (0); + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r = 0.0; + FLOAT temp_i = 0.0; + + if (lda == 2 && inc_x == 2) { + + for (i = 0; i < (n & -2); i += 2) { +#if ( !defined(CONJ) && 
!defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3]; + temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2]; +#else + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3]; + temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2]; +#endif + + a_ptr += 4; + x_ptr += 4; + } + + for (; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; +#else + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; +#endif + + a_ptr += 2; + x_ptr += 2; + } + + } else { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; +#else + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; +#endif + + a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i1 = 0.0; + + if (lda == 4 && inc_x == 2) { + + for (i = 0; i < (n & -2); i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + + temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3]; + temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2]; + temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3]; + temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + + temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3]; + temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2]; + temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2]; +#endif + + a_ptr += 8; + x_ptr += 4; + } + + for (; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; +#endif + + a_ptr += 4; + x_ptr += 2; + } + + } else { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && 
defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; +#endif + + a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + return (0); + } + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i1 = 0.0; + FLOAT temp_r2 = 0.0; + FLOAT temp_i2 = 0.0; + + if (lda == 6 && inc_x == 2) { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; +#endif + + a_ptr += 6; + x_ptr += 2; + } + + } else { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; +#endif + + a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2; + y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2; +#else + y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * 
temp_r1; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2; + y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2; +#endif + return (0); + } + + return (0); +} + diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index 956d75ffc..d3bf60ca7 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -43,6 +43,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #elif HAVE_KERNEL_4x4_VEC +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); + + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector_quad acc0, acc1, acc2, acc3;; + __vector_quad acc4, acc5, acc6, acc7; + v4sf_t result[4]; + __vector_pair *Va0, *Va1, *Va2, *Va3; + i = 0; + n = n << 1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + __builtin_mma_xxsetaccz (&acc2); + __builtin_mma_xxsetaccz (&acc3); + __builtin_mma_xxsetaccz (&acc4); + __builtin_mma_xxsetaccz (&acc5); + __builtin_mma_xxsetaccz (&acc6); + __builtin_mma_xxsetaccz (&acc7); + while (i < n) { + + vec_t *rx = (vec_t *) & x[i]; + Va0 = ((__vector_pair*)((void*)&a0[i])); + Va1 = ((__vector_pair*)((void*)&a1[i])); + Va2 = ((__vector_pair*)((void*)&a2[i])); + Va3 = ((__vector_pair*)((void*)&a3[i])); + + __builtin_mma_xvf64gerpp (&acc0, Va0[0], rx[0]); + __builtin_mma_xvf64gerpp (&acc1, Va1[0], rx[0]); + __builtin_mma_xvf64gerpp (&acc2, Va2[0], rx[0]); + __builtin_mma_xvf64gerpp (&acc3, Va3[0], rx[0]); + __builtin_mma_xvf64gerpp (&acc4, Va0[0], rx[1]); + __builtin_mma_xvf64gerpp (&acc5, Va1[0], rx[1]); + __builtin_mma_xvf64gerpp (&acc6, Va2[0], rx[1]); + __builtin_mma_xvf64gerpp (&acc7, Va3[0], rx[1]); + __builtin_mma_xvf64gerpp (&acc0, Va0[1], rx[2]); + __builtin_mma_xvf64gerpp (&acc1, Va1[1], rx[2]); + __builtin_mma_xvf64gerpp (&acc2, Va2[1], rx[2]); + __builtin_mma_xvf64gerpp (&acc3, Va3[1], rx[2]); + __builtin_mma_xvf64gerpp (&acc4, Va0[1], rx[3]); + __builtin_mma_xvf64gerpp (&acc5, Va1[1], rx[3]); + __builtin_mma_xvf64gerpp (&acc6, Va2[1], rx[3]); + __builtin_mma_xvf64gerpp (&acc7, Va3[1], rx[3]); + i += 8; + + } +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + __builtin_mma_disassemble_acc ((void *)result, &acc0); + register FLOAT temp_r0 = result[0][0] - result[1][1]; + register FLOAT temp_i0 = result[0][1] + result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc4); + temp_r0 += result[2][0] - result[3][1]; + temp_i0 += result[2][1] + result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc1); + register FLOAT temp_r1 = result[0][0] - result[1][1]; + register FLOAT temp_i1 = result[0][1] + result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc5); + temp_r1 += result[2][0] - result[3][1]; + temp_i1 += result[2][1] + result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc2); + register FLOAT temp_r2 = result[0][0] - result[1][1]; + register FLOAT temp_i2 = result[0][1] + result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc6); + temp_r2 += result[2][0] - result[3][1]; + temp_i2 += result[2][1] + result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc3); + register FLOAT temp_r3 = result[0][0] - result[1][1]; + register FLOAT temp_i3 = result[0][1] + result[1][0]; + 
__builtin_mma_disassemble_acc ((void *)result, &acc7); + temp_r3 += result[2][0] - result[3][1]; + temp_i3 += result[2][1] + result[3][0]; +#else + __builtin_mma_disassemble_acc ((void *)result, &acc0); + register FLOAT temp_r0 = result[0][0] + result[1][1]; + register FLOAT temp_i0 = result[0][1] - result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc4); + temp_r0 += result[2][0] + result[3][1]; + temp_i0 += result[2][1] - result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc1); + register FLOAT temp_r1 = result[0][0] + result[1][1]; + register FLOAT temp_i1 = result[0][1] - result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc5); + temp_r1 += result[2][0] + result[3][1]; + temp_i1 += result[2][1] - result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc2); + register FLOAT temp_r2 = result[0][0] + result[1][1]; + register FLOAT temp_i2 = result[0][1] - result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc6); + temp_r2 += result[2][0] + result[3][1]; + temp_i2 += result[2][1] - result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc3); + register FLOAT temp_r3 = result[0][0] + result[1][1]; + register FLOAT temp_i3 = result[0][1] - result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc7); + temp_r3 += result[2][0] + result[3][1]; + temp_i3 += result[2][1] - result[3][0]; +#endif +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; +#endif +} +#else static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; FLOAT *a0, *a1, *a2, *a3; @@ -198,6 +326,7 @@ static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA #endif } +#endif #else static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { From 2379abaa5e6d559d58735f43e10d68c192724ea2 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 13 Apr 2021 22:30:06 -0500 Subject: [PATCH 180/681] POWER10: Improve dgemm performance This patch uses vector pair pointer for input load operation which helps to generate power10 lxvp instructions. 
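As a minimal sketch of the difference (hypothetical function names;
assumes GCC with -mcpu=power10 and MMA support, not OpenBLAS code): a
direct load through a __vector_pair pointer describes one 32-byte access
that the compiler can emit as a single lxvp, whereas assembling the pair
from two vec_t loads generally costs two lxv instructions plus the
assemble step:

    #include <altivec.h>

    typedef __vector unsigned char vec_t;

    /* One 32-byte access: eligible for a single lxvp. */
    void load_pair (__vector_pair *dst, double *src)
    {
      *dst = *(__vector_pair *)(void *)src;
    }

    /* Two 16-byte loads plus an explicit assemble step. */
    void assemble_pair (__vector_pair *dst, double *src)
    {
      vec_t *rb = (vec_t *)(void *)src;
      __builtin_vsx_assemble_pair (dst, rb[1], rb[0]);
    }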
--- kernel/power/dgemm_kernel_power10.c | 48 +++++++++++------------------ 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index e918e61c3..cdd846891 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -190,10 +190,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7; BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; - vec_t *rb = (vec_t *) & BO[0]; __vector_pair rowB, rowB1; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -205,9 +204,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 3]; - rb = (vec_t *) & BO[l << 3]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -247,9 +245,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -257,9 +254,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 2]; - rb = (vec_t *) & BO[l << 3]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -291,17 +287,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; - rb = (vec_t *) & BO[l << 3]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); } @@ -403,8 +397,7 @@ CNAME (BLASLONG 
m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); @@ -412,8 +405,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 3]; - rb = (vec_t *) & BO[l << 2]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); @@ -445,15 +437,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 2]; - rb = (vec_t *) & BO[l << 2]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); } @@ -481,14 +471,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; - rb = (vec_t *) & BO[l << 2]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } SAVE_ACC (&acc0, 0); From 623d580b4cabaf79778fc1d1d968780405ba8399 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Apr 2021 10:27:32 +0200 Subject: [PATCH 181/681] Restore __volatile__ keyword --- driver/others/dynamic_arm64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 6c68ba98a..0b623c3ac 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -126,7 +126,7 @@ extern void openblas_warning(int verbose, const char * msg); #endif #define get_cpu_ftr(id, var) ({ \ - __asm__ ("mrs %0, "#id : "=r" (var)); \ + __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ }) static char *corename[] = { From 751d127d7c25b7756c009397c19e3dc6e1165b8d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 Apr 2021 22:26:34 +0200 Subject: [PATCH 182/681] Include cblas_test.h to achieve int/long size change with INTERFACE64 --- ctest/constant.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ctest/constant.c b/ctest/constant.c index 861d70bcc..5a26a4dde 100644 --- a/ctest/constant.c +++ b/ctest/constant.c @@ -1,3 +1,4 @@ +#include "cblas_test.h" int CBLAS_CallFromC; int RowMajorStrg; From 94a5a1f0f1571d377137cbcf7b6249d559541559 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 Apr 2021 22:27:08 +0200 Subject: [PATCH 183/681] Add OSX build variations to Azure CI --- 
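Note on the INTERFACE64=1 jobs added below: they exercise the int/long
width change from the previous patch. A minimal sketch of the pitfall,
with a hypothetical typedef (the actual ctest code selects the width by
including cblas_test.h):

    /* Every translation unit must agree on the BLAS integer width. */
    #ifdef INTERFACE64
    typedef long long blasint_t;   /* 64-bit integer interface */
    #else
    typedef int blasint_t;         /* 32-bit integer interface */
    #endif

    /* Shared test globals: if one file is built without the header
       that picks the width, its reads and writes of these globals
       disagree in size with the definitions in constant.c. */
    extern blasint_t CBLAS_CallFromC;
    extern blasint_t RowMajorStrg;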
azure-pipelines.yml | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index fdf184b22..68e48437f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -74,7 +74,27 @@ jobs: steps: - script: | brew update - make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 CC=gcc-10 FC=gfortran-10 - - - + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 + +- job: OSX_GCC_Nothreads + pool: + vmImage: 'macOS-10.15' + steps: + - script: | + brew update + make USE_THREADS=0 CC=gcc-10 FC=gfortran-10 + +- job: OSX_OpenMP_Clang + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + LIBRARY_PATH: /usr/local/opt/llvm/lib + steps: + - script: | + brew update + brew install llvm libomp + brew tap LouisBrunner/valgrind + brew install --HEAD LouisBrunner/valgrind/valgrind + make TARGET=SANDYBRIDGE NO_AVX512=1 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 DYNAMIC_LIST=SANDYBRIDGE DEBUG=1 NO_PARALLEL_MAKE=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10 + cd ctest; OMP_NUM_THREADS=1 valgrind ./xscblat2 Date: Thu, 22 Apr 2021 02:11:20 +0200 Subject: [PATCH 184/681] Add mixed clang/ifort build on OSX to Azure CI (#3185) * Add mixed clang/ifort build on OSX to the Azure CI config based on https://github.com/oneapi-src/oneapi-ci (and remove debugging tools from the clang+gfortran job) * Remove extraneous libgfortran dependency of ifort builds * remove FEXTRALIB from link line of shared library as ifort keeps track of dependencies (and they are different for a .dylib than what f_check got for an executable) --- azure-pipelines.yml | 49 ++++++++++++++++++++++++++++++++++++++++----- exports/Makefile | 4 ++++ f_check | 4 ---- 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 68e48437f..56a3fd4ae 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -4,7 +4,15 @@ trigger: branches: include: - develop - +resources: + containers: + - container: oneapi-hpckit + image: intel/oneapi-hpckit:latest + options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so' + - container: oneapi-basekit + image: intel/oneapi-basekit:latest + options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so' + jobs: # manylinux1 is useful to test because the # standard Docker container uses an old version @@ -94,7 +102,38 @@ jobs: - script: | brew update brew install llvm libomp - brew tap LouisBrunner/valgrind - brew install --HEAD LouisBrunner/valgrind/valgrind - make TARGET=SANDYBRIDGE NO_AVX512=1 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 DYNAMIC_LIST=SANDYBRIDGE DEBUG=1 NO_PARALLEL_MAKE=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10 - cd ctest; OMP_NUM_THREADS=1 valgrind ./xscblat2 Date: Mon, 26 Apr 2021 21:55:30 +0200 Subject: [PATCH 185/681] replace spurious avx512 requirement with fma check --- kernel/x86_64/drot_microk_haswell-2.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/x86_64/drot_microk_haswell-2.c b/kernel/x86_64/drot_microk_haswell-2.c index 72a87696e..cc5949b1a 100644 --- a/kernel/x86_64/drot_microk_haswell-2.c +++ b/kernel/x86_64/drot_microk_haswell-2.c @@ -1,6 +1,4 @@ -/* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) 
|| (defined(__clang__) && __clang_major__ >= 9)) - +#if defined(HAVE_FMA3) && defined(HAVE_AVX2) #define HAVE_DROT_KERNEL 1 #include From 0608bc5d82d780cf81f27e0297af37814cfd73dc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Apr 2021 22:32:23 +0200 Subject: [PATCH 186/681] delay creation of the softlink until after the library has been created --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index de0735c4a..555d1c467 100644 --- a/Makefile +++ b/Makefile @@ -167,7 +167,6 @@ ifeq ($(NO_SHARED), 1) $(error OpenBLAS: neither static nor shared are enabled.) endif endif - @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) @for d in $(SUBDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ @@ -196,6 +195,7 @@ endif ifdef USE_THREAD @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last endif + @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) @touch lib.grd prof : prof_blas prof_lapack From ceb44bef1485bb664cee97113f8486d74ac59443 Mon Sep 17 00:00:00 2001 From: damonyu Date: Tue, 27 Apr 2021 11:12:29 +0800 Subject: [PATCH 187/681] update the intrinsic API to the official names. --- kernel/riscv64/amax_vector.c | 97 +++++++++++-------- kernel/riscv64/amin_vector.c | 98 +++++++++++-------- kernel/riscv64/asum_vector.c | 88 +++++++++-------- kernel/riscv64/axpby_vector.c | 86 ++++++++--------- kernel/riscv64/axpy_vector.c | 46 +++++---- kernel/riscv64/copy_vector.c | 38 ++++---- kernel/riscv64/dot_vector.c | 97 ++++++++++--------- kernel/riscv64/gemv_n_vector.c | 38 ++++---- kernel/riscv64/gemv_t_vector.c | 84 ++++++++-------- kernel/riscv64/iamax_vector.c | 133 +++++++++++++------------ kernel/riscv64/iamin_vector.c | 133 +++++++++++++------------ kernel/riscv64/imax_vector.c | 117 +++++++++++----------- kernel/riscv64/imin_vector.c | 117 +++++++++++----------- kernel/riscv64/izamax_vector.c | 112 +++++++++++---------- kernel/riscv64/izamin_vector.c | 114 ++++++++++++---------- kernel/riscv64/max_vector.c | 72 +++++++------- kernel/riscv64/min_vector.c | 72 +++++++------- kernel/riscv64/nrm2_vector.c | 135 ++++++++++++++------------ kernel/riscv64/nrm2_vector_dot.c | 75 +++++++------- kernel/riscv64/rot_vector.c | 56 +++++------ kernel/riscv64/scal_vector.c | 48 ++++----- kernel/riscv64/swap_vector.c | 44 ++++----- kernel/riscv64/symv_L_vector.c | 112 ++++++++++----------- kernel/riscv64/symv_U_vector.c | 116 +++++++++++----------- kernel/riscv64/zamax_vector.c | 76 ++++++++------- kernel/riscv64/zamin_vector.c | 77 ++++++++------- kernel/riscv64/zasum_vector.c | 90 +++++++++-------- kernel/riscv64/zaxpby_vector.c | 54 +++++------ kernel/riscv64/zaxpy_vector.c | 30 +++--- kernel/riscv64/zcopy_vector.c | 22 ++--- kernel/riscv64/zdot_vector.c | 78 ++++++++------- kernel/riscv64/zgemv_n_vector.c | 38 ++++---- kernel/riscv64/zgemv_t_vector.c | 69 +++++++------ kernel/riscv64/zhemv_LM_vector.c | 79 ++++++++------- kernel/riscv64/zhemv_UV_vector.c | 79 ++++++++------- kernel/riscv64/znrm2_vector.c | 161 ++++++++++++++++--------------- kernel/riscv64/zrot_vector.c | 46 ++++----- kernel/riscv64/zscal_vector.c | 44 ++++----- kernel/riscv64/zswap_vector.c | 36 +++---- 39 files changed, 1628 insertions(+), 1479 deletions(-) diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index b6aec131e..5312f9ef0 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
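The renaming below follows the official RISC-V vector intrinsics naming rather than the early draft spellings used so far: vector types gain a leading v (float32xm8_t becomes vfloat32m8_t), mask types are named by their SEW/LMUL ratio (e32xm8_t becomes vbool4_t), and each intrinsic carries its element width and LMUL as a suffix. A minimal before/after sketch of the pattern, with names taken from the hunks that follow (still-newer toolchains additionally prefix these intrinsics with __riscv_):

    /* old draft style */
    gvl = vsetvli(n, RVV_E32, RVV_M8);
    float32xm8_t v = vlev_float32xm8(&x[0], gvl);

    /* official style used throughout this patch */
    gvl = vsetvl_e32m8(n);
    vfloat32m8_t v = vle_v_f32m8(&x[0], gvl);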
#include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,19 +66,25 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + FLOAT_V_T_M1 v_res, v_zero; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_zero = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; FLOAT zero = 0.0; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ BLASLONG inc_xv = inc_x * gvl; @@ -162,6 +175,7 @@ asm volatile( //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); #if defined(DOUBLE) asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e64,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -170,6 +184,7 @@ asm volatile( :"v0"); #else asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e32,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -185,6 +200,7 @@ asm volatile( //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); #if defined(DOUBLE) asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e64,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -193,6 +209,7 @@ asm volatile( :"v0"); #else asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e32,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -205,17 +222,17 @@ asm volatile( j += gvl*2; ix += inc_xv*2; } - v0 = 
VFMVVF_FLOAT(0, gvl); - v0 = VFREDMAXVS_FLOAT(v_max, v0, gvl); - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); + maxf = v_res[0]; } for(;j maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } } diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index 53243ad56..ae2867ef8 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -30,29 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,11 +66,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); MASK_T mask0, mask1; - FLOAT zero = 0.0; + FLOAT zero = 0.0; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 
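One convention change in these hunks is easy to miss: masked intrinsics now take the mask as the first operand and carry an _m suffix, where the old spelling passed the mask next to last. The kernels rely on this for absolute values, using a masked reverse-subtract that negates only the negative lanes; a sketch with the macro names used throughout the patch:

    mask = VMFLTVF_FLOAT(vx, 0, gvl);                 /* lanes where vx < 0     */
    vx   = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); /* vx = 0 - vx there only */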
+#define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -61,39 +65,43 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); vy = VLEV_FLOAT(&y[j], gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else if(inc_y == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_x = inc_x * sizeof(FLOAT); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); vy = VLEV_FLOAT(&y[j], gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = 
VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int stride_y = inc_y * sizeof(FLOAT); @@ -150,20 +156,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) j += gvl; } if(j > 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } } return(dot); diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index bd4d23eae..32ca8618b 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -27,23 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLEV_FLOAT vlev_float32xm4 -#define VLSEV_FLOAT vlsev_float32xm4 -#define VSEV_FLOAT vsev_float32xm4 -#define VSSEV_FLOAT vssev_float32xm4 -#define VFMACCVF_FLOAT vfmaccvf_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle_v_f32m4 +#define VLSEV_FLOAT vlse_v_f32m4 +#define VSEV_FLOAT vse_v_f32m4 +#define VSSEV_FLOAT vsse_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLEV_FLOAT vlev_float64xm4 -#define VLSEV_FLOAT vlsev_float64xm4 -#define VSEV_FLOAT vsev_float64xm4 -#define VSSEV_FLOAT vssev_float64xm4 -#define VFMACCVF_FLOAT vfmaccvf_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle_v_f64m4 +#define VLSEV_FLOAT vlse_v_f64m4 +#define VSEV_FLOAT vse_v_f64m4 +#define VSSEV_FLOAT vsse_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -57,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT_V_T va0, va1, vy0, vy1; unsigned int gvl = 0; if(inc_y == 1){ - gvl = vsetvli(m, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m); if(gvl <= m/2){ for(k=0,j=0; k maxf){ //tail index v_max_index = VIDV_UINT(gvl); @@ -135,7 +142,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -145,35 +152,33 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, 
v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index 608f19a00..5bcffece5 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -32,49 +32,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define 
VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -89,42 +93,45 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); @@ -136,7 +143,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -146,35 +153,33 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, 
v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index 44af7101b..42705f5de 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -32,45 +32,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT 
vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -85,8 +89,13 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_min; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_max_index = VMVVX_UINT(0, gvl); v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ @@ -94,27 +103,25 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max = VLEV_FLOAT(&x[j], gvl); - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); @@ -126,7 +133,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -137,28 +144,26 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index e6e0e9f9f..3afa74dd6 100644 --- 
a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -32,45 +32,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -85,15 +89,20 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - 
v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -113,26 +122,24 @@ asm volatile( :"v0"); #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min = VLEV_FLOAT(&x[j], gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); @@ -143,7 +150,7 @@ asm volatile( } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -154,7 +161,7 @@ asm volatile( //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -175,27 +182,25 @@ asm volatile( #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c index 62c95d973..ddb5eabde 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -30,47 +30,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
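The index-search kernels above and below (iamax/iamin, imax/imin, and the complex izamax here) all locate the extreme element the same way: a masked vid.v stamps each lane's index into the index vector only where a new extreme appeared, the strip-mining offset j is added under the same mask, and after the loop the first lane holding the reduced extreme is found with vmfirst. Condensed from imax_vector.c, one loop iteration plus the epilogue:

    mask        = VMFLTVV_FLOAT(v_max, vx, gvl);      /* lanes with a new max */
    v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
    v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
    v_max       = VFMAXVV_FLOAT(v_max, vx, gvl);
    /* after the loop: */
    mask      = VMFGEVF_FLOAT(v_max, maxf, gvl);
    max_index = v_max_index[VMFIRSTM(mask, gvl)];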
#if defined(DOUBLE) -#define RVV_EFLOAT RVV_E64 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif #define RVV_M RVV_M8 @@ -86,7 +92,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_max_index; MASK_T mask0, mask1; unsigned int gvl = 0; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + gvl = VSETVL(n); v_max_index = 
VMVVX_UINT(0, gvl); v_max = VFMVVF_FLOAT(-1, gvl); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -96,7 +107,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -119,7 +130,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -143,7 +154,7 @@ asm volatile( //index where element greater than v_max mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask0, gvl); + v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -163,7 +174,7 @@ asm volatile( :"v0"); #endif */ - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask0, gvl); + v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); @@ -171,19 +182,19 @@ asm volatile( ix += inc_xv; } vx0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); - maxf = vx0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + maxf = v_res[0]; mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask0,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max_index = VMVVX_UINT(0, gvl); vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -206,7 +217,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -227,9 +238,8 @@ asm volatile( #endif */ v_max = VFADDVV_FLOAT(vx0, vx1, gvl); - vx0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); - FLOAT cur_maxf = vx0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 38eccf1b5..6e328dc31 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -31,50 +31,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
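For the complex variants (izamax above, izamin below) the element magnitude is the BLAS convention |re| + |im| rather than a true modulus: real and imaginary parts are read as two strided streams over the interleaved storage (stride_x = inc_x * 2 * sizeof(FLOAT)), each made non-negative with the masked reverse-subtract, then summed before the usual search. A sketch with these files' macro names:

    vx0   = VLSEV_FLOAT(&x[ix],     stride_x, gvl);   /* real parts      */
    vx1   = VLSEV_FLOAT(&x[ix + 1], stride_x, gvl);   /* imaginary parts */
    mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
    vx0   = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
    mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
    vx1   = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
    v_max = VFADDVV_FLOAT(vx0, vx1, gvl);             /* |re| + |im|     */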
#if defined(DOUBLE) -#define RVV_EFLOAT RVV_E64 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif -#define RVV_M RVV_M8 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -87,7 +92,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask0, mask1; unsigned int gvl = 0; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = 
VFMVVF_FLOAT_M1(FLT_MAX, gvl); + + gvl = VSETVL(n); v_min_index = VMVVX_UINT(0, gvl); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -97,7 +107,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -120,7 +130,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -144,7 +154,7 @@ asm volatile( //index where element less than v_min mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask0, gvl); + v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -164,27 +174,26 @@ asm volatile( :"v0"); #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask0, gvl); + v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, gvl); j += gvl; ix += inc_xv; } - vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); - vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); - minf = vx0[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask0,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min_index = VMVVX_UINT(0, gvl); vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -207,7 +216,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -228,9 +237,8 @@ asm volatile( #endif */ v_min = VFADDVV_FLOAT(vx0, vx1, gvl); - vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); - vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); - FLOAT cur_minf = vx0[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index 4ef75452d..0fc59b74c 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -29,23 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
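The recurring v_res setup in these kernels is the other half of the reduction change: the official vfred* intrinsics write their scalar result into an LMUL=1 register, so each kernel now allocates two m1 temporaries up front (sized with vsetvlmax_e*m1) and seeds the scalar operand with the reduction's identity, FLT_MAX for a minimum, -FLT_MAX or 0 for a maximum. The minimum case, condensed from izamin above and min_vector.c below:

    FLOAT_V_T_M1 v_res, v_max;
    gvl   = VSETVL_MAX;                     /* vsetvlmax_e32m1(): m1 temps only */
    v_res = VFMVVF_FLOAT_M1(0, gvl);
    v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);  /* identity element for a minimum   */
    /* ... strip-mined loop producing v_min ... */
    v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
    minf  = v_res[0];                       /* scalar lands in lane 0           */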
#include #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -55,9 +59,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=-FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + FLOAT_V_T_M1 v_res, v_min; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); @@ -96,17 +102,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl * 2; idx += inc_xv * 2; } - v1 = VFMVVF_FLOAT(-FLT_MAX, gvl); - v0 = VFREDMAXVS_FLOAT(v_max, v1, gvl); - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; } for(;j maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } } diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index 83c965bfa..8223fa87a 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -29,23 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
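max_vector.c above and min_vector.c below share the strip-mining shape used across the series: when a full vector fits at least twice, the main loop consumes two gvl-sized chunks per iteration, and the remainder is handled by re-issuing vsetvl with the count that is left, so no scalar tail loop is needed. A condensed sketch of the max kernel's contiguous path (loop bounds paraphrased):

    gvl = VSETVL(n);
    if (gvl <= n / 2) {
        v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
        for (i = 0, j = 0; i < n / (2 * gvl); i++) {
            v0    = VLEV_FLOAT(&x[j], gvl);
            v1    = VLEV_FLOAT(&x[j + gvl], gvl);
            v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
            v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
            j += 2 * gvl;
        }
        /* reduce v_max into maxf via VFREDMAXVS_FLOAT */
    }
    for (; j < n;) {
        gvl = VSETVL(n - j);                /* shrink vl to the remainder */
        v0  = VLEV_FLOAT(&x[j], gvl);
        /* reduce v0 and merge into maxf */
        j += gvl;
    }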
#include <math.h> #include <float.h> #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -55,9 +59,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VSEV_FLOAT vsev_float32xm8 -#define VSSEV_FLOAT vssev_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VSEV_FLOAT vse_v_f32m8 +#define VSSEV_FLOAT vsse_v_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VSEV_FLOAT vsev_float64xm8 -#define VSSEV_FLOAT vssev_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VSEV_FLOAT vse_v_f64m8 +#define VSSEV_FLOAT vsse_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if (n < 0) return(0); if(inc_x == 1 && inc_y == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ for(i=0,j=0; i 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < len / gvl; k++){ va = VLEV_FLOAT(&a_ptr[i], gvl); @@ -89,11 +97,10 @@ BLASLONG CNAME? int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res =
VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -101,9 +108,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += alpha * temp2; @@ -121,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i = j + 1; len = m - i; if(len > 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < len / gvl; k++){ @@ -136,11 +142,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -148,9 +153,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += alpha * temp2; @@ -169,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i = j + 1; len = m - i; if(len > 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); vr = VFMVVF_FLOAT(0, gvl); inc_xv = inc_x * gvl; for(k = 0; k < len / gvl; k++){ @@ -184,11 +188,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; ix += inc_xv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -196,9 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += alpha * temp2; @@ -220,7 +222,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i = j + 1; len = m - i; if(len > 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); inc_xv = inc_x * gvl; inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); @@ -237,11 +239,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -249,9 +250,8 @@ int CNAME(BLASLONG m, BLASLONG 
offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += alpha * temp2; diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c index 29e0e4b65..7229a48b1 100644 --- a/kernel/riscv64/symv_U_vector.c +++ b/kernel/riscv64/symv_U_vector.c @@ -27,33 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLEV_FLOAT vlev_float32xm4 -#define VLSEV_FLOAT vlsev_float32xm4 -#define VSEV_FLOAT vsev_float32xm4 -#define VSSEV_FLOAT vssev_float32xm4 -#define VFREDSUM_FLOAT vfredsumvs_float32xm4 -#define VFMACCVV_FLOAT vfmaccvv_float32xm4 -#define VFMACCVF_FLOAT vfmaccvf_float32xm4 -#define VFMVVF_FLOAT vfmvvf_float32xm4 -#define VFDOTVV_FLOAT vfdotvv_float32xm4 -#define VFMULVV_FLOAT vfmulvv_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m4 +#define VLSEV_FLOAT vlse_v_f32m4 +#define VSEV_FLOAT vse_v_f32m4 +#define VSSEV_FLOAT vsse_v_f32m4 +#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFDOTVV_FLOAT vfdot_vv_f32m4 +#define VFMULVV_FLOAT vfmul_vv_f32m4 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLEV_FLOAT vlev_float64xm4 -#define VLSEV_FLOAT vlsev_float64xm4 -#define VSEV_FLOAT vsev_float64xm4 -#define VSSEV_FLOAT vssev_float64xm4 -#define VFREDSUM_FLOAT vfredsumvs_float64xm4 -#define VFMACCVV_FLOAT vfmaccvv_float64xm4 -#define VFMACCVF_FLOAT vfmaccvf_float64xm4 -#define VFMVVF_FLOAT vfmvvf_float64xm4 -#define VFDOTVV_FLOAT vfdotvv_float64xm4 -#define VFMULVV_FLOAT vfmulvv_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m4 +#define VLSEV_FLOAT vlse_v_f64m4 +#define VSEV_FLOAT vse_v_f64m4 +#define VSSEV_FLOAT vsse_v_f64m4 +#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFDOTVV_FLOAT vfdot_vv_f64m4 +#define VFMULVV_FLOAT vfmul_vv_f64m4 #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -65,6 +69,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA FLOAT temp2; FLOAT *a_ptr = a; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); FLOAT_V_T va, vx, vy, vr; BLASLONG stride_x, stride_y, inc_xv, inc_yv; @@ -78,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA temp2 = 0.0; if(j > 0){ i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < j / gvl; k++){ vy = VLEV_FLOAT(&y[i], gvl); @@ -91,11 +99,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT 
*a, BLASLONG lda, FLOA i += gvl; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -103,9 +110,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += temp1 * a_ptr[j] + alpha * temp2; @@ -122,7 +128,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA if(j > 0){ iy = 0; i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < j / gvl; k++){ @@ -137,11 +143,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -149,9 +154,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += temp1 * a_ptr[j] + alpha * temp2; @@ -169,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA if(j > 0){ ix = 0; i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_xv = inc_x * gvl; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < j / gvl; k++){ @@ -184,11 +188,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; ix += inc_xv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -196,9 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += temp1 * a_ptr[j] + alpha * temp2; @@ -219,7 +221,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix = 0; iy = 0; i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_xv = inc_x * gvl; inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); @@ -236,11 +238,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLSEV_FLOAT(&y[iy], stride_y, 
gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -248,9 +249,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += temp1 * a_ptr[j] + alpha * temp2; diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index a6c742b14..5cd65b225 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,19 +66,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_max = VFMVVF_FLOAT(0, gvl); BLASLONG inc_xv = inc_x * gvl * 2; for(; i maxf) - maxf = v_max[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v1, v_z0, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; } return(maxf); } diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index 44a7cf1dc..9d567b3da 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -30,29 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -63,18 +67,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); BLASLONG inc_xv = inc_x * gvl * 2; for(; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 -#define MASK_T e64xm8_t -#define 
VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -61,40 +65,44 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; if(inc_x == 1){ BLASLONG n2 = n * 2; - gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n2); v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n2/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); inc_xv = incx * gvl * 2; inc_yv = incy * gvl * 2; inc_av = gvl * 2; @@ -134,13 +141,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 = vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 = vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 = v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -173,11 +179,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); #endif - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 += vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 += vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 += v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 += v_res[0]; } } y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c index 6fe12c76c..40cd9cd64 100644 --- a/kernel/riscv64/zhemv_UV_vector.c +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -27,31 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLSEV_FLOAT vlsev_float32xm4 -#define VSSEV_FLOAT vssev_float32xm4 -#define VFREDSUM_FLOAT vfredsumvs_float32xm4 -#define VFMACCVV_FLOAT vfmaccvv_float32xm4 -#define VFMACCVF_FLOAT vfmaccvf_float32xm4 -#define VFMVVF_FLOAT vfmvvf_float32xm4 -#define VFMULVV_FLOAT vfmulvv_float32xm4 -#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 -#define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m4 +#define VSSEV_FLOAT vsse_v_f32m4 +#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLSEV_FLOAT vlsev_float64xm4 -#define VSSEV_FLOAT vssev_float64xm4 -#define VFREDSUM_FLOAT vfredsumvs_float64xm4 -#define VFMACCVV_FLOAT vfmaccvv_float64xm4 -#define VFMACCVF_FLOAT vfmaccvf_float64xm4 -#define VFMVVF_FLOAT vfmvvf_float64xm4 -#define VFMULVV_FLOAT vfmulvv_float64xm4 -#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 -#define VFNMSACVV_FLOAT vfnmsacvv_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m4 +#define VSSEV_FLOAT vsse_v_f64m4 +#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -62,7 +66,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B FLOAT temp_r2, temp_i2; FLOAT *a_ptr = a; unsigned int gvl = 0; - + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; @@ -89,7 +96,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B ia = 0; i = 0; if(j > 0){ - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_xv = incx * gvl * 2; inc_yv = incy * gvl * 2; inc_av = gvl * 2; @@ -133,13 +140,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 = vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 = vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 = v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -172,11 +178,10 @@ int CNAME(BLASLONG m, BLASLONG 
offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); #endif - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 += vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 += vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 += v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 += v_res[0]; } } y[jy] += temp_r1 * a_ptr[ja]; diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c index b0ebfa5f4..5ac62eb80 100644 --- a/kernel/riscv64/znrm2_vector.c +++ b/kernel/riscv64/znrm2_vector.c @@ -27,41 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLEV_FLOAT vlev_float32xm4 -#define VLSEV_FLOAT vlsev_float32xm4 -#define VFREDSUM_FLOAT vfredsumvs_float32xm4 -#define VFMACCVV_FLOAT vfmaccvv_float32xm4 -#define VFMVVF_FLOAT vfmvvf_float32xm4 -#define VFDOTVV_FLOAT vfdotvv_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m4 +#define VLSEV_FLOAT vlse_v_f32m4 +#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFDOTVV_FLOAT vfdot_vv_f32m4 #define ABS fabsf -#define MASK_T e32xm4_t -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 -#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 -#define VMFIRSTM vmfirstm_e32xm4 -#define VFDIVVF_FLOAT vfdivvf_float32xm4 -#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 +#define MASK_T vbool8_t +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m +#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8 +#define VMFIRSTM vmfirst_m_b8 +#define VFDIVVF_FLOAT vfdiv_vf_f32m4 +#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLEV_FLOAT vlev_float64xm4 -#define VLSEV_FLOAT vlsev_float64xm4 -#define VFREDSUM_FLOAT vfredsumvs_float64xm4 -#define VFMACCVV_FLOAT vfmaccvv_float64xm4 -#define VFMVVF_FLOAT vfmvvf_float64xm4 -#define VFDOTVV_FLOAT vfdotvv_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m4 +#define VLSEV_FLOAT vlse_v_f64m4 +#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFDOTVV_FLOAT vfdot_vv_f64m4 #define ABS fabs -#define MASK_T e64xm4_t -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 -#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 -#define VMFIRSTM vmfirstm_e64xm4 -#define VFDIVVF_FLOAT vfdivvf_float64xm4 -#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 +#define MASK_T vbool16_t +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m +#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16 +#define VMFIRSTM vmfirst_m_b16 +#define VFDIVVF_FLOAT vfdiv_vf_f64m4 +#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -73,19 +77,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT_V_T vr, v0, 
v_zero; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT scale = 0.0, ssq = 0.0; MASK_T mask; BLASLONG index = 0; if(inc_x == 1){ BLASLONG n2 = n * 2; - gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n2); vr = VFMVVF_FLOAT(0, gvl); v_zero = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VSEV_FLOAT vsev_float32xm8 -#define VSSEV_FLOAT vssev_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VSEV_FLOAT vse_v_f32m8 +#define VSSEV_FLOAT vsse_v_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VSEV_FLOAT vsev_float64xm8 -#define VSSEV_FLOAT vssev_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VSEV_FLOAT vse_v_f64m8 +#define VSSEV_FLOAT vsse_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm if (n < 0) return(0); if(inc_x == 1 && inc_y == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG n2 = n * 2; if(gvl <= n2/2){ for(i=0,j=0; i Date: Tue, 27 Apr 2021 12:52:49 +0200 Subject: [PATCH 188/681] Apply fixes from Reference-LAPACK PR538 --- lapack-netlib/TESTING/LIN/cdrvgex.f | 7 ++++--- lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f | 10 +++++----- lapack-netlib/TESTING/LIN/cdrvrfp.f | 8 ++++---- lapack-netlib/TESTING/LIN/ddrvrfp.f | 3 +-- lapack-netlib/TESTING/LIN/sdrvrfp.f | 2 +- lapack-netlib/TESTING/LIN/zdrvgex.f | 7 ++++--- lapack-netlib/TESTING/LIN/zdrvhex.f | 8 ++++---- lapack-netlib/TESTING/LIN/zdrvpox.f | 8 ++++---- lapack-netlib/TESTING/LIN/zdrvrfp.f | 2 +- lapack-netlib/TESTING/LIN/zdrvsyx.f | 8 ++++---- lapack-netlib/TESTING/LIN/zerrvxx.f | 2 +- 11 files changed, 33 insertions(+), 32 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/cdrvgex.f b/lapack-netlib/TESTING/LIN/cdrvgex.f index 51fc84899..9b075908f 100644 --- a/lapack-netlib/TESTING/LIN/cdrvgex.f +++ b/lapack-netlib/TESTING/LIN/cdrvgex.f @@ -707,9 +707,10 @@ CALL CLACPY( 'Full', N, NRHS, BSAV, LDA, B, LDA ) IF( .NOT.PREFAC ) - $ CALL CLASET( 'Full', N, N, ZERO, ZERO, AFAC, - $ LDA ) - CALL CLASET( 'Full', N, NRHS, ZERO, ZERO, X, LDA ) + $ CALL CLASET( 'Full', N, N, CMPLX( ZERO ), + $ CMPLX( ZERO ), AFAC, LDA ) + CALL CLASET( 'Full', N, NRHS, CMPLX( ZERO ), + $ CMPLX( ZERO ), X, LDA ) IF( IEQUED.GT.1 .AND. N.GT.0 ) THEN * * Equilibrate the matrix if FACT = 'F' and diff --git a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f index 32be41f64..959258e1f 100644 --- a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f @@ -449,11 +449,11 @@ * Reconstruct matrix from factors and compute * residual. 
* -c CALL CHET01_AA( UPLO, N, A, LDA, AFAC, LDA, -c $ IWORK, AINV, LDA, RWORK, -c $ RESULT( 2 ) ) -c NT = 2 - NT = 1 +c CALL CHET01_AA( UPLO, N, A, LDA, AFAC, LDA, +c $ IWORK, AINV, LDA, RWORK, +c $ RESULT( 2 ) ) +c NT = 2 + NT = 1 * * Print information about the tests that did not pass * the threshold. diff --git a/lapack-netlib/TESTING/LIN/cdrvrfp.f b/lapack-netlib/TESTING/LIN/cdrvrfp.f index a57688f83..362a0e7cb 100644 --- a/lapack-netlib/TESTING/LIN/cdrvrfp.f +++ b/lapack-netlib/TESTING/LIN/cdrvrfp.f @@ -449,19 +449,19 @@ * Form the inverse of A. * CALL CPOTRI( UPLO, N, A, LDA, INFO ) + + IF ( N .NE. 0 ) THEN * -* Compute the 1-norm condition number of A. +* Compute the 1-norm condition number of A. * - IF ( N .NE. 0 ) THEN AINVNM = CLANHE( '1', UPLO, N, A, LDA, + S_WORK_CLANHE ) RCONDC = ( ONE / ANORM ) / AINVNM * * Restore the matrix A. * - CALL CLACPY( UPLO, N, N, ASAV, LDA, A, LDA ) + CALL CLACPY( UPLO, N, N, ASAV, LDA, A, LDA ) END IF - * END IF * diff --git a/lapack-netlib/TESTING/LIN/ddrvrfp.f b/lapack-netlib/TESTING/LIN/ddrvrfp.f index d67cf6713..18ccbdfc4 100644 --- a/lapack-netlib/TESTING/LIN/ddrvrfp.f +++ b/lapack-netlib/TESTING/LIN/ddrvrfp.f @@ -443,8 +443,7 @@ * CALL DPOTRI( UPLO, N, A, LDA, INFO ) - IF ( N .NE. 0 ) THEN - + IF ( N .NE. 0 ) THEN * * Compute the 1-norm condition number of A. * diff --git a/lapack-netlib/TESTING/LIN/sdrvrfp.f b/lapack-netlib/TESTING/LIN/sdrvrfp.f index 4b022bcfb..c0eb4d564 100644 --- a/lapack-netlib/TESTING/LIN/sdrvrfp.f +++ b/lapack-netlib/TESTING/LIN/sdrvrfp.f @@ -443,7 +443,7 @@ * CALL SPOTRI( UPLO, N, A, LDA, INFO ) - IF ( N .NE. 0 ) THEN + IF ( N .NE. 0 ) THEN * * Compute the 1-norm condition number of A. * diff --git a/lapack-netlib/TESTING/LIN/zdrvgex.f b/lapack-netlib/TESTING/LIN/zdrvgex.f index cdfa10727..1b784d31b 100644 --- a/lapack-netlib/TESTING/LIN/zdrvgex.f +++ b/lapack-netlib/TESTING/LIN/zdrvgex.f @@ -707,9 +707,10 @@ CALL ZLACPY( 'Full', N, NRHS, BSAV, LDA, B, LDA ) IF( .NOT.PREFAC ) - $ CALL ZLASET( 'Full', N, N, ZERO, ZERO, AFAC, - $ LDA ) - CALL ZLASET( 'Full', N, NRHS, ZERO, ZERO, X, LDA ) + $ CALL ZLASET( 'Full', N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) IF( IEQUED.GT.1 .AND. N.GT.0 ) THEN * * Equilibrate the matrix if FACT = 'F' and diff --git a/lapack-netlib/TESTING/LIN/zdrvhex.f b/lapack-netlib/TESTING/LIN/zdrvhex.f index 3c0dfbfe4..527114508 100644 --- a/lapack-netlib/TESTING/LIN/zdrvhex.f +++ b/lapack-netlib/TESTING/LIN/zdrvhex.f @@ -599,10 +599,10 @@ * Restore the matrices A and B. * IF( IFACT.EQ.2 ) - $ CALL ZLASET( UPLO, N, N, CMPLX( ZERO ), - $ CMPLX( ZERO ), AFAC, LDA ) - CALL ZLASET( 'Full', N, NRHS, CMPLX( ZERO ), - $ CMPLX( ZERO ), X, LDA ) + $ CALL ZLASET( UPLO, N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) * * Solve the system and compute the condition number * and error bounds using ZHESVXX. 
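The CMPLX -> DCMPLX changes in the double-complex drivers above and below fix a kind mismatch: the generic CMPLX intrinsic returns a default-kind (single precision) COMPLEX value unless an explicit KIND argument is given, so CMPLX( ZERO ) passed to a COMPLEX*16 routine such as ZLASET is silently truncated and does not agree with the dummy argument type, while DCMPLX always yields a double precision complex result. A minimal standalone sketch of the difference (a hypothetical demo program, not part of the patch; on common compilers it prints the kind values 4 and 8):

      PROGRAM CKINDS
*     KIND of CMPLX( D ) without a KIND= argument is the default
*     (single precision) complex kind; DCMPLX( D ) is COMPLEX*16.
      DOUBLE PRECISION D
      PARAMETER ( D = 0.0D+0 )
      PRINT *, KIND( CMPLX( D ) ), KIND( DCMPLX( D ) )
      END
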
diff --git a/lapack-netlib/TESTING/LIN/zdrvpox.f b/lapack-netlib/TESTING/LIN/zdrvpox.f index 260d8c1f2..0bc2c89d8 100644 --- a/lapack-netlib/TESTING/LIN/zdrvpox.f +++ b/lapack-netlib/TESTING/LIN/zdrvpox.f @@ -611,10 +611,10 @@ CALL ZLACPY( 'Full', N, NRHS, BSAV, LDA, B, LDA ) IF( .NOT.PREFAC ) - $ CALL ZLASET( UPLO, N, N, CMPLX( ZERO ), - $ CMPLX( ZERO ), AFAC, LDA ) - CALL ZLASET( 'Full', N, NRHS, CMPLX( ZERO ), - $ CMPLX( ZERO ), X, LDA ) + $ CALL ZLASET( UPLO, N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) IF( IEQUED.GT.1 .AND. N.GT.0 ) THEN * * Equilibrate the matrix if FACT='F' and diff --git a/lapack-netlib/TESTING/LIN/zdrvrfp.f b/lapack-netlib/TESTING/LIN/zdrvrfp.f index c7be7da03..b299a487b 100644 --- a/lapack-netlib/TESTING/LIN/zdrvrfp.f +++ b/lapack-netlib/TESTING/LIN/zdrvrfp.f @@ -450,7 +450,7 @@ * CALL ZPOTRI( UPLO, N, A, LDA, INFO ) - IF ( N .NE. 0 ) THEN + IF ( N .NE. 0 ) THEN * * Compute the 1-norm condition number of A. * diff --git a/lapack-netlib/TESTING/LIN/zdrvsyx.f b/lapack-netlib/TESTING/LIN/zdrvsyx.f index 9431cd692..e4556f150 100644 --- a/lapack-netlib/TESTING/LIN/zdrvsyx.f +++ b/lapack-netlib/TESTING/LIN/zdrvsyx.f @@ -605,10 +605,10 @@ * Restore the matrices A and B. * IF( IFACT.EQ.2 ) - $ CALL ZLASET( UPLO, N, N, CMPLX( ZERO ), - $ CMPLX( ZERO ), AFAC, LDA ) - CALL ZLASET( 'Full', N, NRHS, CMPLX( ZERO ), - $ CMPLX( ZERO ), X, LDA ) + $ CALL ZLASET( UPLO, N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) * * Solve the system and compute the condition number * and error bounds using ZSYSVXX. diff --git a/lapack-netlib/TESTING/LIN/zerrvxx.f b/lapack-netlib/TESTING/LIN/zerrvxx.f index 9dc008215..bdaf44d8a 100644 --- a/lapack-netlib/TESTING/LIN/zerrvxx.f +++ b/lapack-netlib/TESTING/LIN/zerrvxx.f @@ -1166,7 +1166,7 @@ $ 2, RCOND, RPVGRW, BERR, N_ERR_BNDS, ERR_BNDS_N, $ ERR_BNDS_C, NPARAMS, PARAMS, W, RW, INFO ) CALL CHKXER( 'ZSYSVXX', INFOT, NOUT, LERR, OK ) - INFOT = 13 + INFOT = 13 EQ = 'N' CALL ZSYSVXX( 'N', 'U', 2, 0, A, 2, AF, 2, IP, EQ, R, B, 1, X, $ 2, RCOND, RPVGRW, BERR, N_ERR_BNDS, ERR_BNDS_N, From 13a29d13fde096176b0e6f70be2390dd5f3250c7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Apr 2021 15:48:22 +0200 Subject: [PATCH 189/681] Apply lapack-testing fix from Reference-LAPACK PR536 fixes changing back from a single OMP thread for error exit testing to the originally requested number of threads for computational tests --- lapack-netlib/TESTING/EIG/cchkee.F | 4 ++-- lapack-netlib/TESTING/EIG/dchkee.F | 2 +- lapack-netlib/TESTING/EIG/schkee.F | 2 +- lapack-netlib/TESTING/EIG/zchkee.F | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cchkee.F b/lapack-netlib/TESTING/EIG/cchkee.F index 0d3d7493c..de4aed696 100644 --- a/lapack-netlib/TESTING/EIG/cchkee.F +++ b/lapack-netlib/TESTING/EIG/cchkee.F @@ -1871,7 +1871,7 @@ CALL XLAENV( 9, 25 ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL CERRST( 'CST', NOUT ) @@ -2338,7 +2338,7 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL CERRST( 'CHB', NOUT ) diff --git a/lapack-netlib/TESTING/EIG/dchkee.F b/lapack-netlib/TESTING/EIG/dchkee.F index 
ee22ce33d..00e8eb57f 100644 --- a/lapack-netlib/TESTING/EIG/dchkee.F +++ b/lapack-netlib/TESTING/EIG/dchkee.F @@ -1876,7 +1876,7 @@ CALL XLAENV( 9, 25 ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL DERRST( 'DST', NOUT ) diff --git a/lapack-netlib/TESTING/EIG/schkee.F b/lapack-netlib/TESTING/EIG/schkee.F index a063c18b5..c3f9ca162 100644 --- a/lapack-netlib/TESTING/EIG/schkee.F +++ b/lapack-netlib/TESTING/EIG/schkee.F @@ -1877,7 +1877,7 @@ CALL XLAENV( 9, 25 ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL SERRST( 'SST', NOUT ) diff --git a/lapack-netlib/TESTING/EIG/zchkee.F b/lapack-netlib/TESTING/EIG/zchkee.F index 29604956d..908b7d651 100644 --- a/lapack-netlib/TESTING/EIG/zchkee.F +++ b/lapack-netlib/TESTING/EIG/zchkee.F @@ -1871,7 +1871,7 @@ CALL XLAENV( 9, 25 ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL ZERRST( 'ZST', NOUT ) @@ -2336,7 +2336,7 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL ZERRST( 'ZHB', NOUT ) From aa7b3dc3dbdad15de5a239cb4fc4364815dfbc4d Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 28 Apr 2021 13:56:06 +0000 Subject: [PATCH 190/681] GEMM: skylake: improve the performance when m is small --- kernel/x86_64/dgemm_kernel_16x2_skylakex.c | 79 ++++++++++++++++++++-- 1 file changed, 75 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c index 9f2bf24e2..15185d7fc 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -149,6 +149,7 @@ #define KERNEL_h_k1m16n2 \ "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; vmovddup 64(%0),%%zmm3; vmovddup 72(%0),%%zmm4; addq $128,%0;"\ unit_acc_m16n2(8,9,10,11,%1) + #endif #define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" #define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1) @@ -283,7 +284,32 @@ #define KERNEL_h_k1m4n10 KERNEL_h_k1m4n8 unit_acc_m4n2(12,13,%%r15,%%r12,1) #define KERNEL_k1m4n10 KERNEL_h_k1m4n10 "addq $16,%%r15;" #define KERNEL_h_k1m4n12 KERNEL_h_k1m4n10 unit_acc_m4n2(14,15,%%r15,%%r12,2) -#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" +//#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" +#define unit_acc_k2m4n2(c1_no,c2_no,...)\ + "vbroadcastf64x4 ("#__VA_ARGS__"),%%zmm3; vpermpd %%zmm3,%%zmm30,%%zmm3;"\ + "vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";" + +#define unit_merge_to_ymm(c1_no) \ + "vextractf64x4 $1,%%zmm"#c1_no",%%ymm30; vaddpd %%ymm"#c1_no",%%ymm30,%%ymm"#c1_no";" + +#define KERNEL_k1m4n12 \ + "cmpq $2, %5; jb 104912f;"\ + "vmovupd 64+%11,%%zmm30;"\ + "\n204912:"\ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; addq $64,%0;" \ + unit_acc_k2m4n2(4,5,%1) unit_acc_k2m4n2(6,7,%1,%%r12,1) unit_acc_k2m4n2(8, 9, %1, %%r12, 2) "addq $32,%1;" \ + unit_acc_k2m4n2(10,11,%%r15) unit_acc_k2m4n2(12,13,%%r15,%%r12,1) unit_acc_k2m4n2(14,15,%%r15,%%r12,2) "addq $32,%%r15;" \ + "subq $2, %5; cmpq $2, %5; jnb 204912b;"\ + unit_merge_to_ymm(4) unit_merge_to_ymm(5) unit_merge_to_ymm(6) unit_merge_to_ymm(7) \ + 
unit_merge_to_ymm(8) unit_merge_to_ymm(9) unit_merge_to_ymm(10) unit_merge_to_ymm(11) \ + unit_merge_to_ymm(12) unit_merge_to_ymm(13) unit_merge_to_ymm(14) unit_merge_to_ymm(15) \ + "testq %5, %5; jz 1004912f;"\ + "\n104912:"\ + KERNEL_h_k1m4n12 "addq $16,%%r15;"\ + "decq %5; jnz 104912b;"\ + "\n1004912:"\ + "incq %5;" + #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;" #define acc_kend_nc2_k1m4(boff1) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) @@ -336,7 +362,31 @@ #define KERNEL_h_k1m2n10 KERNEL_h_k1m2n8 unit_acc_m2n2(12,13,%%r15,%%r12,1) #define KERNEL_k1m2n10 KERNEL_h_k1m2n10 "addq $16,%%r15;" #define KERNEL_h_k1m2n12 KERNEL_h_k1m2n10 unit_acc_m2n2(14,15,%%r15,%%r12,2) -#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" +//#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" + +#define unit_acc_k4m2n2(c1_no,c2_no,...) \ + "vmovupd ("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";" + +#define unit_merge_to_xmm(c1_no) \ + "vextractf64x2 $0,%%zmm"#c1_no",%%xmm20; vextractf64x2 $1,%%zmm"#c1_no",%%xmm21; vextractf64x2 $2,%%zmm"#c1_no",%%xmm22; vextractf64x2 $3,%%zmm"#c1_no",%%xmm23;"\ + "vaddpd %%xmm20,%%xmm21,%%xmm20; vaddpd %%xmm22,%%xmm23,%%xmm22; vaddpd %%xmm20,%%xmm22,%%xmm"#c1_no";" + +#define KERNEL_k1m2n12 \ + "cmpq $4,%5; jb 102912f;"\ + "\n402912:"\ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; addq $64,%0;" \ + unit_acc_k4m2n2(4,5,%1) unit_acc_k4m2n2(6,7,%1,%%r12,1) unit_acc_k4m2n2(8,9,%1,%%r12,2) "addq $64,%1;" \ + unit_acc_k4m2n2(10,11,%%r15) unit_acc_k4m2n2(12,13,%%r15,%%r12,1) unit_acc_k4m2n2(14,15,%%r15,%%r12,2) "addq $64,%%r15;" \ + "subq $4,%5; cmpq $4,%5; jnb 402912b;"\ + unit_merge_to_xmm(4) unit_merge_to_xmm(5) unit_merge_to_xmm(6) unit_merge_to_xmm(7) unit_merge_to_xmm(8) unit_merge_to_xmm(9) \ + unit_merge_to_xmm(10) unit_merge_to_xmm(11) unit_merge_to_xmm(12) unit_merge_to_xmm(13) unit_merge_to_xmm(14) unit_merge_to_xmm(15) \ + "testq %5,%5; jz 1002912f;"\ + "\n102912:"\ + KERNEL_h_k1m2n12 "addq $16,%%r15;" \ + "decq %5; jnz 102912b;" \ + "\n1002912:"\ + "incq %5;" + #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;" #define acc_kend_nc2_k1m2(boff1) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) @@ -387,7 +437,24 @@ #define KERNEL_h_k1m1n10 KERNEL_h_k1m1n8 "vfmadd231pd (%%r15,%%r12,1),%%xmm1,%%xmm8;" #define KERNEL_k1m1n10 KERNEL_h_k1m1n10 "addq $16,%%r15;" #define KERNEL_h_k1m1n12 KERNEL_h_k1m1n10 "vfmadd231pd (%%r15,%%r12,2),%%xmm1,%%xmm9;" -#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" +//#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" +#define KERNEL_k1m1n12 \ + "cmpq $4,%5; jb 101912f;" \ + "vmovupd %11,%%zmm2;"\ + "\n401912:"\ + "vmovupd (%0),%%ymm1; vpermpd %%zmm1,%%zmm2,%%zmm1; addq $32,%0;" \ + "vfmadd231pd (%1),%%zmm1,%%zmm4; vfmadd231pd (%1,%%r12,1),%%zmm1,%%zmm5; vfmadd231pd (%1,%%r12,2),%%zmm1,%%zmm6; addq $64,%1;"\ + "vfmadd231pd (%%r15),%%zmm1,%%zmm7; vfmadd231pd (%%r15,%%r12,1),%%zmm1,%%zmm8; vfmadd231pd (%%r15,%%r12,2),%%zmm1,%%zmm9; addq $64,%%r15;"\ + "subq $4,%5; cmpq $4,%5; jnb 401912b;"\ + unit_merge_to_xmm(4) unit_merge_to_xmm(5) unit_merge_to_xmm(6) \ + unit_merge_to_xmm(7) unit_merge_to_xmm(8) unit_merge_to_xmm(9) \ + "testq %5,%5; jz 1001912f;"\ + "\n101912:"\ + KERNEL_h_k1m1n12 "addq $16,%%r15;" \ + "decq %5; jnz 101912b;" \ + "\n1001912:"\ + "incq %5;" 
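/*
 * Descriptive note on the small-m kernels added above (a summary of what
 * this hunk's code does, not upstream documentation): when m <= 4 the rows
 * of A cannot fill a 512-bit register, so KERNEL_k1m4n12, KERNEL_k1m2n12
 * and KERNEL_k1m1n12 unroll the k loop by 2 or 4 and use the spare zmm
 * lanes as independent partial accumulators instead. vmovddup duplicates
 * the A elements for the m=4 and m=2 cases; vpermpd with the "ababcdcd"
 * entry of the permute_table defined in CNAME below replicates the B
 * column pair for m=4, and the "aabbccdd" entry replicates the A values
 * for m=1, so each lane multiplies one A element by the B element of its
 * own k step. unit_merge_to_ymm and unit_merge_to_xmm then fold the upper
 * lanes into the lower ones with vextractf64x4 / vextractf64x2 plus
 * vaddpd, and the original one-k-per-iteration kernels run as a tail loop
 * for any k iterations left over from the unrolling.
 */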
+ #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;" #define acc_kend_nc2_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" @@ -480,7 +547,7 @@ COMPUTE_SIMPLE(1,ndim) "subq $1,%%r11;"\ #ndim"33106:\n\t"\ "movq %%r14,%1;"\ - :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(b_pref):"m"(M),"m"(ALPHA),"m"(off),"m"(K):"r10","r11","r12","r13","r14","r15","cc","memory",\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(b_pref):"m"(M),"m"(ALPHA),"m"(off),"m"(K), "o"(permute_table):"r10","r11","r12","r13","r14","r15","cc","memory",\ "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15",\ "zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31");\ a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ndim * ldc - M; TAIL_SET_OFF(ndim)\ @@ -501,6 +568,10 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0; BLASLONG n_count = n, off = 0; double *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*b_pref = B; + int64_t permute_table[] = { + 0, 0, 1, 1, 2, 2, 3, 3, // abcdxxxx -> aabbccdd + 0, 1, 0, 1, 2, 3, 2, 3, // abcdxxxx -> ababcdcd + }; #ifdef TRMMKERNEL #ifdef LEFT off = offset; From 2b01132515cc0ca709a4addf2e7101d94234b71e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 28 Apr 2021 19:20:08 +0200 Subject: [PATCH 191/681] Clean up misdeclaration of the dummy stand-in for A in ?ORGBR/?UNGBR workspace queries (Reference-LAPACK PR 468 and 530) --- lapack-netlib/SRC/cungbr.f | 8 ++++---- lapack-netlib/SRC/dorgbr.f | 8 ++++---- lapack-netlib/SRC/sorgbr.f | 8 ++++---- lapack-netlib/SRC/zungbr.f | 8 ++++---- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/lapack-netlib/SRC/cungbr.f b/lapack-netlib/SRC/cungbr.f index df25799ca..0dddd42a6 100644 --- a/lapack-netlib/SRC/cungbr.f +++ b/lapack-netlib/SRC/cungbr.f @@ -222,8 +222,8 @@ CALL CUNGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( M.GT.1 ) THEN - CALL CUNGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL CUNGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF ELSE @@ -231,8 +231,8 @@ CALL CUNGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( N.GT.1 ) THEN - CALL CUNGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL CUNGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF END IF diff --git a/lapack-netlib/SRC/dorgbr.f b/lapack-netlib/SRC/dorgbr.f index cfebda5ab..6868fc38d 100644 --- a/lapack-netlib/SRC/dorgbr.f +++ b/lapack-netlib/SRC/dorgbr.f @@ -221,8 +221,8 @@ CALL DORGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( M.GT.1 ) THEN - CALL DORGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL DORGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF ELSE @@ -230,8 +230,8 @@ CALL DORGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( N.GT.1 ) THEN - CALL DORGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL DORGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF END IF diff --git a/lapack-netlib/SRC/sorgbr.f b/lapack-netlib/SRC/sorgbr.f index dccdbb58a..2266505dc 100644 --- a/lapack-netlib/SRC/sorgbr.f +++ b/lapack-netlib/SRC/sorgbr.f @@ -221,8 +221,8 @@ CALL SORGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( 
M.GT.1 ) THEN - CALL SORGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL SORGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF ELSE @@ -230,8 +230,8 @@ CALL SORGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( N.GT.1 ) THEN - CALL SORGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL SORGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF END IF diff --git a/lapack-netlib/SRC/zungbr.f b/lapack-netlib/SRC/zungbr.f index 3cdb8127d..c1c35822c 100644 --- a/lapack-netlib/SRC/zungbr.f +++ b/lapack-netlib/SRC/zungbr.f @@ -222,8 +222,8 @@ CALL ZUNGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( M.GT.1 ) THEN - CALL ZUNGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL ZUNGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF ELSE @@ -231,8 +231,8 @@ CALL ZUNGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( N.GT.1 ) THEN - CALL ZUNGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL ZUNGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF END IF From 6b760666324806e89dfa4e52191dc7f92a13be3a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 28 Apr 2021 20:55:37 +0200 Subject: [PATCH 192/681] Add const qualifiers --- lapack-netlib/LAPACKE/include/lapack.h | 36 +++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index aedaa308d..828d3279e 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -566,8 +566,8 @@ void LAPACK_cgbrfsx( lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, lapack_complex_float const* AB, lapack_int const* ldab, lapack_complex_float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, - float* R, - float* C, + const float* R, + const float* C, lapack_complex_float const* B, lapack_int const* ldb, lapack_complex_float* X, lapack_int const* ldx, float* rcond, @@ -585,8 +585,8 @@ void LAPACK_dgbrfsx( lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, double const* AB, lapack_int const* ldab, double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, - double* R, - double* C, + const double* R, + const double* C, double const* B, lapack_int const* ldb, double* X, lapack_int const* ldx, double* rcond, @@ -604,8 +604,8 @@ void LAPACK_sgbrfsx( lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, float const* AB, lapack_int const* ldab, float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, - float* R, - float* C, + const float* R, + const float* C, float const* B, lapack_int const* ldb, float* X, lapack_int const* ldx, float* rcond, @@ -623,8 +623,8 @@ void LAPACK_zgbrfsx( lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, lapack_complex_double const* AB, lapack_int const* ldab, lapack_complex_double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, - double* R, - double* C, + const double* R, + const double* C, lapack_complex_double const* B, lapack_int const* ldb, lapack_complex_double* X, lapack_int const* ldx, double* rcond, @@ -4913,7 +4913,7 @@ void LAPACK_cherfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_float const* A, lapack_int const* lda, lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - float* S, + const float* S, 
lapack_complex_float const* B, lapack_int const* ldb, lapack_complex_float* X, lapack_int const* ldx, float* rcond, @@ -4931,7 +4931,7 @@ void LAPACK_zherfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_double const* A, lapack_int const* lda, lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - double* S, + const double* S, lapack_complex_double const* B, lapack_int const* ldb, lapack_complex_double* X, lapack_int const* ldx, double* rcond, @@ -8005,7 +8005,7 @@ void LAPACK_cporfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_float const* A, lapack_int const* lda, lapack_complex_float const* AF, lapack_int const* ldaf, - float* S, + const float* S, lapack_complex_float const* B, lapack_int const* ldb, lapack_complex_float* X, lapack_int const* ldx, float* rcond, @@ -8023,7 +8023,7 @@ void LAPACK_dporfsx( lapack_int const* n, lapack_int const* nrhs, double const* A, lapack_int const* lda, double const* AF, lapack_int const* ldaf, - double* S, + const double* S, double const* B, lapack_int const* ldb, double* X, lapack_int const* ldx, double* rcond, @@ -8041,7 +8041,7 @@ void LAPACK_sporfsx( lapack_int const* n, lapack_int const* nrhs, float const* A, lapack_int const* lda, float const* AF, lapack_int const* ldaf, - float* S, + const float* S, float const* B, lapack_int const* ldb, float* X, lapack_int const* ldx, float* rcond, @@ -8059,7 +8059,7 @@ void LAPACK_zporfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_double const* A, lapack_int const* lda, lapack_complex_double const* AF, lapack_int const* ldaf, - double* S, + const double* S, lapack_complex_double const* B, lapack_int const* ldb, lapack_complex_double* X, lapack_int const* ldx, double* rcond, @@ -10756,7 +10756,7 @@ void LAPACK_csyrfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_float const* A, lapack_int const* lda, lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - float* S, + const float* S, lapack_complex_float const* B, lapack_int const* ldb, lapack_complex_float* X, lapack_int const* ldx, float* rcond, @@ -10774,7 +10774,7 @@ void LAPACK_dsyrfsx( lapack_int const* n, lapack_int const* nrhs, double const* A, lapack_int const* lda, double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - double* S, + const double* S, double const* B, lapack_int const* ldb, double* X, lapack_int const* ldx, double* rcond, @@ -10792,7 +10792,7 @@ void LAPACK_ssyrfsx( lapack_int const* n, lapack_int const* nrhs, float const* A, lapack_int const* lda, float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - float* S, + const float* S, float const* B, lapack_int const* ldb, float* X, lapack_int const* ldx, float* rcond, @@ -10810,7 +10810,7 @@ void LAPACK_zsyrfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_double const* A, lapack_int const* lda, lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - double* S, + const double* S, lapack_complex_double const* B, lapack_int const* ldb, lapack_complex_double* X, lapack_int const* ldx, double* rcond, From 3704f5e5b0a229cbfb3f949dd5fcea557915f49b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 28 Apr 2021 20:56:55 +0200 Subject: [PATCH 193/681] Add missing break statements in the ?lascl functions --- lapack-netlib/LAPACKE/src/lapacke_clascl.c | 1 + lapack-netlib/LAPACKE/src/lapacke_dlascl.c | 1 + lapack-netlib/LAPACKE/src/lapacke_slascl.c | 1 + lapack-netlib/LAPACKE/src/lapacke_zlascl.c | 1 + 4 files 
changed, 4 insertions(+) diff --git a/lapack-netlib/LAPACKE/src/lapacke_clascl.c b/lapack-netlib/LAPACKE/src/lapacke_clascl.c index fdcb02947..4f4e0bf35 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_clascl( int matrix_layout, char type, lapack_int kl, LAPACKE_cgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_chb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlascl.c b/lapack-netlib/LAPACKE/src/lapacke_dlascl.c index 5b579a5d1..058105127 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_dlascl( int matrix_layout, char type, lapack_int kl, LAPACKE_dgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_dsb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_slascl.c b/lapack-netlib/LAPACKE/src/lapacke_slascl.c index 25bd9624e..62f7390ed 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_slascl( int matrix_layout, char type, lapack_int kl, LAPACKE_sgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_ssb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlascl.c b/lapack-netlib/LAPACKE/src/lapacke_zlascl.c index 7e37d559c..8bf1ee767 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_zlascl( int matrix_layout, char type, lapack_int kl, LAPACKE_zgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_zhb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { From c5fb91f1bc19baac6c874e6a41fd107c40187278 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Apr 2021 09:47:18 +0200 Subject: [PATCH 194/681] Fix division by zero in the non-x86 codepath --- interface/zrotg.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/interface/zrotg.c b/interface/zrotg.c index 8caa411fc..bc4f06492 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -79,8 +79,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ aa_i = fabs(da_r); } - scale = (aa_i / aa_r); - ada = aa_r * sqrt(ONE + scale * scale); + if (aa_r == ZERO) { + ada = 0.; + } else { + scale = (aa_i / aa_r); + ada = aa_r * sqrt(ONE + scale * scale); + } bb_r = fabs(db_r); bb_i = fabs(db_i); @@ -90,9 +94,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ bb_i = fabs(bb_r); } - scale = (bb_i / bb_r); - adb = bb_r * sqrt(ONE + scale * scale); - + if (bb_r == ZERO) { + adb = 0.; + } else { + scale = (bb_i / bb_r); + adb = bb_r * sqrt(ONE + scale * scale); + } scale = ada + adb; aa_r = da_r / scale; From 444cb78be54e76a90d25476893748c44957d3553 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Apr 2021 09:26:54 +0200 Subject: [PATCH 195/681] correct INFO value (Reference-LAPACK 506) --- lapack-netlib/SRC/dlasq2.f | 12 
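Before the lasq2 diffs, a remark on the zrotg fix just above. The non-x86 path forms |a| and |b| with the classic scaling max*sqrt(1 + (min/max)^2) to dodge overflow, but when the larger component is exactly zero the quotient becomes 0/0 and the magnitude turns into NaN; the patch short-circuits that case to zero. A self-contained C sketch of the same pattern (the helper name safe_cabs is illustrative, not OpenBLAS code):

    #include <math.h>

    /* Overflow-safe |re + i*im| with the zero guard the patch adds. */
    static double safe_cabs(double re, double im)
    {
        double a = fabs(re), b = fabs(im);
        if (a < b) { double t = a; a = b; b = t; }  /* ensure a >= b */
        if (a == 0.0)
            return 0.0;          /* both parts zero: avoid 0/0 = NaN */
        double s = b / a;        /* s <= 1, so s*s cannot overflow */
        return a * sqrt(1.0 + s * s);
    }

C99's hypot() provides the same protection; the hand-rolled form simply mirrors the structure of the patched code.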
++++++++++-- lapack-netlib/SRC/slasq2.f | 12 ++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/dlasq2.f b/lapack-netlib/SRC/dlasq2.f index 68d922870..27eb1f79a 100644 --- a/lapack-netlib/SRC/dlasq2.f +++ b/lapack-netlib/SRC/dlasq2.f @@ -184,10 +184,18 @@ * * 2-by-2 case. * - IF( Z( 2 ).LT.ZERO .OR. Z( 3 ).LT.ZERO ) THEN - INFO = -2 + IF( Z( 1 ).LT.ZERO ) THEN + INFO = -201 + CALL XERBLA( 'DLASQ2', 2 ) + RETURN + ELSE IF( Z( 2 ).LT.ZERO ) THEN + INFO = -202 CALL XERBLA( 'DLASQ2', 2 ) RETURN + ELSE IF( Z( 3 ).LT.ZERO ) THEN + INFO = -203 + CALL XERBLA( 'DLASQ2', 2 ) + RETURN ELSE IF( Z( 3 ).GT.Z( 1 ) ) THEN D = Z( 3 ) Z( 3 ) = Z( 1 ) diff --git a/lapack-netlib/SRC/slasq2.f b/lapack-netlib/SRC/slasq2.f index 6e5f86447..219797c4a 100644 --- a/lapack-netlib/SRC/slasq2.f +++ b/lapack-netlib/SRC/slasq2.f @@ -183,10 +183,18 @@ * * 2-by-2 case. * - IF( Z( 2 ).LT.ZERO .OR. Z( 3 ).LT.ZERO ) THEN - INFO = -2 + IF( Z( 1 ).LT.ZERO ) THEN + INFO = -201 + CALL XERBLA( 'DLASQ2', 2 ) + RETURN + ELSE IF( Z( 2 ).LT.ZERO ) THEN + INFO = -202 CALL XERBLA( 'SLASQ2', 2 ) RETURN + ELSE IF( Z( 3 ).LT.ZERO ) THEN + INFO = -203 + CALL XERBLA( 'SLASQ2', 2 ) + RETURN ELSE IF( Z( 3 ).GT.Z( 1 ) ) THEN D = Z( 3 ) Z( 3 ) = Z( 1 ) From 87d2e314db541342e42040f0d0ec93147fd9fe04 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Apr 2021 13:50:55 +0200 Subject: [PATCH 196/681] Import packing improvements in LAPACK xLAQR from Reference-LAPACK PR 480+535 --- lapack-netlib/SRC/chseqr.f | 4 +- lapack-netlib/SRC/claqr0.f | 16 +- lapack-netlib/SRC/claqr4.f | 16 +- lapack-netlib/SRC/claqr5.f | 711 ++++++++++++++++-------------------- lapack-netlib/SRC/dhseqr.f | 4 +- lapack-netlib/SRC/dlaqr0.f | 16 +- lapack-netlib/SRC/dlaqr4.f | 16 +- lapack-netlib/SRC/dlaqr5.f | 684 +++++++++++++++-------------------- lapack-netlib/SRC/shseqr.f | 4 +- lapack-netlib/SRC/slaqr0.f | 16 +- lapack-netlib/SRC/slaqr4.f | 16 +- lapack-netlib/SRC/slaqr5.f | 686 +++++++++++++++-------------------- lapack-netlib/SRC/zhseqr.f | 4 +- lapack-netlib/SRC/zlaqr0.f | 16 +- lapack-netlib/SRC/zlaqr4.f | 16 +- lapack-netlib/SRC/zlaqr5.f | 712 ++++++++++++++++--------------------- 16 files changed, 1267 insertions(+), 1670 deletions(-) diff --git a/lapack-netlib/SRC/chseqr.f b/lapack-netlib/SRC/chseqr.f index cfcf725b2..32b6fa87b 100644 --- a/lapack-netlib/SRC/chseqr.f +++ b/lapack-netlib/SRC/chseqr.f @@ -320,10 +320,10 @@ * . CLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare CLAHQR failure. NL > NTINY = 11 is +* . through a rare CLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/claqr0.f b/lapack-netlib/SRC/claqr0.f index 2f0ea20db..233721352 100644 --- a/lapack-netlib/SRC/claqr0.f +++ b/lapack-netlib/SRC/claqr0.f @@ -260,7 +260,7 @@ * . CLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -355,22 +355,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . 
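Stepping back to the ?lasq2 change for a moment: the lasq2 routines document the convention that INFO = -(i*100+j) marks an illegal value in entry j of array argument i, so the new codes -201, -202 and -203 decode as Z(1), Z(2) and Z(3) of argument 2, where the old code collapsed every negative entry into INFO = -2. A caller seeing INFO = -203, for instance, reads it as 2*100 + 3: the third entry of the second argument was negative. (In the slasq2.f hunk the Z(1) branch passes 'DLASQ2' rather than 'SLASQ2' to XERBLA, which looks like a copy-paste slip inherited from the Reference-LAPACK change.)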
point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'CLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'CLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -418,7 +418,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -558,7 +558,7 @@ * * ==== Got NS/2 or fewer shifts? Use CLAQR4 or * . CLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -659,7 +659,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/claqr4.f b/lapack-netlib/SRC/claqr4.f index fba286df7..94484e798 100644 --- a/lapack-netlib/SRC/claqr4.f +++ b/lapack-netlib/SRC/claqr4.f @@ -270,7 +270,7 @@ * . CLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -365,22 +365,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'CLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'CLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -428,7 +428,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -568,7 +568,7 @@ * * ==== Got NS/2 or fewer shifts? Use CLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . 
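The revised limits in these xLAQR drivers are easy to sanity-check against the new bulge geometry. An optimally packed chain spends two columns per bulge instead of three, so the slab carrying NS shifts narrows from KDU = 3*NS-3 columns to KDU = 2*NS (27 down to 20 for NS = 10), and the shift cap can rise from (N+6)/9 to (N-3)/6: for N = 96 the even-adjusted NSR goes from 10 to 14 simultaneous shifts. Raising NTINY from 11 to 15 keeps the smallest matrices routed to xLAHQR, where the comments note the tighter packing cannot otherwise be given enough subdiagonal scratch.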
there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -663,7 +663,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/claqr5.f b/lapack-netlib/SRC/claqr5.f index e4317a3ad..71f26d8c9 100644 --- a/lapack-netlib/SRC/claqr5.f +++ b/lapack-netlib/SRC/claqr5.f @@ -69,10 +69,9 @@ *> matrix entries. *> = 1: CLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: CLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -170,14 +169,14 @@ *> *> \param[out] U *> \verbatim -*> U is COMPLEX array, dimension (LDU,3*NSHFTS-3) +*> U is COMPLEX array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -189,7 +188,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is COMPLEX array, dimension (LDWV,3*NSHFTS-3) +*> WV is COMPLEX array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -215,7 +214,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -226,7 +225,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup complexOTHERauxiliary * @@ -235,6 +234,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -244,10 +248,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, $ H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, LDU, NV, $ WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -276,11 +285,11 @@ COMPLEX ALPHA, BETA, CDUM, REFSUM REAL H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP - INTEGER I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. REAL SLAMCH @@ -334,10 +343,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. 
( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -349,28 +354,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 210 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL CLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 140 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -379,24 +395,156 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 10 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL CLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), + $ S( 2*M22 ), V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. 
==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = CONJG( V( 1, M22 ) )* + $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. ==== +* + IF( K.GE.KTOP) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) + IF( TST1.EQ.RZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + CABS1( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + CABS1( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + CABS1( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + CABS1( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + CABS1( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + CABS1( H( K+4, K+1 ) ) + END IF + IF( CABS1( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H21 = MIN( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H11 = MAX( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.RZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL CLAQR1( 3, H( KTOP, KTOP ), LDH, S( 2*M-1 ), $ S( 2*M ), V( 1, M ) ) ALPHA = V( 1, M ) CALL CLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*CONJG( V( 2, M ) ) + H( K+3, K+2 ) = H( K+3, K+2 ) - + $ REFSUM*CONJG( V( 3, M ) ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL CLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -444,7 +592,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -458,163 +606,32 @@ END IF END IF END IF - 10 CONTINUE -* -* ==== Generate a 2-by-2 reflection, if needed. 
==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL CLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), - $ S( 2*M22 ), V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF -* -* ==== Multiply H by reflections from the left ==== -* - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 30 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 20 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = CONJG( V( 1, M ) )* - $ ( H( K+1, J )+CONJG( V( 2, M ) )*H( K+2, J )+ - $ CONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 20 CONTINUE - 30 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 40 J = MAX( K+1, KTOP ), JBOT - REFSUM = CONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 40 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 80 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 50 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 50 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 60 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 60 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 70 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 70 CONTINUE - END IF - END IF - 80 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 90 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 90 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 100 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 100 CONTINUE - ELSE IF( WANTZ ) THEN - DO 110 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 110 CONTINUE - END IF - END IF - END IF * -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 120 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. ==== +* + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + H( J, K+3 ) = H( J, K+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 70 CONTINUE +* +* ==== Perform update from left for subsequent +* . column. ==== +* + REFSUM = CONJG( V( 1, M ) )*( H( K+1, K+1 ) + $ +CONJG( V( 2, M ) )*H( K+2, K+1 ) + $ +CONJG( V( 3, M ) )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -625,6 +642,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) IF( TST1.EQ.RZERO ) THEN @@ -658,22 +677,77 @@ $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO END IF END IF - 120 CONTINUE + 80 CONTINUE +* +* ==== Multiply H by reflections from the left ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = CONJG( V( 1, M ) )* + $ ( H( K+1, J )+CONJG( V( 2, M ) )* + $ H( K+2, J )+CONJG( V( 3, M ) )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== * -* ==== Fill in the last row of each bulge. 
==== + IF( ACCUM ) THEN * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 130 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*CONJG( V( 2, M ) ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*CONJG( V( 3, M ) ) - 130 CONTINUE +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) ==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + U( J, KMS+3 ) = U( J, KMS+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + Z( J, K+3 ) = Z( J, K+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 140 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -687,220 +761,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN -* -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. 
==== -* - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 -* -* ==== Horizontal Multiply ==== -* - DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL CGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL CLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 150 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 +* +* ==== Horizontal Multiply ==== +* + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL CGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL CLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE +* +* ==== Vertical multiply ==== +* + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL CLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE +* +* ==== Z multiply (also vertical) ==== +* + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL CLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) - 160 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 170 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL CLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 170 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 180 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) 
==== -* - CALL CLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL CLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL CTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**H ==== -* - CALL CGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL CLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL CTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL CLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 180 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 190 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL CLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL CTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL CLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL CLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 190 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 200 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL CLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL CLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL CTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL CLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL CLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 200 CONTINUE - END IF + $ Z( JROW, INCOL+K1 ), LDZ ) + 170 CONTINUE END IF END IF - 210 CONTINUE + 180 CONTINUE * * ==== End of CLAQR5 ==== * diff --git a/lapack-netlib/SRC/dhseqr.f b/lapack-netlib/SRC/dhseqr.f index b4fc3af90..6b7fb308f 100644 --- a/lapack-netlib/SRC/dhseqr.f +++ b/lapack-netlib/SRC/dhseqr.f @@ -338,10 +338,10 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare DLAHQR failure. NL > NTINY = 11 is +* . through a rare DLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/dlaqr0.f b/lapack-netlib/SRC/dlaqr0.f index f362c096c..8334d8d2b 100644 --- a/lapack-netlib/SRC/dlaqr0.f +++ b/lapack-netlib/SRC/dlaqr0.f @@ -278,7 +278,7 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -362,22 +362,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'DLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'DLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -425,7 +425,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. 
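With the 2-by-2 block branch deleted, the far-from-diagonal updates in CLAQR5 above (and in its real and double-complex siblings below) always flow through the plain horizontal and vertical GEMM multiplies followed by a copy-back. A compact column-major C rendering of the horizontal update, assuming a CBLAS implementation is linked; the function name and its slicing are illustrative, not OpenBLAS internals:

    #include <cblas.h>
    #include <string.h>

    /* Overwrite the nu-by-jlen slab starting at h = &H(row0,col0)
     * (column-major, leading dimension ldh) with U**T times itself,
     * using wh as nu-by-jlen scratch, as in the retained
     * "Horizontal Multiply" block of xLAQR5. */
    static void apply_u_horizontal(int nu, int jlen,
                                   const double *u, int ldu,
                                   double *h, int ldh,
                                   double *wh, int ldwh)
    {
        /* WH := U**T * H_slab, one DGEMM instead of many reflections */
        cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans,
                    nu, jlen, nu, 1.0, u, ldu, h, ldh, 0.0, wh, ldwh);
        /* copy WH back over the slab, column by column (xLACPY) */
        for (int j = 0; j < jlen; ++j)
            memcpy(&h[(size_t)j * (size_t)ldh],
                   &wh[(size_t)j * (size_t)ldwh],
                   (size_t)nu * sizeof(double));
    }

The deleted branch saved flops by splitting U into triangular pieces and calling TRMM; presumably it was dropped because the new packing changes the zero structure of U so that the split no longer pays off.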
==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -576,7 +576,7 @@ * * ==== Got NS/2 or fewer shifts? Use DLAQR4 or * . DLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -698,7 +698,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/dlaqr4.f b/lapack-netlib/SRC/dlaqr4.f index 454bf9608..163e55deb 100644 --- a/lapack-netlib/SRC/dlaqr4.f +++ b/lapack-netlib/SRC/dlaqr4.f @@ -284,7 +284,7 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -368,22 +368,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'DLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'DLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -431,7 +431,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -582,7 +582,7 @@ * * ==== Got NS/2 or fewer shifts? Use DLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -697,7 +697,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/dlaqr5.f b/lapack-netlib/SRC/dlaqr5.f index f58db9c89..12e7db637 100644 --- a/lapack-netlib/SRC/dlaqr5.f +++ b/lapack-netlib/SRC/dlaqr5.f @@ -70,10 +70,9 @@ *> matrix entries. *> = 1: DLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: DLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. 
This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -178,14 +177,14 @@ *> *> \param[out] U *> \verbatim -*> U is DOUBLE PRECISION array, dimension (LDU,3*NSHFTS-3) +*> U is DOUBLE PRECISION array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -197,7 +196,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is DOUBLE PRECISION array, dimension (LDWV,3*NSHFTS-3) +*> WV is DOUBLE PRECISION array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -223,7 +222,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -234,7 +233,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup doubleOTHERauxiliary * @@ -243,6 +242,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -252,10 +256,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, $ LDU, NV, WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -282,11 +291,11 @@ DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP - INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -356,10 +365,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -371,28 +376,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 220 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL DLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . 
multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 150 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -401,17 +417,134 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 20 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), + $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), + $ V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. 
==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) + IF( TST1.EQ.ZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + ABS( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + ABS( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + ABS( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) + END IF + IF( ABS( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H21 = MIN( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H11 = MAX( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL DLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), @@ -419,7 +552,20 @@ ALPHA = V( 1, M ) CALL DLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*V( 2, M ) + H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*V( 3, M ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL DLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -467,7 +613,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -481,154 +627,29 @@ END IF END IF END IF - 20 CONTINUE -* -* ==== Generate a 2-by-2 reflection, if needed. 
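A line of algebra behind the delayed transformation above: after DLARFG, V(1,M) holds tau, so the reflector applied from the right is P = I - V(1,M)*v*v**T with v = (1, V(2,M), V(3,M))**T. The row just below the Mth bulge has the form r = (0, 0, h) with h = H(K+3,K+2), so r*v = h*V(3,M) and

    r*P = r - REFSUM*(1, V(2,M), V(3,M)),  where REFSUM = V(1,M)*V(3,M)*h

which is exactly the three assignments in the new code: H(K+3,K) = -REFSUM, H(K+3,K+1) = -REFSUM*V(2,M) and H(K+3,K+2) = h - REFSUM*V(3,M). Exploiting the two known zeros reduces this to a handful of multiplies rather than a full dot product and rank-one update; the complex version in claqr5 is identical up to conjugates.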
==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), - $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), - $ V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF -* -* ==== Multiply H by reflections from the left ==== -* - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 40 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 30 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 30 CONTINUE - 40 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 50 J = MAX( K+1, KTOP ), JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 50 CONTINUE - END IF * -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 90 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 60 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) - 60 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 70 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) - 70 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 80 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) - 80 CONTINUE - END IF - END IF - 90 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 100 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) - 100 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 110 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*V( 2, M22 ) - 110 CONTINUE - ELSE IF( WANTZ ) THEN - DO 120 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) - 120 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 130 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. ==== +* + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) + H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + 70 CONTINUE +* +* ==== Perform update from left for subsequent +* . column. ==== +* + REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* + $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -639,6 +660,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) IF( TST1.EQ.ZERO ) THEN @@ -667,25 +690,77 @@ TST2 = H22*( H11 / SCL ) * IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. - $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF END IF END IF - 130 CONTINUE + 80 CONTINUE * -* ==== Fill in the last row of each bulge. 
==== +* ==== Multiply H by reflections from the left ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 140 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*V( 2, M ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) - 140 CONTINUE + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* + $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) ==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 150 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -699,220 +774,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN -* -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. 
==== -* - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 -* -* ==== Horizontal Multiply ==== -* - DO 160 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 +* +* ==== Horizontal Multiply ==== +* + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, $ LDWH ) - CALL DLACPY( 'ALL', NU, JLEN, WH, LDWH, + CALL DLACPY( 'ALL', NU, JLEN, WH, LDWH, $ H( INCOL+K1, JCOL ), LDH ) - 160 CONTINUE + 150 CONTINUE +* +* ==== Vertical multiply ==== +* + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE * -* ==== Vertical multiply ==== +* ==== Z multiply (also vertical) ==== * - DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) + $ Z( JROW, INCOL+K1 ), LDZ ) 170 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 180 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 180 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 190 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) 
==== -* - CALL DLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL DLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL DTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**T ==== -* - CALL DGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL DLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL DTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL DGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL DLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 190 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL DLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL DLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 200 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 210 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL DLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL DLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 210 CONTINUE - END IF END IF END IF - 220 CONTINUE + 180 CONTINUE * * ==== End of DLAQR5 ==== * diff --git a/lapack-netlib/SRC/shseqr.f b/lapack-netlib/SRC/shseqr.f index b5707f2c3..d22bd7b94 100644 --- a/lapack-netlib/SRC/shseqr.f +++ b/lapack-netlib/SRC/shseqr.f @@ -338,10 +338,10 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare SLAHQR failure. NL > NTINY = 11 is +* . through a rare SLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/slaqr0.f b/lapack-netlib/SRC/slaqr0.f index 318b46943..b1ebaff75 100644 --- a/lapack-netlib/SRC/slaqr0.f +++ b/lapack-netlib/SRC/slaqr0.f @@ -277,7 +277,7 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -361,22 +361,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'SLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'SLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -424,7 +424,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. 
==== @@ -575,7 +575,7 @@ * * ==== Got NS/2 or fewer shifts? Use SLAQR4 or * . SLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -697,7 +697,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/slaqr4.f b/lapack-netlib/SRC/slaqr4.f index cd642e07f..4ba2f8757 100644 --- a/lapack-netlib/SRC/slaqr4.f +++ b/lapack-netlib/SRC/slaqr4.f @@ -287,7 +287,7 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -371,22 +371,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'SLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'SLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -434,7 +434,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -585,7 +585,7 @@ * * ==== Got NS/2 or fewer shifts? Use SLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -700,7 +700,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/slaqr5.f b/lapack-netlib/SRC/slaqr5.f index f04ee577e..d60a1d3c0 100644 --- a/lapack-netlib/SRC/slaqr5.f +++ b/lapack-netlib/SRC/slaqr5.f @@ -70,10 +70,9 @@ *> matrix entries. *> = 1: SLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: SLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. 
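+*>               The value 2 remains a valid input and is treated
+*>               exactly as KACC22 = 1.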
*> \endverbatim *> *> \param[in] N @@ -178,14 +177,14 @@ *> *> \param[out] U *> \verbatim -*> U is REAL array, dimension (LDU,3*NSHFTS-3) +*> U is REAL array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -197,7 +196,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is REAL array, dimension (LDWV,3*NSHFTS-3) +*> WV is REAL array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -223,7 +222,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -234,7 +233,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup realOTHERauxiliary * @@ -243,6 +242,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -252,10 +256,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, $ LDU, NV, WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -282,11 +291,11 @@ REAL ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP - INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. REAL SLAMCH @@ -356,10 +365,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -371,28 +376,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 220 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL SLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). 
The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS-1 columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 150 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -401,17 +417,134 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 20 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL SLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), + $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), + $ V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. 
==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) + IF( TST1.EQ.ZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + ABS( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + ABS( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + ABS( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) + END IF + IF( ABS( H( K+1, K ) ).LE.MAX( SMLNUM, ULP*TST1 ) ) + $ THEN + H12 = MAX( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H21 = MIN( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H11 = MAX( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL SLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), @@ -419,7 +552,20 @@ ALPHA = V( 1, M ) CALL SLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*V( 2, M ) + H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*V( 3, M ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL SLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -467,7 +613,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -481,154 +627,29 @@ END IF END IF END IF - 20 CONTINUE -* -* ==== Generate a 2-by-2 reflection, if needed. ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL SLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), - $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), - $ V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF * -* ==== Multiply H by reflections from the left ==== +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. 
==== * - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 40 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 30 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 30 CONTINUE - 40 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 50 J = MAX( K+1, KTOP ), JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 50 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 90 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 60 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) - 60 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 70 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) - 70 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. ==== -* - DO 80 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) - 80 CONTINUE - END IF - END IF - 90 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 100 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) - 100 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 110 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM* - $ V( 2, M22 ) - 110 CONTINUE - ELSE IF( WANTZ ) THEN - DO 120 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) - 120 CONTINUE - END IF - END IF - END IF + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) + H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + 70 CONTINUE * -* ==== Vigilant deflation check ==== +* ==== Perform update from left for subsequent +* . column. 
==== * - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 130 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) + REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* + $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -639,6 +660,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) IF( TST1.EQ.ZERO ) THEN @@ -667,25 +690,77 @@ TST2 = H22*( H11 / SCL ) * IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. - $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF END IF END IF - 130 CONTINUE + 80 CONTINUE +* +* ==== Multiply H by reflections from the left ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* + $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE * -* ==== Fill in the last row of each bulge. ==== +* ==== Accumulate orthogonal transformations. ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 140 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*V( 2, M ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) - 140 CONTINUE + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) ==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 150 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -699,220 +774,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN -* -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . 
bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. ==== -* - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 -* -* ==== Horizontal Multiply ==== -* - DO 160 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL SGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL SLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 160 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 +* +* ==== Horizontal Multiply ==== +* + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL SGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL SLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE +* +* ==== Vertical multiply ==== +* + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE +* +* ==== Z multiply (also vertical) ==== +* + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) + $ Z( JROW, INCOL+K1 ), LDZ ) 170 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 180 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 180 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 190 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) 
==== -* - CALL SLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL SLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL STRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**T ==== -* - CALL SGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL SLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL STRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL SLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 190 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL SLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL SLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL STRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL SLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL STRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL SLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 200 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 210 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL SLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL SLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL STRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL SLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL STRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL SLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 210 CONTINUE - END IF END IF END IF - 220 CONTINUE + 180 CONTINUE * * ==== End of SLAQR5 ==== * diff --git a/lapack-netlib/SRC/zhseqr.f b/lapack-netlib/SRC/zhseqr.f index 2ee874dfd..e0fddd3a7 100644 --- a/lapack-netlib/SRC/zhseqr.f +++ b/lapack-netlib/SRC/zhseqr.f @@ -320,10 +320,10 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare ZLAHQR failure. NL > NTINY = 11 is +* . through a rare ZLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/zlaqr0.f b/lapack-netlib/SRC/zlaqr0.f index feffe9782..edf01bc7c 100644 --- a/lapack-netlib/SRC/zlaqr0.f +++ b/lapack-netlib/SRC/zlaqr0.f @@ -262,7 +262,7 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -357,22 +357,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'ZLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'ZLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -420,7 +420,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. 
==== @@ -560,7 +560,7 @@ * * ==== Got NS/2 or fewer shifts? Use ZLAQR4 or * . ZLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -661,7 +661,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/zlaqr4.f b/lapack-netlib/SRC/zlaqr4.f index a88f6508e..6d083fcda 100644 --- a/lapack-netlib/SRC/zlaqr4.f +++ b/lapack-netlib/SRC/zlaqr4.f @@ -268,7 +268,7 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -363,22 +363,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'ZLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'ZLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -426,7 +426,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -566,7 +566,7 @@ * * ==== Got NS/2 or fewer shifts? Use ZLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -661,7 +661,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/zlaqr5.f b/lapack-netlib/SRC/zlaqr5.f index 9ff7e7eca..c12f4b780 100644 --- a/lapack-netlib/SRC/zlaqr5.f +++ b/lapack-netlib/SRC/zlaqr5.f @@ -69,10 +69,9 @@ *> matrix entries. *> = 1: ZLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: ZLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. 
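+*>               The value 2 remains a valid input and is treated
+*>               exactly as KACC22 = 1.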
*> \endverbatim *> *> \param[in] N @@ -170,14 +169,14 @@ *> *> \param[out] U *> \verbatim -*> U is COMPLEX*16 array, dimension (LDU,3*NSHFTS-3) +*> U is COMPLEX*16 array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -189,7 +188,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is COMPLEX*16 array, dimension (LDWV,3*NSHFTS-3) +*> WV is COMPLEX*16 array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -215,7 +214,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -226,7 +225,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup complex16OTHERauxiliary * @@ -235,6 +234,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -244,10 +248,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE ZLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, $ H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, LDU, NV, $ WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -276,11 +285,11 @@ COMPLEX*16 ALPHA, BETA, CDUM, REFSUM DOUBLE PRECISION H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP - INTEGER I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -334,10 +343,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -349,28 +354,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 210 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL ZLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . 
(including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 140 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -379,24 +395,156 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 10 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL ZLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), + $ S( 2*M22 ), V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = DCONJG( V( 1, M22 ) )* + $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. 
==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) + IF( TST1.EQ.RZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + CABS1( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + CABS1( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + CABS1( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + CABS1( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + CABS1( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + CABS1( H( K+4, K+1 ) ) + END IF + IF( CABS1( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H21 = MIN( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H11 = MAX( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.RZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL ZLAQR1( 3, H( KTOP, KTOP ), LDH, S( 2*M-1 ), $ S( 2*M ), V( 1, M ) ) ALPHA = V( 1, M ) CALL ZLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*DCONJG( V( 2, M ) ) + H( K+3, K+2 ) = H( K+3, K+2 ) - + $ REFSUM*DCONJG( V( 3, M ) ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL ZLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -444,7 +592,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -458,163 +606,32 @@ END IF END IF END IF - 10 CONTINUE -* -* ==== Generate a 2-by-2 reflection, if needed. 
==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL ZLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), - $ S( 2*M22 ), V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF * -* ==== Multiply H by reflections from the left ==== -* - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 30 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 20 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = DCONJG( V( 1, M ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M ) )* - $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 20 CONTINUE - 30 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 40 J = MAX( K+1, KTOP ), JBOT - REFSUM = DCONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 40 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 80 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 50 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 50 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 60 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 60 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 70 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 70 CONTINUE - END IF - END IF - 80 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 90 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 90 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 100 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 100 CONTINUE - ELSE IF( WANTZ ) THEN - DO 110 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 110 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 120 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. ==== +* + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + H( J, K+3 ) = H( J, K+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 70 CONTINUE +* +* ==== Perform update from left for subsequent +* . column. ==== +* + REFSUM = DCONJG( V( 1, M ) )*( H( K+1, K+1 ) + $ +DCONJG( V( 2, M ) )*H( K+2, K+1 ) + $ +DCONJG( V( 3, M ) )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -625,6 +642,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) IF( TST1.EQ.RZERO ) THEN @@ -658,23 +677,77 @@ $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO END IF END IF - 120 CONTINUE + 80 CONTINUE +* +* ==== Multiply H by reflections from the left ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF * -* ==== Fill in the last row of each bulge. 
==== + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = DCONJG( V( 1, M ) )* + $ ( H( K+1, J )+DCONJG( V( 2, M ) )* + $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 130 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*DCONJG( V( 2, M ) ) - H( K+4, K+3 ) = H( K+4, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 130 CONTINUE +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) ==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + U( J, KMS+3 ) = U( J, KMS+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + Z( J, K+3 ) = Z( J, K+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 140 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -688,220 +761,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN -* -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. 
==== -* - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 -* -* ==== Horizontal Multiply ==== -* - DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL ZGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL ZLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 150 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 +* +* ==== Horizontal Multiply ==== +* + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL ZGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL ZLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE +* +* ==== Vertical multiply ==== +* + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL ZLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE +* +* ==== Z multiply (also vertical) ==== +* + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL ZLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) - 160 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 170 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL ZLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 170 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 180 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) 
==== -* - CALL ZLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL ZLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL ZTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**H ==== -* - CALL ZGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL ZLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL ZTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL ZLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 180 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 190 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL ZLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL ZTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL ZLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL ZLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 190 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 200 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL ZLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL ZLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL ZTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL ZLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL ZLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 200 CONTINUE - END IF + $ Z( JROW, INCOL+K1 ), LDZ ) + 170 CONTINUE END IF END IF - 210 CONTINUE + 180 CONTINUE * * ==== End of ZLAQR5 ==== * From c59652f0ce88ea7bba97704f332c3ec77bd528c9 Mon Sep 17 00:00:00 2001 From: pnp Date: Fri, 30 Apr 2021 12:14:58 -0400 Subject: [PATCH 197/681] optimize on sgemv_n for small n --- kernel/x86_64/sgemv_n_4.c | 56 ++++- kernel/x86_64/sgemv_n_microk_skylakex-8.c | 258 ++++++++++++++++++++++ 2 files changed, 304 insertions(+), 10 deletions(-) create mode 100644 kernel/x86_64/sgemv_n_microk_skylakex-8.c diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 3eec21774..81d495eae 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -35,8 +35,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
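/* Reading aid for the hunks below (annotation only, not part of the upstream
   patch): each CPU family includes its own micro-kernel file, and the new
   SKYLAKEX/COOPERLAKE branch pulls in the Haswell kernels as the general-case
   fallback plus the AVX-512 file, whose HAVE_SGEMV_N_SKYLAKE_KERNEL define
   gates the small-matrix (m <= 16384, n <= 48) fast path added to CNAME. */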
#include "sgemv_n_microk_nehalem-4.c"
 #elif defined(SANDYBRIDGE)
 #include "sgemv_n_microk_sandy-4.c"
-#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE)
+#elif defined(HASWELL) || defined(ZEN)
 #include "sgemv_n_microk_haswell-4.c"
+#elif defined (SKYLAKEX) || defined (COOPERLAKE)
+#include "sgemv_n_microk_haswell-4.c"
+#include "sgemv_n_microk_skylakex-8.c"
+#endif
+
 #endif
 
 #if defined(STEAMROLLER) || defined(EXCAVATOR)
@@ -291,6 +296,41 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 {
+    if ( m < 1 || n < 1) return(0);
+
+    #ifdef HAVE_SGEMV_N_SKYLAKE_KERNEL
+    if (m <= 16384 && n <= 48 && !(n == 4))
+    {
+        FLOAT * xbuffer_align = x;
+        FLOAT * ybuffer_align = y;
+
+        FLOAT * xbuffer = NULL;
+        FLOAT * ybuffer = NULL;
+
+        if (inc_x != 1) {
+            xbuffer_align = buffer;
+            for(BLASLONG i=0; i= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6))
+
+#define HAVE_SGEMV_N_SKYLAKE_KERNEL 1
+#include "common.h"
+#include <immintrin.h>
+static int sgemv_kernel_n_128(BLASLONG m, BLASLONG n, float alpha, float *a, BLASLONG lda, float *x, float *y)
+{
+    __m512 matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7;
+    __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7;
+    __m512 xArray_0;
+    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+    BLASLONG tag_m_128x = m & (~127);
+    BLASLONG tag_m_64x = m & (~63);
+    BLASLONG tag_m_32x = m & (~31);
+    BLASLONG tag_m_16x = m & (~15);
+
+    for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) {
+        accum512_0 = _mm512_setzero_ps();
+        accum512_1 = _mm512_setzero_ps();
+        accum512_2 = _mm512_setzero_ps();
+        accum512_3 = _mm512_setzero_ps();
+        accum512_4 = _mm512_setzero_ps();
+        accum512_5 = _mm512_setzero_ps();
+        accum512_6 = _mm512_setzero_ps();
+        accum512_7 = _mm512_setzero_ps();
+
+        for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+            xArray_0 = _mm512_set1_ps(x[idx_n]);
+
+            matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]);
+            matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]);
+            matrixArray_2 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 32]);
+            matrixArray_3 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 48]);
+            matrixArray_4 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 64]);
+            matrixArray_5 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 80]);
+            matrixArray_6 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 96]);
+            matrixArray_7 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 112]);
+
+            accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0);
+            accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1);
+            accum512_2 = _mm512_fmadd_ps(matrixArray_2, xArray_0, accum512_2);
+            accum512_3 = _mm512_fmadd_ps(matrixArray_3, xArray_0, accum512_3);
+            accum512_4 = _mm512_fmadd_ps(matrixArray_4, xArray_0, accum512_4);
+            accum512_5 = _mm512_fmadd_ps(matrixArray_5, xArray_0, accum512_5);
+            accum512_6 = _mm512_fmadd_ps(matrixArray_6, xArray_0, accum512_6);
+            accum512_7 = _mm512_fmadd_ps(matrixArray_7, xArray_0, accum512_7);
+        }
+
+        _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0])));
+        _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16])));
+        _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(accum512_2, ALPHAVECTOR,
_mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(accum512_3, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + _mm512_storeu_ps(&y[idx_m + 64], _mm512_fmadd_ps(accum512_4, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 64]))); + _mm512_storeu_ps(&y[idx_m + 80], _mm512_fmadd_ps(accum512_5, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 80]))); + _mm512_storeu_ps(&y[idx_m + 96], _mm512_fmadd_ps(accum512_6, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 96]))); + _mm512_storeu_ps(&y[idx_m + 112], _mm512_fmadd_ps(accum512_7, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 112]))); + } + if (tag_m_128x != m) { + for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_64x; idx_m+=64) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 48]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1); + accum512_2 = _mm512_fmadd_ps(matrixArray_2, xArray_0, accum512_2); + accum512_3 = _mm512_fmadd_ps(matrixArray_3, xArray_0, accum512_3); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(accum512_2, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(accum512_3, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + } + + if(tag_m_64x != m) { + for (BLASLONG idx_m = tag_m_64x; idx_m < tag_m_32x; idx_m+=32) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + } + + if(tag_m_32x != m) { + + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + } + + if (tag_m_16x != m) { + accum512_0 = _mm512_setzero_ps(); + + unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + + for(BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + matrixArray_0 = _mm512_maskz_loadu_ps(tail_mask, &a[idx_n * lda + tag_m_16x]); + + accum512_0 = 
_mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + } + + _mm512_mask_storeu_ps(&y[tag_m_16x], tail_mask, _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_maskz_loadu_ps(tail_mask, &y[tag_m_16x]))); + + } + } + } + } + return 0; +} + +static int sgemv_kernel_n_64(BLASLONG m, BLASLONG n, float alpha, float *a, BLASLONG lda, float *x, float *y) +{ + __m256 ma0, ma1, ma2, ma3, ma4, ma5, ma6, ma7; + __m256 as0, as1, as2, as3, as4, as5, as6, as7; + __m256 alphav = _mm256_set1_ps(alpha); + __m256 xv; + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + __mmask8 one_mask = 0xff; + + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + as0 = _mm256_setzero_ps(); + as1 = _mm256_setzero_ps(); + as2 = _mm256_setzero_ps(); + as3 = _mm256_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma0 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +0]); + ma1 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +8]); + ma2 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +16]); + ma3 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +24]); + + as0 = _mm256_maskz_fmadd_ps(one_mask, ma0, xv, as0); + as1 = _mm256_maskz_fmadd_ps(one_mask, ma1, xv, as1); + as2 = _mm256_maskz_fmadd_ps(one_mask, ma2, xv, as2); + as3 = _mm256_maskz_fmadd_ps(one_mask, ma3, xv, as3); + } + _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as0, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m]))); + _mm256_mask_storeu_ps(&y[idx_m + 8], one_mask, _mm256_maskz_fmadd_ps(one_mask, as1, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 8]))); + _mm256_mask_storeu_ps(&y[idx_m + 16], one_mask, _mm256_maskz_fmadd_ps(one_mask, as2, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 16]))); + _mm256_mask_storeu_ps(&y[idx_m + 24], one_mask, _mm256_maskz_fmadd_ps(one_mask, as3, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 24]))); + + } + + if (tag_m_32x != m ) { + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { + as4 = _mm256_setzero_ps(); + as5 = _mm256_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma4 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +0]); + ma5 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +8]); + + as4 = _mm256_maskz_fmadd_ps(one_mask, ma4, xv, as4); + as5 = _mm256_maskz_fmadd_ps(one_mask, ma5, xv, as5); + } + _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as4, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m]))); + _mm256_mask_storeu_ps(&y[idx_m + 8], one_mask, _mm256_maskz_fmadd_ps(one_mask, as5, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 8]))); + } + + if (tag_m_16x != m ) { + for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + as6 = _mm256_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma6 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m]); + as6 = _mm256_maskz_fmadd_ps(one_mask, ma6, xv, as6); + } + _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as6, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m]))); + } + + if (tag_m_8x != m) { + as7 = _mm256_setzero_ps(); + + unsigned char tail_mask_uint = (((unsigned char)0xff) >> (8-(m&7))); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_uint); + + for(BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma7 = _mm256_maskz_loadu_ps(tail_mask, &a[idx_n * lda + tag_m_8x]); + 
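+                    /* tail_mask has only the low (m & 7) bits set, so the
+                       leftover rows are handled with masked loads and FMAs
+                       instead of a scalar clean-up loop */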
+                    as7 = _mm256_maskz_fmadd_ps(tail_mask, ma7, xv, as7);
+                }
+
+                _mm256_mask_storeu_ps(&y[tag_m_8x], tail_mask, _mm256_maskz_fmadd_ps(tail_mask, as7, alphav, _mm256_maskz_loadu_ps(tail_mask, &y[tag_m_8x])));
+
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+#endif
\ No newline at end of file

From 3d4ccd2a130447eb7e0b8f5326dcd6e856fb8de9 Mon Sep 17 00:00:00 2001
From: pnp
Date: Fri, 30 Apr 2021 12:25:33 -0400
Subject: [PATCH 198/681] fix for build error

---
 kernel/x86_64/sgemv_n_4.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
index 81d495eae..bc006bf3c 100644
--- a/kernel/x86_64/sgemv_n_4.c
+++ b/kernel/x86_64/sgemv_n_4.c
@@ -42,8 +42,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "sgemv_n_microk_skylakex-8.c"
 #endif
 
-#endif
-
 #if defined(STEAMROLLER) || defined(EXCAVATOR)
 #define NBMAX 2048
 #else

From 53ee0b76bb066b928b4288e2dbb6ca25c389ea86 Mon Sep 17 00:00:00 2001
From: "H.J. Lu"
Date: Fri, 30 Apr 2021 18:01:14 -0700
Subject: [PATCH 199/681] x86: Enable Intel CET

When Intel CET is enabled, we need to include <cet.h> in assembly code
to mark Intel CET support and place _CET_ENDBR at the function entry.
---
 common.h        | 9 +++++++++
 common_x86.h    | 3 ++-
 common_x86_64.h | 3 ++-
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/common.h b/common.h
index 862e0b4db..ac795937c 100644
--- a/common.h
+++ b/common.h
@@ -416,6 +416,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246
 #include "common_alpha.h"
 #endif
 
+#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+#ifndef _CET_ENDBR
+#define _CET_ENDBR
+#endif
+
 #ifdef ARCH_X86
 #include "common_x86.h"
 #endif
diff --git a/common_x86.h b/common_x86.h
index ec928e236..bc77eca58 100644
--- a/common_x86.h
+++ b/common_x86.h
@@ -340,7 +340,8 @@ REALNAME:
 	.align 16; \
 	.globl REALNAME ;\
 	.type REALNAME, @function; \
-REALNAME:
+REALNAME: \
+	_CET_ENDBR
 
 #ifdef PROFILE
 #define PROFCODE call mcount
diff --git a/common_x86_64.h b/common_x86_64.h
index b813336c6..729a055ce 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -451,7 +451,8 @@ REALNAME:
 	.align 512; \
 	.globl REALNAME ;\
 	.type REALNAME, @function; \
-REALNAME:
+REALNAME: \
+	_CET_ENDBR
 
 #ifdef PROFILE
 #define PROFCODE call *mcount@GOTPCREL(%rip)

From 254774f5a621b7ecd3775ac0c436f6e47189b909 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 1 May 2021 13:10:16 +0200
Subject: [PATCH 200/681] Add const qualifiers

---
 lapack-netlib/LAPACKE/include/lapack.h  | 12 ++++++------
 lapack-netlib/LAPACKE/include/lapacke.h | 24 ++++++++++++------------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h
index 828d3279e..341efabda 100644
--- a/lapack-netlib/LAPACKE/include/lapack.h
+++ b/lapack-netlib/LAPACKE/include/lapack.h
@@ -4768,7 +4768,7 @@ void LAPACK_chegst(
     lapack_int const* itype, char const* uplo,
     lapack_int const* n,
     lapack_complex_float* A, lapack_int const* lda,
-    lapack_complex_float* B, lapack_int const* ldb,
+    const lapack_complex_float* B, lapack_int const* ldb,
     lapack_int* info );
 
 #define LAPACK_zhegst LAPACK_GLOBAL(zhegst,ZHEGST)
@@ -4776,7 +4776,7 @@ void LAPACK_zhegst(
     lapack_int const* itype, char const* uplo,
     lapack_int const* n,
     lapack_complex_double* A, lapack_int const* lda,
-    lapack_complex_double* B, lapack_int const* ldb,
+    const lapack_complex_double* B, lapack_int const* ldb,
     lapack_int* info );
 
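/* Why the added const matters (illustrative sketch only; the wrapper below is
   hypothetical, not part of lapack.h): xHEGST reads B, the Cholesky factor
   computed by xPOTRF, without modifying it, so a const-qualified factor can
   now be passed straight through instead of casting the qualifier away:

       void reduce_to_standard( lapack_int n,
                                lapack_complex_double* A, lapack_int lda,
                                const lapack_complex_double* Bchol, lapack_int ldb )
       {
           lapack_int itype = 1, info = 0;
           char uplo = 'U';
           LAPACK_zhegst( &itype, &uplo, &n, A, &lda, Bchol, &ldb, &info );
       }
*/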
#define LAPACK_chegv LAPACK_GLOBAL(chegv,CHEGV) @@ -11556,7 +11556,7 @@ void LAPACK_zsytrs( void LAPACK_csytrs2( char const* uplo, lapack_int const* n, lapack_int const* nrhs, - lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, + const lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, lapack_complex_float* B, lapack_int const* ldb, lapack_complex_float* work, lapack_int* info ); @@ -11565,7 +11565,7 @@ void LAPACK_csytrs2( void LAPACK_dsytrs2( char const* uplo, lapack_int const* n, lapack_int const* nrhs, - double* A, lapack_int const* lda, lapack_int const* ipiv, + const double* A, lapack_int const* lda, lapack_int const* ipiv, double* B, lapack_int const* ldb, double* work, lapack_int* info ); @@ -11574,7 +11574,7 @@ void LAPACK_dsytrs2( void LAPACK_ssytrs2( char const* uplo, lapack_int const* n, lapack_int const* nrhs, - float* A, lapack_int const* lda, lapack_int const* ipiv, + const float* A, lapack_int const* lda, lapack_int const* ipiv, float* B, lapack_int const* ldb, float* work, lapack_int* info ); @@ -11583,7 +11583,7 @@ void LAPACK_ssytrs2( void LAPACK_zsytrs2( char const* uplo, lapack_int const* n, lapack_int const* nrhs, - lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv, + const lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv, lapack_complex_double* B, lapack_int const* ldb, lapack_complex_double* work, lapack_int* info ); diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 012c104bb..b280dde0a 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -1867,11 +1867,11 @@ lapack_int LAPACKE_zheevx( int matrix_layout, char jobz, char range, char uplo, lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ); lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ); lapack_int LAPACKE_chegv( int matrix_layout, lapack_int itype, char jobz, @@ -6932,11 +6932,11 @@ lapack_int LAPACKE_zheevx_work( int matrix_layout, char jobz, char range, lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ); lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ); lapack_int LAPACKE_chegv_work( int matrix_layout, lapack_int itype, char jobz, @@ -10553,11 +10553,11 @@ lapack_int LAPACKE_csytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, lapack_complex_float* work, lapack_int nb ); lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb ); lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const 
lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work ); @@ -10718,10 +10718,10 @@ lapack_int LAPACKE_dsytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, double* work, lapack_int nb ); lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, lapack_int lda, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb ); lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb, double* work ); lapack_int LAPACKE_sbbcsd( int matrix_layout, char jobu1, char jobu2, @@ -10813,10 +10813,10 @@ lapack_int LAPACKE_ssytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, float* work, lapack_int nb ); lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, lapack_int lda, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb ); lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb, float* work ); lapack_int LAPACKE_zbbcsd( int matrix_layout, char jobu1, char jobu2, @@ -10898,11 +10898,11 @@ lapack_int LAPACKE_zsytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, lapack_complex_double* work, lapack_int nb ); lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_double* a, + lapack_int nrhs, const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb ); lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_double* a, + lapack_int nrhs, const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work ); From 5cc35abc3dfa3ae5c4466948b23cdd08ca366a1e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 May 2021 13:22:10 +0200 Subject: [PATCH 201/681] Apply MKL team fixes to the LAPACKE interfaces (Reference-LAPACK PR 534) Removed spurious checks for INFO in xLACPY,xLASET after routines not returning any,and redundant requirements for ldvt in xGESVD_WORK --- lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c | 4 +++- lapack-netlib/LAPACKE/src/lapacke_cheev_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_chegst.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_chegst_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_chegv.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_chegvd.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_chegvx.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_chetri2x.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c | 3 --- lapack-netlib/LAPACKE/src/lapacke_claset_work.c | 3 --- lapack-netlib/LAPACKE/src/lapacke_csyconv.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_csytrs2.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ctrttf.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ctrttp.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_cungtr.c | 2 +- 
lapack-netlib/LAPACKE/src/lapacke_cunmtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c | 4 +++- lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c | 3 --- lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c | 3 --- lapack-netlib/LAPACKE/src/lapacke_dorgtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dormtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsyconv.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsygst.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsygv.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_dsygvd.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_dsygvx.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dtrttf.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dtrttp.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c | 4 +++- lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c | 3 --- lapack-netlib/LAPACKE/src/lapacke_slaset_work.c | 3 --- lapack-netlib/LAPACKE/src/lapacke_sorgtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_sormtr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssyconv.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssygst.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssygv.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_ssygvd.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_ssygvx.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_strttf.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_strttp.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c | 4 +++- lapack-netlib/LAPACKE/src/lapacke_zheev_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zhegst.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zhegv.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_zhegvd.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_zhegvx.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c | 3 --- lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c | 3 --- lapack-netlib/LAPACKE/src/lapacke_zsyconv.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ztrttf.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_ztrttp.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zungtr.c | 2 +- 75 files changed, 87 insertions(+), 103 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c index 558a7f308..4256c0f04 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c @@ -56,6 +56,8 @@ lapack_int LAPACKE_cgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? 
MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -73,7 +75,7 @@ lapack_int LAPACKE_cgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_cgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_cgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c index aa78e678e..dbb2753d1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c @@ -78,7 +78,7 @@ lapack_int LAPACKE_cheev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c index d26c84785..2f25c187a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c index e8f212efb..9e8a1c4db 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegst.c b/lapack-netlib/LAPACKE/src/lapacke_chegst.c index ff7dd3532..c628017c2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegst.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ) { if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c index a29e01961..001863819 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegv.c b/lapack-netlib/LAPACKE/src/lapacke_chegv.c index 15d052987..c01525662 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_chegv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegv.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_chegv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c index 537b9450b..fc3395833 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_chegv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c index 98c901982..fe7b39cee 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c @@ -55,10 +55,10 @@ lapack_int LAPACKE_chegvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegvx.c b/lapack-netlib/LAPACKE/src/lapacke_chegvx.c index 3ba62746e..d56e3ee46 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegvx.c @@ -60,7 +60,7 @@ lapack_int LAPACKE_chegvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_s_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c b/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c index 6937752c4..fc0d4e3d2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c @@ -46,7 +46,7 @@ lapack_int LAPACKE_chetri2x( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c index 80d262626..eba359312 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_clacpy_work( int matrix_layout, char uplo, lapack_int m, if( 
matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_clacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_claset_work.c b/lapack-netlib/LAPACKE/src/lapacke_claset_work.c index 7b25815e7..1b4fed17a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_claset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_claset_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_claset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_claset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_complex_float* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_csyconv.c b/lapack-netlib/LAPACKE/src/lapacke_csyconv.c index 2eb942e4e..771395e97 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csyconv.c @@ -45,7 +45,7 @@ lapack_int LAPACKE_csyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_csy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c index 44405c993..f4a0a4334 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c index 8567a07d5..d914c1d69 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work ) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c b/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c index fd0a40c17..8ca652456 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ctrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ctr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c b/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c index c4ea703af..7b2e3a169 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ctrttp( int matrix_layout, char uplo, lapack_int n, #ifndef 
LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ctr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_cungtr.c b/lapack-netlib/LAPACKE/src/lapacke_cungtr.c index ddae70345..faa3ef6d3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cungtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cungtr.c @@ -48,7 +48,7 @@ lapack_int LAPACKE_cungtr( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_c_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c index d9fb2dca0..71ad23f2f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c @@ -52,7 +52,7 @@ lapack_int LAPACKE_cunmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ r = LAPACKE_lsame( side, 'l' ) ? m : n; - if( LAPACKE_cge_nancheck( matrix_layout, r, r, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, r, a, lda ) ) { return -7; } if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c index 7dbc9bb88..671def1df 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c @@ -54,6 +54,8 @@ lapack_int LAPACKE_dgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? 
n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -71,7 +73,7 @@ lapack_int LAPACKE_dgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_dgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_dgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c index f1a505486..88f4489a3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_dlacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_dlacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c index 4b59fe627..f1444b5e2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_dlaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_dlaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); double* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c b/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c index 86184b784..587805de6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_dorgtr( int matrix_layout, char uplo, lapack_int n, double* a #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_d_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c index db75a6609..0b1c54b9b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c @@ -51,7 +51,7 @@ lapack_int LAPACKE_dormtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ r = LAPACKE_lsame( side, 'l' ) ? 
m : n; - if( LAPACKE_dge_nancheck( matrix_layout, r, r, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, r, a, lda ) ) { return -7; } if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c b/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c index cca9be489..36ff7c40c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_dsyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c index f696c608f..78f9e80ed 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c @@ -72,7 +72,7 @@ lapack_int LAPACKE_dsyev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c index 6f9c02f6a..d68989aa6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c index 81ba2acb3..25d075d46 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygst.c b/lapack-netlib/LAPACKE/src/lapacke_dsygst.c index 800a30b24..69b90e758 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygst.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_dsygst( int matrix_layout, lapack_int itype, char uplo, if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygv.c b/lapack-netlib/LAPACKE/src/lapacke_dsygv.c index 533b6a446..4ece69794 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygv.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_dsygv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* 
Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c index 974b63e54..0016a7d06 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_dsygv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c b/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c index 51f333359..0db0cfa67 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c @@ -51,10 +51,10 @@ lapack_int LAPACKE_dsygvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c b/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c index 02d54d7fa..54fa6ff36 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_dsygvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_d_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c index 4d73ef3c1..46c90190f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, lapack_int lda, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c index caffa5b4b..c937c39c5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb, double* work ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c b/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c index 66d1e5a2c..de379a970 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_dtrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dtr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c b/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c index 89f01dc95..d17593471 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_dtrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dtr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c index 9dc5509c9..941d83cad 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c @@ -54,6 +54,8 @@ lapack_int LAPACKE_sgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -71,7 +73,7 @@ lapack_int LAPACKE_sgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_sgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_sgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c index e60167001..cdec2c967 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_slacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_slacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c index c89c9a6e1..4f2fa7b67 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_slaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_slaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); float* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c b/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c index 90dc435c9..804b7f8ef 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_sorgtr( int matrix_layout, char uplo, 
lapack_int n, float* a, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_s_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c index 9f0e9fddf..6ffe144cc 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c @@ -51,7 +51,7 @@ lapack_int LAPACKE_sormtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ r = LAPACKE_lsame( side, 'l' ) ? m : n; - if( LAPACKE_sge_nancheck( matrix_layout, r, r, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, r, a, lda ) ) { return -7; } if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c b/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c index 5fd0a78c5..ac41a354d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_ssyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c index abd62ddf3..1889a337c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c @@ -72,7 +72,7 @@ lapack_int LAPACKE_ssyev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c index d9fe47599..faadc92f1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_ssyevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c index bfbf49aee..434b52c01 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_ssyevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygst.c b/lapack-netlib/LAPACKE/src/lapacke_ssygst.c index 7b97f472b..4fb55960c 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_ssygst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygst.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_ssygst( int matrix_layout, lapack_int itype, char uplo, if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygv.c b/lapack-netlib/LAPACKE/src/lapacke_ssygv.c index 8ec40d954..f139de1ab 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygv.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_ssygv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c index a2eba6653..195fb1e54 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_ssygv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c b/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c index 5afe8d2de..e33ce2a7b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c @@ -51,10 +51,10 @@ lapack_int LAPACKE_ssygvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c b/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c index 1fe4e2c6c..8ffd9dc40 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_ssygvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_s_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c index 19f447cd8..a95a71469 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, lapack_int lda, + 
lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c index 7d348b382..cf98f443d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb, float* work ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_strttf.c b/lapack-netlib/LAPACKE/src/lapacke_strttf.c index fee7ab9ae..e3304fbe7 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_strttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_strttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_strttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_str_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_strttp.c b/lapack-netlib/LAPACKE/src/lapacke_strttp.c index 6c4b84aa3..2df79eb05 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_strttp.c +++ b/lapack-netlib/LAPACKE/src/lapacke_strttp.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_strttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_str_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c index 2d7c2b6f3..da73cd479 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c @@ -56,6 +56,8 @@ lapack_int LAPACKE_zgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? 
n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -73,7 +75,7 @@ lapack_int LAPACKE_zgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_zgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_zgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c index d4e93aed2..8b7aa3518 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c @@ -78,7 +78,7 @@ lapack_int LAPACKE_zheev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c index fb33c3e2a..840c53876 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_zheevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c index 5af2a1269..b8509e04f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_zheevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegst.c b/lapack-netlib/LAPACKE/src/lapacke_zhegst.c index 8c4a5c374..aa2d84d84 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegst.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ) { if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c b/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c index 62fce1f27..f77894204 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegv.c b/lapack-netlib/LAPACKE/src/lapacke_zhegv.c index 683fcf487..587e2d4be 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegv.c @@ -50,10 +50,10 @@ lapack_int 
LAPACKE_zhegv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c index 0f1b415a9..43569d99e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_zhegv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c b/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c index 1242a0eda..c287595ad 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c @@ -55,10 +55,10 @@ lapack_int LAPACKE_zhegvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c b/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c index 492bc4dad..83f2bda2e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c @@ -61,7 +61,7 @@ lapack_int LAPACKE_zhegvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_d_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c b/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c index a07bc8d52..15a8cc576 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c @@ -46,7 +46,7 @@ lapack_int LAPACKE_zhetri2x( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c index bb4e57b1e..fe36ed811 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_zlacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_zlacpy( &uplo, &m, &n, a, &lda, b, &ldb ); 
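/* A minimal sketch, not part of the patch, of the rationale behind the
 * nancheck changes above: the symmetric/Hermitian drivers only read the
 * triangle selected by uplo, so scanning the full n-by-n array with
 * LAPACKE_*ge_nancheck could reject inputs whose unreferenced triangle
 * legitimately holds uninitialized data. The helper below is a
 * hypothetical, simplified column-major double-precision variant of the
 * triangle-aware check; it is illustrative only, not LAPACKE code. */
#include <math.h>
#include <stddef.h>

static int tri_has_nan( char uplo, int n, const double* a, int lda )
{
    int i, j;
    for( j = 0; j < n; j++ ) {
        /* Scan rows 0..j for 'U', rows j..n-1 for 'L'; the opposite
           triangle is never inspected, matching what the solver reads. */
        int first = ( uplo == 'L' || uplo == 'l' ) ? j : 0;
        int last  = ( uplo == 'L' || uplo == 'l' ) ? n - 1 : j;
        for( i = first; i <= last; i++ ) {
            if( isnan( a[i + (size_t)j * lda] ) ) return 1;
        }
    }
    return 0;
}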
- if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c index 9056e8fca..ecb6cba25 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_zlaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_zlaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_complex_double* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c b/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c index 2826efa53..074b15303 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c @@ -45,7 +45,7 @@ lapack_int LAPACKE_zsyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c index 7442702aa..3c85f9796 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_double* a, + lapack_int nrhs, const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c index ec05ce6d5..cdc97fa02 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, - lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work ) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c b/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c index 8a5dfc271..8e8789ec6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ztrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ztr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c b/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c index 5dcf633bb..bd8485108 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ztrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( 
LAPACKE_ztr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zungtr.c b/lapack-netlib/LAPACKE/src/lapacke_zungtr.c index 51785347e..adfaa7db9 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zungtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zungtr.c @@ -48,7 +48,7 @@ lapack_int LAPACKE_zungtr( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_z_nancheck( n-1, tau, 1 ) ) { From 904b221f03a986be12b30bc21c872eaa79a6427e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 May 2021 14:47:22 +0200 Subject: [PATCH 202/681] Add cast to prevent overflow of intermediate result --- interface/imatcopy.c | 4 ++-- interface/zimatcopy.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 93ffd69f9..91975f7f4 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -150,9 +150,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif if ( *lda > *ldb ) - msize = (*lda) * (*ldb) * sizeof(FLOAT); + msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT); else - msize = (*ldb) * (*ldb) * sizeof(FLOAT); + msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT); b = malloc(msize); if ( b == NULL ) diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index 87964e20d..ecda5ef4e 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -172,9 +172,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif if ( *lda > *ldb ) - msize = (*lda) * (*ldb) * sizeof(FLOAT) * 2; + msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2; else - msize = (*ldb) * (*ldb) * sizeof(FLOAT) * 2; + msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2; b = malloc(msize); if ( b == NULL ) From 98ebc8ac5987af4ef44618d95e34ae122ec24c20 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 May 2021 14:48:19 +0200 Subject: [PATCH 203/681] Add casts to prevent overflow of intermediate result --- ctest/c_cblas2.c | 44 +++++++++++++++--------------- ctest/c_dblas2.c | 42 ++++++++++++++--------------- ctest/c_dblas3.c | 46 +++++++++++++++---------------- ctest/c_sblas2.c | 42 ++++++++++++++--------------- ctest/c_sblas3.c | 46 +++++++++++++++---------------- ctest/c_zblas2.c | 44 +++++++++++++++--------------- ctest/c_zblas3.c | 70 ++++++++++++++++++++++++------------------------ 7 files changed, 167 insertions(+), 167 deletions(-) diff --git a/ctest/c_cblas2.c b/ctest/c_cblas2.c index 057096f32..6511e5271 100644 --- a/ctest/c_cblas2.c +++ b/ctest/c_cblas2.c @@ -20,7 +20,7 @@ void F77_cgemv(int *order, char *transp, int *m, int *n, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_COMPLEX) ); + A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -50,7 +50,7 @@ void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *ku+*kl+2; - A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); 
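/* A minimal sketch, not part of the patch, of the overflow these
 * (size_t) casts prevent: both operands of the multiplication are int,
 * so without the cast the product is computed in (typically 32-bit) int
 * arithmetic and wraps before it is ever widened for malloc. Casting
 * the first operand forces the whole chain into size_t. The leading
 * dimensions below are illustrative assumptions. */
#include <stddef.h>
#include <stdio.h>

int main( void )
{
    int lda = 60000, ldb = 60000;  /* hypothetical large leading dimensions */
    /* Without the cast, lda * ldb would overflow int (3.6e9 > INT_MAX)
       before sizeof() promotes the result, handing malloc garbage. */
    size_t msize = (size_t)lda * ldb * sizeof(double); /* 28.8e9 bytes, correct */
    printf( "msize = %zu bytes\n", msize );
    return 0;
}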
for( i=0; i<*ku; i++ ){ irow=*ku+*kl-i; jcol=(*ku)-i; @@ -94,7 +94,7 @@ void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -122,7 +122,7 @@ void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -154,7 +154,7 @@ void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = (CBLAS_TEST_COMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A = (CBLAS_TEST_COMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA; *incx, beta, y, *incy ); else { LDA = *k+2; - A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -251,8 +251,8 @@ void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, beta, y, *incy); else { LDA = *n; - A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); - AP = (CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* + A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); + AP = (CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -311,7 +311,7 @@ void F77_ctbmv(int *order, char *uplow, char *transp, char *diagn, x, *incx); else { LDA = *k+2; - A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -375,7 +375,7 @@ void F77_ctbsv(int *order, char *uplow, char *transp, char *diagn, *incx); else { LDA = *k+2; - A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX )); + A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -436,8 +436,8 @@ void F77_ctpmv(int *order, char *uplow, char *transp, char *diagn, cblas_ctpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); - AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)* + A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); + AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)* sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -491,8 +491,8 @@ void F77_ctpsv(int *order, char *uplow, char *transp, char *diagn, cblas_ctpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); - AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)* + A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); + 
AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)* sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -544,7 +544,7 @@ void F77_ctrmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA=*n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -573,7 +573,7 @@ void F77_ctrsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A =(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A =(CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -601,8 +601,8 @@ void F77_chpr(int *order, char *uplow, int *n, float *alpha, cblas_chpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap ); else { LDA = *n; - A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); - AP = ( CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* + A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + AP = ( CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -678,8 +678,8 @@ void F77_chpr2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, *incy, ap ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); - AP=(CBLAS_TEST_COMPLEX*)malloc( (((LDA+1)*LDA)/2)* + A=(CBLAS_TEST_COMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + AP=(CBLAS_TEST_COMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -750,7 +750,7 @@ void F77_cher(int *order, char *uplow, int *n, float *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_COMPLEX )); + A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX )); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { @@ -784,7 +784,7 @@ void F77_cher2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { diff --git a/ctest/c_dblas2.c b/ctest/c_dblas2.c index 423a58748..ae3854c0e 100644 --- a/ctest/c_dblas2.c +++ b/ctest/c_dblas2.c @@ -19,7 +19,7 @@ void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -43,7 +43,7 @@ void F77_dger(int *order, int *m, int *n, double *alpha, double *x, int *incx, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) { for( j=0; j<*n; j++ ) @@ -74,7 +74,7 @@ void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( 
double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -102,7 +102,7 @@ void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -123,7 +123,7 @@ void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -146,7 +146,7 @@ void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -170,7 +170,7 @@ void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -196,7 +196,7 @@ void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, if (*order == TEST_ROW_MJR) { LDA = *ku+*kl+2; - A = ( double* )malloc( (*n+*kl)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*kl)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*ku; i++ ){ irow=*ku+*kl-i; jcol=(*ku)-i; @@ -236,7 +236,7 @@ void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -282,7 +282,7 @@ void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -325,7 +325,7 @@ void F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -369,8 +369,8 @@ void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap, if (*order == TEST_ROW_MJR) { LDA = *n; - A = ( double* )malloc( LDA*LDA*sizeof( double ) ); - AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) ); + A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) ); + AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) ); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) for( i=0; i Date: Sat, 1 May 2021 21:31:13 +0200 Subject: [PATCH 204/681] Fix possible division by zero in xTGSJA (Reference-LAPACK PR502) --- lapack-netlib/SRC/ctgsja.f | 9 +++++---- lapack-netlib/SRC/dtgsja.f | 9 +++++---- lapack-netlib/SRC/stgsja.f | 9 +++++---- lapack-netlib/SRC/ztgsja.f | 9 +++++---- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/lapack-netlib/SRC/ctgsja.f b/lapack-netlib/SRC/ctgsja.f index 38a61068e..c96cbe022 100644 --- 
a/lapack-netlib/SRC/ctgsja.f +++ b/lapack-netlib/SRC/ctgsja.f @@ -401,7 +401,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - REAL ZERO, ONE + REAL ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) COMPLEX CZERO, CONE PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), @@ -424,7 +424,8 @@ $ SLARTG, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC ABS, CONJG, MAX, MIN, REAL + INTRINSIC ABS, CONJG, MAX, MIN, REAL, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -610,9 +611,9 @@ * A1 = REAL( A( K+I, N-L+I ) ) B1 = REAL( B( I, N-L+I ) ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * IF( GAMMA.LT.ZERO ) THEN CALL CSSCAL( L-I+1, -ONE, B( I, N-L+I ), LDB ) diff --git a/lapack-netlib/SRC/dtgsja.f b/lapack-netlib/SRC/dtgsja.f index 66f32b790..537bd3f4f 100644 --- a/lapack-netlib/SRC/dtgsja.f +++ b/lapack-netlib/SRC/dtgsja.f @@ -400,7 +400,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - DOUBLE PRECISION ZERO, ONE + DOUBLE PRECISION ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. @@ -419,7 +419,8 @@ $ DSCAL, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC ABS, MAX, MIN + INTRINSIC ABS, MAX, MIN, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -596,9 +597,9 @@ * A1 = A( K+I, N-L+I ) B1 = B( I, N-L+I ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * * change sign if necessary * diff --git a/lapack-netlib/SRC/stgsja.f b/lapack-netlib/SRC/stgsja.f index 2a6fc354d..7324da431 100644 --- a/lapack-netlib/SRC/stgsja.f +++ b/lapack-netlib/SRC/stgsja.f @@ -400,7 +400,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - REAL ZERO, ONE + REAL ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. @@ -419,7 +419,8 @@ $ SSCAL, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC ABS, MAX, MIN + INTRINSIC ABS, MAX, MIN, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -596,9 +597,9 @@ * A1 = A( K+I, N-L+I ) B1 = B( I, N-L+I ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * * change sign if necessary * diff --git a/lapack-netlib/SRC/ztgsja.f b/lapack-netlib/SRC/ztgsja.f index 851f6504a..c80e33158 100644 --- a/lapack-netlib/SRC/ztgsja.f +++ b/lapack-netlib/SRC/ztgsja.f @@ -401,7 +401,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - DOUBLE PRECISION ZERO, ONE + DOUBLE PRECISION ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) COMPLEX*16 CZERO, CONE PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), @@ -424,7 +424,8 @@ $ ZLASET, ZROT * .. * .. Intrinsic Functions .. - INTRINSIC ABS, DBLE, DCONJG, MAX, MIN + INTRINSIC ABS, DBLE, DCONJG, MAX, MIN, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. 
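*     A minimal sketch, not part of the patch, of how the new guard
*     behaves, assuming IEEE arithmetic so that B1/A1 with A1 = 0 yields
*     an infinity (or a NaN when B1 is also zero) rather than trapping.
*     Inf and NaN both fail the range test against HUGENUM, so the
*     fallback branch is reached exactly as under the old A1.NE.ZERO
*     test, and quotients that overflow for nonzero A1 are now caught
*     as well:
*
      PROGRAM TGUARD
      DOUBLE PRECISION A1, B1, GAMMA, HUGENUM
      PARAMETER ( HUGENUM = HUGE( 0.0D+0 ) )
      A1 = 0.0D+0
      B1 = 1.0D+0
      GAMMA = B1 / A1
      IF( ( GAMMA.LE.HUGENUM ).AND.( GAMMA.GE.-HUGENUM ) ) THEN
         PRINT *, 'finite quotient: ', GAMMA
      ELSE
         PRINT *, 'non-finite quotient, taking the fallback branch'
      END IF
      END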
* @@ -610,9 +611,9 @@ * A1 = DBLE( A( K+I, N-L+I ) ) B1 = DBLE( B( I, N-L+I ) ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * IF( GAMMA.LT.ZERO ) THEN CALL ZDSCAL( L-I+1, -ONE, B( I, N-L+I ), LDB ) From d77d9bc920affa2f2d3e0a5479cb05c676a9246e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 May 2021 11:24:50 +0200 Subject: [PATCH 205/681] Handle norm NaN value (Reference LAPACK PR471) --- lapack-netlib/SRC/cgesdd.f | 8 ++++++-- lapack-netlib/SRC/dgesdd.f | 8 ++++++-- lapack-netlib/SRC/sgesdd.f | 8 ++++++-- lapack-netlib/SRC/zgesdd.f | 8 ++++++-- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/SRC/cgesdd.f b/lapack-netlib/SRC/cgesdd.f index 07341593f..34a80beea 100644 --- a/lapack-netlib/SRC/cgesdd.f +++ b/lapack-netlib/SRC/cgesdd.f @@ -281,9 +281,9 @@ $ CUNGQR, CUNMBR, SBDSDC, SLASCL, XERBLA * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, SISNAN REAL SLAMCH, CLANGE - EXTERNAL LSAME, SLAMCH, CLANGE + EXTERNAL LSAME, SLAMCH, CLANGE, SISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -647,6 +647,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = CLANGE( 'M', M, N, A, LDA, DUM ) + IF( SISNAN ( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN ISCL = 1 diff --git a/lapack-netlib/SRC/dgesdd.f b/lapack-netlib/SRC/dgesdd.f index 0218900d2..80d18041c 100644 --- a/lapack-netlib/SRC/dgesdd.f +++ b/lapack-netlib/SRC/dgesdd.f @@ -267,9 +267,9 @@ $ XERBLA * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, DISNAN DOUBLE PRECISION DLAMCH, DLANGE - EXTERNAL DLAMCH, DLANGE, LSAME + EXTERNAL DLAMCH, DLANGE, LSAME, DISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -599,6 +599,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = DLANGE( 'M', M, N, A, LDA, DUM ) + IF( DISNAN( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN ISCL = 1 diff --git a/lapack-netlib/SRC/sgesdd.f b/lapack-netlib/SRC/sgesdd.f index 689494dd1..89e03a002 100644 --- a/lapack-netlib/SRC/sgesdd.f +++ b/lapack-netlib/SRC/sgesdd.f @@ -267,9 +267,9 @@ $ XERBLA * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, SISNAN REAL SLAMCH, SLANGE - EXTERNAL SLAMCH, SLANGE, LSAME + EXTERNAL SLAMCH, SLANGE, LSAME, SISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -599,6 +599,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = SLANGE( 'M', M, N, A, LDA, DUM ) + IF( SISNAN( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN ISCL = 1 diff --git a/lapack-netlib/SRC/zgesdd.f b/lapack-netlib/SRC/zgesdd.f index bb9d2c26e..2209f4733 100644 --- a/lapack-netlib/SRC/zgesdd.f +++ b/lapack-netlib/SRC/zgesdd.f @@ -281,9 +281,9 @@ $ ZLASET, ZUNGBR, ZUNGLQ, ZUNGQR, ZUNMBR * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, DISNAN DOUBLE PRECISION DLAMCH, ZLANGE - EXTERNAL LSAME, DLAMCH, ZLANGE + EXTERNAL LSAME, DLAMCH, ZLANGE, DISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -647,6 +647,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = ZLANGE( 'M', M, N, A, LDA, DUM ) + IF( DISNAN( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. 
ANRM.LT.SMLNUM ) THEN ISCL = 1 From c26780d4510447ca101bfc44a3ad87018a3e9d8a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 May 2021 11:40:56 +0200 Subject: [PATCH 206/681] Initialize X and Y to zero for N=0 (Reference-LAPACK PR463) --- lapack-netlib/SRC/cggglm.f | 11 +++++++++-- lapack-netlib/SRC/dggglm.f | 11 +++++++++-- lapack-netlib/SRC/sggglm.f | 11 +++++++++-- lapack-netlib/SRC/zggglm.f | 11 +++++++++-- 4 files changed, 36 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/SRC/cggglm.f b/lapack-netlib/SRC/cggglm.f index 336f41909..9c8e0eec3 100644 --- a/lapack-netlib/SRC/cggglm.f +++ b/lapack-netlib/SRC/cggglm.f @@ -271,8 +271,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = CZERO + END DO + DO I = 1, P + Y(I) = CZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/dggglm.f b/lapack-netlib/SRC/dggglm.f index 2e92912e0..1fbdc8add 100644 --- a/lapack-netlib/SRC/dggglm.f +++ b/lapack-netlib/SRC/dggglm.f @@ -270,8 +270,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = ZERO + END DO + DO I = 1, P + Y(I) = ZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/sggglm.f b/lapack-netlib/SRC/sggglm.f index fe63da5f5..572ee511d 100644 --- a/lapack-netlib/SRC/sggglm.f +++ b/lapack-netlib/SRC/sggglm.f @@ -270,8 +270,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = ZERO + END DO + DO I = 1, P + Y(I) = ZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/zggglm.f b/lapack-netlib/SRC/zggglm.f index d6a30cee7..d4adc5c4d 100644 --- a/lapack-netlib/SRC/zggglm.f +++ b/lapack-netlib/SRC/zggglm.f @@ -271,8 +271,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = CZERO + END DO + DO I = 1, P + Y(I) = CZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * From 4bf00da8fbd85a3478086c822e8df4606ecefdc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 May 2021 12:18:17 +0200 Subject: [PATCH 207/681] Avoid allocating the transposed triangular matrix (Reference-LAPACK PR382) --- .../LAPACKE/src/lapacke_clantr_work.c | 39 ++++++++++--------- .../LAPACKE/src/lapacke_dlantr_work.c | 38 +++++++++--------- .../LAPACKE/src/lapacke_slantr_work.c | 38 +++++++++--------- .../LAPACKE/src/lapacke_zlantr_work.c | 39 ++++++++++--------- 4 files changed, 80 insertions(+), 74 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c index 8c4c21935..4779f10d2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c @@ -41,45 +41,46 @@ float LAPACKE_clantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; float res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - lapack_complex_float* a_t = NULL; float* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_clantr_work", info ); return info; } - /* 
Allocate memory for temporary array(s) */ - a_t = (lapack_complex_float*) - LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_ctr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_clantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_clantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c index 5b2a6c535..9c9b0ea8b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c @@ -40,44 +40,46 @@ double LAPACKE_dlantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; double res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - double* a_t = NULL; double* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_dlantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_dtr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_dlantr( &norm_lapack, &uplo_lapack, &diag, 
&n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_dlantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c index e1d4c270d..f77abef2c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c @@ -40,44 +40,46 @@ float LAPACKE_slantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; float res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - float* a_t = NULL; float* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_slantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_str_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_slantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_slantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c index e62f8a4e3..cccc4053e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c @@ -41,45 +41,46 @@ double LAPACKE_zlantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; double res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - lapack_complex_double* a_t = NULL; double* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_zlantr_work", info ); return info; } - /* Allocate memory for 
temporary array(s) */ - a_t = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_ztr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_zlantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_zlantr_work", info ); } } else { From 40000d1f64cc7297305cee6267dbb0e5e45df6d5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 May 2021 19:21:59 +0200 Subject: [PATCH 208/681] Add entries for Householder reconstruction functions from 3.9.1 --- cmake/lapack.cmake | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 73f2592ef..0e45d4c63 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -66,7 +66,7 @@ set(SLASRC slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f - slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f + slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f slarrv.f slartv.f slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f @@ -112,14 +112,14 @@ set(SLASRC sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f stpqrt.f stpqrt2.f stpmqrt.f stprfb.f sgelqt.f sgelqt3.f sgemlqt.f - sgetsls.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f + sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f sgelq.f slaswlq.f slamswlq.f sgemlq.f stplqt.f stplqt2.f stpmlqt.f ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f sgesvdq.f slaorhr_col_getrfnp.f - slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f ) + slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f ) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f @@ -171,7 +171,7 @@ set(CLASRC claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f - clarf.f clarfb.f clarfg.f clarfgp.f clarft.f + clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f clartv.f 
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f @@ -209,14 +209,14 @@ set(CLASRC cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f cgelqt.f cgelqt3.f cgemlqt.f - cgetsls.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f + cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f cgelq.f claswlq.f clamswlq.f cgemlq.f ctplqt.f ctplqt2.f ctpmlqt.f chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f - cungtsqr.f cunhr_col.f ) + cungtsqr.f cungtsqr_row.f cunhr_col.f ) set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f @@ -253,7 +253,7 @@ set(DLASRC dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f - dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f + dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlargv.f dlarrv.f dlartv.f dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f @@ -300,14 +300,14 @@ set(DLASRC dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f dgelqt.f dgelqt3.f dgemlqt.f - dgetsls.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f + dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f dgelq.f dlaswlq.f dlamswlq.f dgemlq.f dtplqt.f dtplqt2.f dtpmlqt.f dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f - dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f ) + dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f ) set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f @@ -360,7 +360,7 @@ set(ZLASRC zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f - zlarcm.f zlarf.f zlarfb.f + zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f zlarfg.f zlarfgp.f zlarft.f zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f @@ -402,13 +402,13 @@ set(ZLASRC ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f ztplqt.f ztplqt2.f ztpmlqt.f zgelqt.f zgelqt3.f zgemlqt.f - zgetsls.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f + zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f zgelq.f zlaswlq.f zlamswlq.f zgemlq.f zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f - zungtsqr.f zunhr_col.f) + zungtsqr.f zungtsqr_row.f zunhr_col.f) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f From 4c1d47098bcd5d2c06e01d065068aa57d62a2953 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 May 2021 19:25:43 +0200 Subject: [PATCH 209/681] Add new files for Householder reconstruction functions from 3.9.1 --- lapack-netlib/SRC/Makefile | 22 +- lapack-netlib/SRC/cgetsqrhrt.f | 349 ++++++++++++++++++ lapack-netlib/SRC/clarfb_gett.f 
| 597 +++++++++++++++++++++++++++++++ lapack-netlib/SRC/cungtsqr_row.f | 380 ++++++++++++++++++++ lapack-netlib/SRC/dgetsqrhrt.f | 349 ++++++++++++++++++ lapack-netlib/SRC/dlarfb_gett.f | 596 ++++++++++++++++++++++++++++++ lapack-netlib/SRC/dorgtsqr_row.f | 379 ++++++++++++++++++++ lapack-netlib/SRC/sgetsqrhrt.f | 349 ++++++++++++++++++ lapack-netlib/SRC/slarfb_gett.f | 596 ++++++++++++++++++++++++++++++ lapack-netlib/SRC/sorgtsqr_row.f | 379 ++++++++++++++++++++ lapack-netlib/SRC/zgetsqrhrt.f | 349 ++++++++++++++++++ lapack-netlib/SRC/zlarfb_gett.f | 597 +++++++++++++++++++++++++++++++ lapack-netlib/SRC/zungtsqr_row.f | 380 ++++++++++++++++++++ 13 files changed, 5311 insertions(+), 11 deletions(-) create mode 100644 lapack-netlib/SRC/cgetsqrhrt.f create mode 100644 lapack-netlib/SRC/clarfb_gett.f create mode 100644 lapack-netlib/SRC/cungtsqr_row.f create mode 100644 lapack-netlib/SRC/dgetsqrhrt.f create mode 100644 lapack-netlib/SRC/dlarfb_gett.f create mode 100644 lapack-netlib/SRC/dorgtsqr_row.f create mode 100644 lapack-netlib/SRC/sgetsqrhrt.f create mode 100644 lapack-netlib/SRC/slarfb_gett.f create mode 100644 lapack-netlib/SRC/sorgtsqr_row.f create mode 100644 lapack-netlib/SRC/zgetsqrhrt.f create mode 100644 lapack-netlib/SRC/zlarfb_gett.f create mode 100644 lapack-netlib/SRC/zungtsqr_row.f diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 83baac875..470b5326e 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -135,14 +135,14 @@ SLASRC_O = \ slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ - slarf.o slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \ + slarf.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \ slarrv.o slartv.o \ slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \ slasyf_rk.o \ slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o \ slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \ sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \ - sorgrq.o sorgtr.o sorgtsqr.o sorm2l.o sorm2r.o sorm22.o \ + sorgrq.o sorgtr.o sorgtsqr.o sorgtsqr_row.o sorm2l.o sorm2r.o sorm22.o \ sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \ sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \ spbstf.o spbsv.o spbsvx.o \ @@ -181,7 +181,7 @@ SLASRC_O = \ sgeqrt.o sgeqrt2.o sgeqrt3.o sgemqrt.o \ stpqrt.o stpqrt2.o stpmqrt.o stprfb.o \ sgelqt.o sgelqt3.o sgemlqt.o \ - sgetsls.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \ + sgetsls.o sgetsqrhrt.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \ sgelq.o slaswlq.o slamswlq.o sgemlq.o \ stplqt.o stplqt2.o stpmlqt.o \ sorhr_col.o slaorhr_col_getrfnp.o slaorhr_col_getrfnp2.o \ @@ -250,7 +250,7 @@ CLASRC_O = \ claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \ claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \ claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \ - clarf.o clarfb.o clarfg.o clarft.o clarfgp.o \ + clarf.o clarfb.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \ clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \ claswp.o clasyf.o clasyf_rook.o clasyf_rk.o clasyf_aa.o \ @@ -278,7 +278,7 @@ CLASRC_O = \ ctptrs.o ctrcon.o ctrevc.o ctrevc3.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrzf.o cung2l.o cung2r.o \ cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o 
cungr2.o \ - cungrq.o cungtr.o cungtsqr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \ + cungrq.o cungtr.o cungtsqr.o cungtsqr_row.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \ cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o \ chfrk.o ctfttp.o clanhf.o cpftrf.o cpftri.o cpftrs.o ctfsm.o ctftri.o \ @@ -342,14 +342,14 @@ DLASRC_O = \ dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ - dlarf.o dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ + dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ dlargv.o dlarrv.o dlartv.o \ dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o \ dlasyf.o dlasyf_rook.o dlasyf_rk.o \ dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlauu2.o \ dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \ - dorgrq.o dorgtr.o dorgtsqr.o dorm2l.o dorm2r.o dorm22.o \ + dorgrq.o dorgtr.o dorgtsqr.o dorgtsqr_row.o dorm2l.o dorm2r.o dorm22.o \ dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \ dormr3.o dormrq.o dormrz.o dormtr.o dpbcon.o dpbequ.o dpbrfs.o \ dpbstf.o dpbsv.o dpbsvx.o \ @@ -389,7 +389,7 @@ DLASRC_O = \ dgeqrt.o dgeqrt2.o dgeqrt3.o dgemqrt.o \ dtpqrt.o dtpqrt2.o dtpmqrt.o dtprfb.o \ dgelqt.o dgelqt3.o dgemlqt.o \ - dgetsls.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \ + dgetsls.o dgetsqrhrt.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \ dgelq.o dlaswlq.o dlamswlq.o dgemlq.o \ dtplqt.o dtplqt2.o dtpmlqt.o \ dorhr_col.o dlaorhr_col_getrfnp.o dlaorhr_col_getrfnp2.o \ @@ -455,7 +455,7 @@ ZLASRC_O = \ zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \ zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \ - zlarcm.o zlarf.o zlarfb.o \ + zlarcm.o zlarf.o zlarfb.o zlarfb_gett.o \ zlarfg.o zlarft.o zlarfgp.o \ zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ @@ -484,7 +484,7 @@ ZLASRC_O = \ ztptrs.o ztrcon.o ztrevc.o ztrevc3.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrzf.o zung2l.o \ zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ - zungrq.o zungtr.o zungtsqr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \ + zungrq.o zungtr.o zungtsqr.o zungtsqr_row.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \ zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ zunmtr.o zupgtr.o \ zupmtr.o izmax1.o dzsum1.o zstemr.o \ @@ -498,7 +498,7 @@ ZLASRC_O = \ ztpqrt.o ztpqrt2.o ztpmqrt.o ztprfb.o \ ztplqt.o ztplqt2.o ztpmlqt.o \ zgelqt.o zgelqt3.o zgemlqt.o \ - zgetsls.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \ + zgetsls.o zgetsqrhrt.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \ zgelq.o zlaswlq.o zlamswlq.o zgemlq.o \ zunhr_col.o zlaunhr_col_getrfnp.o zlaunhr_col_getrfnp2.o \ zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ diff --git a/lapack-netlib/SRC/cgetsqrhrt.f b/lapack-netlib/SRC/cgetsqrhrt.f new file mode 100644 index 000000000..4e4dc1d4a --- /dev/null +++ b/lapack-netlib/SRC/cgetsqrhrt.f @@ -0,0 +1,349 @@ +*> \brief \b CGETSQRHRT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGETSQRHRT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> 
[TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
+*      $                       LWORK, INFO )
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX           A( LDA, * ), T( LDT, * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CGETSQRHRT computes a NB2-sized column blocked QR-factorization
+*> of a complex M-by-N matrix A with M >= N,
+*>
+*>    A = Q * R.
+*>
+*> The routine uses internally a NB1-sized column blocked and MB1-sized
+*> row blocked TSQR-factorization and performs the reconstruction
+*> of the Householder vectors from the TSQR output. The routine also
+*> converts the R_tsqr factor from the TSQR-factorization output into
+*> the R factor that corresponds to the Householder QR-factorization,
+*>
+*>    A = Q_tsqr * R_tsqr = Q * R.
+*>
+*> The output Q and R factors are stored in the same format as in CGEQRT
+*> (Q is in blocked compact WY-representation). See the documentation
+*> of CGEQRT for more details on the format.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB1
+*> \verbatim
+*>          MB1 is INTEGER
+*>          The row block size to be used in the blocked TSQR.
+*>          MB1 > N.
+*> \endverbatim
+*>
+*> \param[in] NB1
+*> \verbatim
+*>          NB1 is INTEGER
+*>          The column block size to be used in the blocked TSQR.
+*>          N >= NB1 >= 1.
+*> \endverbatim
+*>
+*> \param[in] NB2
+*> \verbatim
+*>          NB2 is INTEGER
+*>          The block size to be used in the blocked QR that is
+*>          output. NB2 >= 1.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>
+*>          On entry: an M-by-N matrix A.
+*>
+*>          On exit:
+*>           a) the elements on and above the diagonal
+*>              of the array contain the N-by-N upper-triangular
+*>              matrix R corresponding to the Householder QR;
+*>           b) the elements below the diagonal represent Q by
+*>              the columns of blocked V (compact WY-representation).
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] T
+*> \verbatim
+*>          T is COMPLEX array, dimension (LDT,N)
+*>          The upper triangular block reflectors stored in compact form
+*>          as a sequence of upper triangular blocks.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= NB2.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
+*>          where
+*>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
+*>             NB1LOCAL = MIN(NB1,N).
+*>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
+*>             LW1 = NB1LOCAL * N,
+*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
+*>          If LWORK = -1, then a workspace query is assumed.
+*>          The routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
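+*>
+*>          An illustrative workspace-query sketch (the dimensions
+*>          M=1000, N=64, MB1=128, NB1=NB2=32 are placeholders chosen
+*>          to satisfy the constraints above):
+*>
+*>             COMPLEX              A( 1000, 64 ), T( 32, 64 ), WQ( 1 )
+*>             COMPLEX, ALLOCATABLE :: WORK( : )
+*>             INTEGER              INFO, LWORK
+*>             CALL CGETSQRHRT( 1000, 64, 128, 32, 32, A, 1000, T, 32,
+*>            $                 WQ, -1, INFO )
+*>             LWORK = INT( WQ( 1 ) )
+*>             ALLOCATE( WORK( LWORK ) )
+*>             CALL CGETSQRHRT( 1000, 64, 128, 32, 32, A, 1000, T, 32,
+*>            $                 WORK, LWORK, INFO )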
+*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup comlpexOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, + $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS +* .. +* .. External Subroutines .. + EXTERNAL CCOPY, CLATSQR, CUNGTSQR_ROW, CUNHR_COL, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, REAL, CMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB1.LE.N ) THEN + INFO = -3 + ELSE IF( NB1.LT.1 ) THEN + INFO = -4 + ELSE IF( NB2.LT.1 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + INFO = -9 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array: +* a) Matrix T and WORK for CLATSQR; +* b) N-by-N upper-triangular factor R_tsqr; +* c) Matrix T and array WORK for CUNGTSQR_ROW; +* d) Diagonal D for CUNHR_COL. +* + IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. +* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of CUNGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. 
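+*
+*     (Illustrative recap of the WORK partition assembled above:
+*        WORK( 1 : LWT )         holds the T blocks written by CLATSQR,
+*                                with leading dimension LDWT;
+*        WORK( LWT+1 : ... )     is first the CLATSQR work area of
+*                                length LW1, then the N-by-N copy of
+*                                R_tsqr;
+*        WORK( LWT+N*N+1 : ... ) is scratch of length LW2 for
+*                                CUNGTSQR_ROW and later the diagonal D
+*                                from CUNHR_COL.)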
+* + CALL CLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL CCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL CUNGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL CUNHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. +* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the CUNHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. +* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-CONE ) THEN + DO J = I, N + A( I, J ) = -CONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL CCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN +* +* End of CGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/clarfb_gett.f b/lapack-netlib/SRC/clarfb_gett.f new file mode 100644 index 000000000..ee6959ed8 --- /dev/null +++ b/lapack-netlib/SRC/clarfb_gett.f @@ -0,0 +1,597 @@ +*> \brief \b CLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +*> +* Definition: +* =========== +* +* SUBROUTINE CLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), B( LDB, * ), T( LDT, * ), +* $ WORK( LDWORK, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLARFB_GETT applies a complex Householder block reflector H from the +*> left to a complex (K+M)-by-N "triangular-pentagonal" matrix +*> composed of two block matrices: an upper trapezoidal K-by-N matrix A +*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored +*> in the array B. The block reflector H is stored in a compact +*> WY-representation, where the elementary reflectors are in the +*> arrays A, B and T. See Further Details section. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] IDENT +*> \verbatim +*> IDENT is CHARACTER*1 +*> If IDENT = not 'I', or not 'i', then V1 is unit +*> lower-triangular and stored in the left K-by-K block of +*> the input matrix A, +*> If IDENT = 'I' or 'i', then V1 is an identity matrix and +*> not stored. +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix B. +*> M >= 0. 
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrices A and B.
+*>          N >= 0.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The number of rows of the matrix A.
+*>          K is also the order of the matrix T, i.e. the number of
+*>          elementary reflectors whose product defines the block
+*>          reflector. 0 <= K <= N.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is COMPLEX array, dimension (LDT,K)
+*>          The upper-triangular K-by-K matrix T in the representation
+*>          of the block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>
+*>          On entry:
+*>           a) In the K-by-N upper-trapezoidal part A: input matrix A.
+*>           b) In the columns below the diagonal: columns of V1
+*>              (ones are not stored on the diagonal).
+*>
+*>          On exit:
+*>           A is overwritten by rectangular K-by-N product H*A.
+*>
+*>          See Further Details section.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,K).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is COMPLEX array, dimension (LDB,N)
+*>
+*>          On entry:
+*>           a) In the M-by-(N-K) right block: input matrix B.
+*>           b) In the M-by-K left block: columns of V2.
+*>
+*>          On exit:
+*>           B is overwritten by rectangular M-by-N product H*B.
+*>
+*>          See Further Details section.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B. LDB >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array,
+*>          dimension (LDWORK,max(K,N-K))
+*> \endverbatim
+*>
+*> \param[in] LDWORK
+*> \verbatim
+*>          LDWORK is INTEGER
+*>          The leading dimension of the array WORK. LDWORK >= max(1,K).
+*>
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complexOTHERauxiliary
+*
+*> \par Contributors:
+* ==================
+*>
+*> \verbatim
+*>
+*> November 2020, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> (1) Description of the Algebraic Operation.
+*>
+*> The matrix A is a K-by-N matrix composed of two column block
+*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K):
+*> A = ( A1, A2 ).
+*> The matrix B is an M-by-N matrix composed of two column block
+*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K):
+*> B = ( B1, B2 ).
+*>
+*> Perform the operation:
+*>
+*>    ( A_out ) := H * ( A_in ) = ( I - V * T * V**H ) * ( A_in ) =
+*>    ( B_out )        ( B_in )                          ( B_in )
+*>     = ( I - ( V1 ) * T * ( V1**H, V2**H ) ) * ( A_in )
+*>             ( V2 )                            ( B_in )
+*> On input:
+*>
+*> a) ( A_in ) consists of two block columns:
+*>    ( B_in )
+*>
+*>    ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in ))
+*>    ( B_in )   (( B1_in ) ( B2_in ))   ((      0 ) ( B2_in )),
+*>
+*> where the column blocks are:
+*>
+*>  ( A1_in )  is a K-by-K upper-triangular matrix stored in the
+*>             upper triangular part of the array A(1:K,1:K).
+*>  ( B1_in )  is an M-by-K rectangular ZERO matrix and not stored.
+*>
+*>  ( A2_in )  is a K-by-(N-K) rectangular matrix stored
+*>             in the array A(1:K,K+1:N).
+*>  ( B2_in )  is an M-by-(N-K) rectangular matrix stored
+*>             in the array B(1:M,K+1:N).
+*>
+*> b) V = ( V1 )
+*>        ( V2 )
+*>
+*>    where:
+*>    1) if IDENT == 'I', V1 is a K-by-K identity matrix, not stored;
+*>    2) if IDENT != 'I', V1 is a K-by-K unit lower-triangular matrix,
+*>       stored in the lower-triangular part of the array
+*>       A(1:K,1:K) (ones are not stored),
+*>    and V2 is an M-by-K rectangular matrix stored in the array
+*>    B(1:M,1:K) (because on input B1_in is a rectangular zero
+*>    matrix that is not stored and the space is used to store V2).
+*>
+*> c) T is a K-by-K upper-triangular matrix stored
+*>    in the array T(1:K,1:K).
+*>
+*> On output:
+*>
+*> a) ( A_out ) consists of two block columns:
+*>    ( B_out )
+*>
+*>    ( A_out ) = (( A1_out ) ( A2_out ))
+*>    ( B_out )   (( B1_out ) ( B2_out )),
+*>
+*> where the column blocks are:
+*>
+*>  ( A1_out )  is a K-by-K square matrix, or a K-by-K
+*>              upper-triangular matrix, if V1 is an
+*>              identity matrix. A1_out is stored in
+*>              the array A(1:K,1:K).
+*>  ( B1_out )  is an M-by-K rectangular matrix stored
+*>              in the array B(1:M,1:K).
+*>
+*>  ( A2_out )  is a K-by-(N-K) rectangular matrix stored
+*>              in the array A(1:K,K+1:N).
+*>  ( B2_out )  is an M-by-(N-K) rectangular matrix stored
+*>              in the array B(1:M,K+1:N).
+*>
+*>
+*> The operation above can be represented as the same operation
+*> on each block column:
+*>
+*>    ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**H ) * ( A1_in )
+*>    ( B1_out )        (     0 )                          (     0 )
+*>
+*>    ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**H ) * ( A2_in )
+*>    ( B2_out )        ( B2_in )                          ( B2_in )
+*>
+*> If IDENT != 'I':
+*>
+*>    The computation for column block 1:
+*>
+*>    A1_out: = A1_in - V1*T*(V1**H)*A1_in
+*>
+*>    B1_out: = - V2*T*(V1**H)*A1_in
+*>
+*>    The computation for column block 2, which exists if N > K:
+*>
+*>    A2_out: = A2_in - V1*T*( (V1**H)*A2_in + (V2**H)*B2_in )
+*>
+*>    B2_out: = B2_in - V2*T*( (V1**H)*A2_in + (V2**H)*B2_in )
+*>
+*> If IDENT == 'I':
+*>
+*>    The operation for column block 1:
+*>
+*>    A1_out: = A1_in - V1*T*A1_in
+*>
+*>    B1_out: = - V2*T*A1_in
+*>
+*>    The computation for column block 2, which exists if N > K:
+*>
+*>    A2_out: = A2_in - T*( A2_in + (V2**H)*B2_in )
+*>
+*>    B2_out: = B2_in - V2*T*( A2_in + (V2**H)*B2_in )
+*>
+*> (2) Description of the Algorithmic Computation.
+*>
+*> In the first step, we compute column block 2, i.e. A2 and B2.
+*> Here, we need to use the K-by-(N-K) rectangular workspace
+*> matrix W2 that is of the same size as the matrix A2.
+*> W2 is stored in the array WORK(1:K,1:(N-K)).
+*>
+*> In the second step, we compute column block 1, i.e. A1 and B1.
+*> Here, we need to use the K-by-K square workspace matrix W1
+*> that is of the same size as the matrix A1.
+*> W1 is stored in the array WORK(1:K,1:K).
+*>
+*> NOTE: Hence, in this routine, we need the workspace array WORK
+*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from
+*> the first step and W1 from the second step.
+*>
+*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I',
+*> involves more computations than Case (B).
+*> +*> if( IDENT != 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(2) W2: = (V1**H) * W2 = (unit_lower_tr_of_(A1)**H) * W2 +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(2) W1: = (V1**H) * W1 = (unit_lower_tr_of_(A1)**H) * W1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6) square A1: = A1 - W1 +*> end if +*> end if +*> +*> Case (B), when V1 is an identity matrix, i.e. IDENT == 'I', +*> less computations than in the Case (A) +*> +*> if( IDENT == 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(6) upper-triangular_of_(A1): = A1 - W1 +*> end if +*> end if +*> +*> Combine these cases (A) and (B) together, this is the resulting +*> algorithm: +*> +*> if ( N > K ) then +*> +*> (First Step - column block 2) +*> +*> col2_(1) W2: = A2 +*> if( IDENT != 'I' ) then +*> col2_(2) W2: = (V1**H) * W2 +*> = (unit_lower_tr_of_(A1)**H) * W2 +*> end if +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2] +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> if( IDENT != 'I' ) then +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> end if +*> col2_(7) A2: = A2 - W2 +*> +*> else +*> +*> (Second Step - column block 1) +*> +*> col1_(1) W1: = A1 +*> if( IDENT != 'I' ) then +*> col1_(2) W1: = (V1**H) * W1 +*> = (unit_lower_tr_of_(A1)**H) * W1 +*> end if +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> if( IDENT != 'I' ) then +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1) +*> end if +*> col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1) +*> +*> end if +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, + $ WORK, LDWORK ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER IDENT + INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), T( LDT, * ), + $ WORK( LDWORK, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX CONE, CZERO + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), + $ CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LNOTIDENT + INTEGER I, J +* .. +* .. EXTERNAL FUNCTIONS .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL CCOPY, CGEMM, CTRMM +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. 
K.GT.N ) + $ RETURN +* + LNOTIDENT = .NOT.LSAME( IDENT, 'I' ) +* +* ------------------------------------------------------------------ +* +* First Step. Computation of the Column Block 2: +* +* ( A2 ) := H * ( A2 ) +* ( B2 ) ( B2 ) +* +* ------------------------------------------------------------------ +* + IF( N.GT.K ) THEN +* +* col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N) +* into W2=WORK(1:K, 1:N-K) column-by-column. +* + DO J = 1, N-K + CALL CCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 ) + END DO + + IF( LNOTIDENT ) THEN +* +* col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2, +* V1 is not an identy matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored). +* +* + CALL CTRMM( 'L', 'L', 'C', 'U', K, N-K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(3) Compute W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL CGEMM( 'C', 'N', K, N-K, M, CONE, B, LDB, + $ B( 1, K+1 ), LDB, CONE, WORK, LDWORK ) + END IF +* +* col2_(4) Compute W2: = T * W2, +* T is upper-triangular. +* + CALL CTRMM( 'L', 'U', 'N', 'N', K, N-K, CONE, T, LDT, + $ WORK, LDWORK ) +* +* col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2, +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL CGEMM( 'N', 'N', M, N-K, K, -CONE, B, LDB, + $ WORK, LDWORK, CONE, B( 1, K+1 ), LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col2_(6) Compute W2: = V1 * W2 = A1 * W2, +* V1 is not an identity matrix, but unit lower-triangular, +* V1 stored in A1 (diagonal ones are not stored). +* + CALL CTRMM( 'L', 'L', 'N', 'U', K, N-K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(7) Compute A2: = A2 - W2 = +* = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K), +* column-by-column. +* + DO J = 1, N-K + DO I = 1, K + A( I, K+J ) = A( I, K+J ) - WORK( I, J ) + END DO + END DO +* + END IF +* +* ------------------------------------------------------------------ +* +* Second Step. Computation of the Column Block 1: +* +* ( A1 ) := H * ( A1 ) +* ( B1 ) ( 0 ) +* +* ------------------------------------------------------------------ +* +* col1_(1) Compute W1: = A1. Copy the upper-triangular +* A1 = A(1:K, 1:K) into the upper-triangular +* W1 = WORK(1:K, 1:K) column-by-column. +* + DO J = 1, K + CALL CCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 ) + END DO +* +* Set the subdiagonal elements of W1 to zero column-by-column. +* + DO J = 1, K - 1 + DO I = J + 1, K + WORK( I, J ) = CZERO + END DO + END DO +* + IF( LNOTIDENT ) THEN +* +* col1_(2) Compute W1: = (V1**H) * W1 = (A1**H) * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL CTRMM( 'L', 'L', 'C', 'U', K, K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col1_(3) Compute W1: = T * W1, +* T is upper-triangular, +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL CTRMM( 'L', 'U', 'N', 'N', K, K, CONE, T, LDT, + $ WORK, LDWORK ) +* +* col1_(4) Compute B1: = - V2 * W1 = - B1 * W1, +* V2 = B1, W1 is upper-triangular with zeroes below the diagonal. +* + IF( M.GT.0 ) THEN + CALL CTRMM( 'R', 'U', 'N', 'N', M, K, -CONE, WORK, LDWORK, + $ B, LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col1_(5) Compute W1: = V1 * W1 = A1 * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular on input with zeroes below the diagonal, +* and square on output. 
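+*
+*     (Note: multiplying the upper-triangular W1 on the left by the
+*     unit lower-triangular V1 fills in the entries below the
+*     diagonal, which is why W1 is square after this product.)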
+* + CALL CTRMM( 'L', 'L', 'N', 'U', K, K, CONE, A, LDA, + $ WORK, LDWORK ) +* +* col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K) +* column-by-column. A1 is upper-triangular on input. +* If IDENT, A1 is square on output, and W1 is square, +* if NOT IDENT, A1 is upper-triangular on output, +* W1 is upper-triangular. +* +* col1_(6)_a Compute elements of A1 below the diagonal. +* + DO J = 1, K - 1 + DO I = J + 1, K + A( I, J ) = - WORK( I, J ) + END DO + END DO +* + END IF +* +* col1_(6)_b Compute elements of A1 on and above the diagonal. +* + DO J = 1, K + DO I = 1, J + A( I, J ) = A( I, J ) - WORK( I, J ) + END DO + END DO +* + RETURN +* +* End of CLARFB_GETT +* + END diff --git a/lapack-netlib/SRC/cungtsqr_row.f b/lapack-netlib/SRC/cungtsqr_row.f new file mode 100644 index 000000000..e1597c58b --- /dev/null +++ b/lapack-netlib/SRC/cungtsqr_row.f @@ -0,0 +1,380 @@ +*> \brief \b CUNGTSQR_ROW +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CUNGTSQR_ROW + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +*> +* Definition: +* =========== +* +* SUBROUTINE CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CUNGTSQR_ROW generates an M-by-N complex matrix Q_out with +*> orthonormal columns from the output of CLATSQR. These N orthonormal +*> columns are the first N columns of a product of complex unitary +*> matrices Q(k)_in of order M, which are returned by CLATSQR in +*> a special format. +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). +*> +*> The input matrices Q(k)_in are stored in row and column blocks in A. +*> See the documentation of CLATSQR for more details on the format of +*> Q(k)_in, where each Q(k)_in is represented by block Householder +*> transformations. This routine calls an auxiliary routine CLARFB_GETT, +*> where the computation is performed on each individual block. The +*> algorithm first sweeps NB-sized column blocks from the right to left +*> starting in the bottom row block and continues to the top row block +*> (hence _ROW in the routine name). This sweep is in reverse order of +*> the order in which CLATSQR generates the output blocks. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by CLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size). +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by CLATSQR to return +*> arrays A and T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. 
The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by CLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See CLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See CLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX CONE, CZERO + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), + $ CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + COMPLEX DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL CLARFB_GETT, CLASET, XERBLA +* .. +* .. Intrinsic Functions .. 
+ INTRINSIC CMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CUNGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL CLASET('U', M, N, CZERO, CONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. +* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. +* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. +* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL CLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. +* +* KB is the column index of the current block reflector in +* the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + IF( MB1-KB-KNB+1.EQ.0 ) THEN +* +* In SLARFB_GETT parameters, when M=0, then the matrix B +* does not exist, hence we need to pass a dummy array +* reference DUMMY(1,1) to B with LDDUMMY=1. 
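+*              (A leading dimension of 1 is valid here because
+*              CLARFB_GETT requires LDB >= max(1,M) and M = 0, and all
+*              operations on B inside CLARFB_GETT are guarded by
+*              M > 0, so DUMMY is never referenced.)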
+* + CALL CLARFB_GETT( 'N', 0, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ DUMMY( 1, 1 ), 1, WORK, KNB ) + ELSE + CALL CLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ A( KB+KNB, KB), LDA, WORK, KNB ) + + END IF +* + END DO +* + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN +* +* End of CUNGTSQR_ROW +* + END diff --git a/lapack-netlib/SRC/dgetsqrhrt.f b/lapack-netlib/SRC/dgetsqrhrt.f new file mode 100644 index 000000000..668deeba8 --- /dev/null +++ b/lapack-netlib/SRC/dgetsqrhrt.f @@ -0,0 +1,349 @@ +*> \brief \b DGETSQRHRT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGETSQRHRT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGETSQRHRT computes a NB2-sized column blocked QR-factorization +*> of a real M-by-N matrix A with M >= N, +*> +*> A = Q * R. +*> +*> The routine uses internally a NB1-sized column blocked and MB1-sized +*> row blocked TSQR-factorization and perfors the reconstruction +*> of the Householder vectors from the TSQR output. The routine also +*> converts the R_tsqr factor from the TSQR-factorization output into +*> the R factor that corresponds to the Householder QR-factorization, +*> +*> A = Q_tsqr * R_tsqr = Q * R. +*> +*> The output Q and R factors are stored in the same format as in DGEQRT +*> (Q is in blocked compact WY-representation). See the documentation +*> of DGEQRT for more details on the format. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> The row block size to be used in the blocked TSQR. +*> MB1 > N. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> The column block size to be used in the blocked TSQR. +*> N >= NB1 >= 1. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> The block size to be used in the blocked QR that is +*> output. NB2 >= 1. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> +*> On entry: an M-by-N matrix A. +*> +*> On exit: +*> a) the elements on and above the diagonal +*> of the array contain the N-by-N upper-triangular +*> matrix R corresponding to the Householder QR; +*> b) the elements below the diagonal represent Q by +*> the columns of blocked V (compact WY-representation). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is DOUBLE PRECISION array, dimension (LDT,N)) +*> The upper triangular block reflectors stored in compact form +*> as a sequence of upper triangular blocks. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= NB2. 
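+*>
+*>          Because Q and R are stored in the same format as in DGEQRT,
+*>          the factor can be applied with DGEMQRT, which consumes T
+*>          and LDT unchanged. An illustrative fragment (C, LDC and
+*>          NRHS are placeholders; assumes NB2 <= N and INFO = 0 from
+*>          the factorization):
+*>
+*>             CALL DGEMQRT( 'L', 'T', M, NRHS, N, NB2, A, LDA,
+*>            $              T, LDT, C, LDC, WORK, INFO )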
+*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> where +*> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), +*> NB1LOCAL = MIN(NB1,N). +*> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, +*> LW1 = NB1LOCAL * N, +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, + $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DLATSQR, DORGTSQR_ROW, DORHR_COL, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB1.LE.N ) THEN + INFO = -3 + ELSE IF( NB1.LT.1 ) THEN + INFO = -4 + ELSE IF( NB2.LT.1 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + INFO = -9 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array: +* a) Matrix T and WORK for DLATSQR; +* b) N-by-N upper-triangular factor R_tsqr; +* c) Matrix T and array WORK for DORGTSQR_ROW; +* d) Diagonal D for DORHR_COL. +* + IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. +* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of DORGTSQR_ROW work array. 
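+*     (The LW2 value below matches the workspace requirement that
+*     DORGTSQR_ROW documents: LWORK >= NBLOCAL * MAX( NBLOCAL,
+*     N - NBLOCAL ), with NB1LOCAL here playing the role of
+*     NBLOCAL = MIN( NB, N ).)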
+* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL DLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL DCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL DORGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL DORHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. +* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the DORHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. +* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-ONE ) THEN + DO J = I, N + A( I, J ) = -ONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL DCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN +* +* End of DGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/dlarfb_gett.f b/lapack-netlib/SRC/dlarfb_gett.f new file mode 100644 index 000000000..10ab6461e --- /dev/null +++ b/lapack-netlib/SRC/dlarfb_gett.f @@ -0,0 +1,596 @@ +*> \brief \b DLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), T( LDT, * ), +* $ WORK( LDWORK, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLARFB_GETT applies a real Householder block reflector H from the +*> left to a real (K+M)-by-N "triangular-pentagonal" matrix +*> composed of two block matrices: an upper trapezoidal K-by-N matrix A +*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored +*> in the array B. 
The block reflector H is stored in a compact +*> WY-representation, where the elementary reflectors are in the +*> arrays A, B and T. See Further Details section. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] IDENT +*> \verbatim +*> IDENT is CHARACTER*1 +*> If IDENT = not 'I', or not 'i', then V1 is unit +*> lower-triangular and stored in the left K-by-K block of +*> the input matrix A, +*> If IDENT = 'I' or 'i', then V1 is an identity matrix and +*> not stored. +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix B. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrices A and B. +*> N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The number or rows of the matrix A. +*> K is also order of the matrix T, i.e. the number of +*> elementary reflectors whose product defines the block +*> reflector. 0 <= K <= N. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is DOUBLE PRECISION array, dimension (LDT,K) +*> The upper-triangular K-by-K matrix T in the representation +*> of the block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> +*> On entry: +*> a) In the K-by-N upper-trapezoidal part A: input matrix A. +*> b) In the columns below the diagonal: columns of V1 +*> (ones are not stored on the diagonal). +*> +*> On exit: +*> A is overwritten by rectangular K-by-N product H*A. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array A. LDA >= max(1,K). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,N) +*> +*> On entry: +*> a) In the M-by-(N-K) right block: input matrix B. +*> b) In the M-by-N left block: columns of V2. +*> +*> On exit: +*> B is overwritten by rectangular M-by-N product H*B. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,M). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, +*> dimension (LDWORK,max(K,N-K)) +*> \endverbatim +*> +*> \param[in] LDWORK +*> \verbatim +*> LDWORK is INTEGER +*> The leading dimension of the array WORK. LDWORK>=max(1,K). +*> +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> (1) Description of the Algebraic Operation. +*> +*> The matrix A is a K-by-N matrix composed of two column block +*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K): +*> A = ( A1, A2 ). +*> The matrix B is an M-by-N matrix composed of two column block +*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K): +*> B = ( B1, B2 ). 
+*>
+*> Perform the operation:
+*>
+*>    ( A_out ) := H * ( A_in ) = ( I - V * T * V**T ) * ( A_in ) =
+*>    ( B_out )        ( B_in )                          ( B_in )
+*>     = ( I - ( V1 ) * T * ( V1**T, V2**T ) ) * ( A_in )
+*>             ( V2 )                            ( B_in )
+*> On input:
+*>
+*> a) ( A_in ) consists of two block columns:
+*>    ( B_in )
+*>
+*>    ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in ))
+*>    ( B_in )   (( B1_in ) ( B2_in ))   ((      0 ) ( B2_in )),
+*>
+*> where the column blocks are:
+*>
+*>  ( A1_in )  is a K-by-K upper-triangular matrix stored in the
+*>             upper triangular part of the array A(1:K,1:K).
+*>  ( B1_in )  is an M-by-K rectangular ZERO matrix and not stored.
+*>
+*>  ( A2_in )  is a K-by-(N-K) rectangular matrix stored
+*>             in the array A(1:K,K+1:N).
+*>  ( B2_in )  is an M-by-(N-K) rectangular matrix stored
+*>             in the array B(1:M,K+1:N).
+*>
+*> b) V = ( V1 )
+*>        ( V2 )
+*>
+*>    where:
+*>    1) if IDENT == 'I', V1 is a K-by-K identity matrix, not stored;
+*>    2) if IDENT != 'I', V1 is a K-by-K unit lower-triangular matrix,
+*>       stored in the lower-triangular part of the array
+*>       A(1:K,1:K) (ones are not stored),
+*>    and V2 is an M-by-K rectangular matrix stored in the array
+*>    B(1:M,1:K) (because on input B1_in is a rectangular zero
+*>    matrix that is not stored and the space is used to store V2).
+*>
+*> c) T is a K-by-K upper-triangular matrix stored
+*>    in the array T(1:K,1:K).
+*>
+*> On output:
+*>
+*> a) ( A_out ) consists of two block columns:
+*>    ( B_out )
+*>
+*>    ( A_out ) = (( A1_out ) ( A2_out ))
+*>    ( B_out )   (( B1_out ) ( B2_out )),
+*>
+*> where the column blocks are:
+*>
+*>  ( A1_out )  is a K-by-K square matrix, or a K-by-K
+*>              upper-triangular matrix, if V1 is an
+*>              identity matrix. A1_out is stored in
+*>              the array A(1:K,1:K).
+*>  ( B1_out )  is an M-by-K rectangular matrix stored
+*>              in the array B(1:M,1:K).
+*>
+*>  ( A2_out )  is a K-by-(N-K) rectangular matrix stored
+*>              in the array A(1:K,K+1:N).
+*>  ( B2_out )  is an M-by-(N-K) rectangular matrix stored
+*>              in the array B(1:M,K+1:N).
+*>
+*>
+*> The operation above can be represented as the same operation
+*> on each block column:
+*>
+*>    ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**T ) * ( A1_in )
+*>    ( B1_out )        (     0 )                          (     0 )
+*>
+*>    ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**T ) * ( A2_in )
+*>    ( B2_out )        ( B2_in )                          ( B2_in )
+*>
+*> If IDENT != 'I':
+*>
+*>    The computation for column block 1:
+*>
+*>    A1_out: = A1_in - V1*T*(V1**T)*A1_in
+*>
+*>    B1_out: = - V2*T*(V1**T)*A1_in
+*>
+*>    The computation for column block 2, which exists if N > K:
+*>
+*>    A2_out: = A2_in - V1*T*( (V1**T)*A2_in + (V2**T)*B2_in )
+*>
+*>    B2_out: = B2_in - V2*T*( (V1**T)*A2_in + (V2**T)*B2_in )
+*>
+*> If IDENT == 'I':
+*>
+*>    The operation for column block 1:
+*>
+*>    A1_out: = A1_in - V1*T*A1_in
+*>
+*>    B1_out: = - V2*T*A1_in
+*>
+*>    The computation for column block 2, which exists if N > K:
+*>
+*>    A2_out: = A2_in - T*( A2_in + (V2**T)*B2_in )
+*>
+*>    B2_out: = B2_in - V2*T*( A2_in + (V2**T)*B2_in )
+*>
+*> (2) Description of the Algorithmic Computation.
+*>
+*> In the first step, we compute column block 2, i.e. A2 and B2.
+*> Here, we need to use the K-by-(N-K) rectangular workspace
+*> matrix W2 that is of the same size as the matrix A2.
+*> W2 is stored in the array WORK(1:K,1:(N-K)).
+*>
+*> In the second step, we compute column block 1, i.e. A1 and B1.
+*> Here, we need to use the K-by-K square workspace matrix W1
+*> that is of the same size as the matrix A1.
+*> W1 is stored in the array WORK(1:K,1:K).
+*>
+*> NOTE: Hence, in this routine, we need the workspace array WORK
+*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from
+*> the first step and W1 from the second step.
+*>
+*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I',
+*> requires more computations than Case (B).
+*>
+*> if( IDENT != 'I' ) then
+*> if ( N > K ) then
+*> (First Step - column block 2)
+*> col2_(1) W2: = A2
+*> col2_(2) W2: = (V1**T) * W2 = (unit_lower_tr_of_(A1)**T) * W2
+*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
+*> col2_(4) W2: = T * W2
+*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
+*> col2_(7) A2: = A2 - W2
+*> else
+*> (Second Step - column block 1)
+*> col1_(1) W1: = A1
+*> col1_(2) W1: = (V1**T) * W1 = (unit_lower_tr_of_(A1)**T) * W1
+*> col1_(3) W1: = T * W1
+*> col1_(4) B1: = - V2 * W1 = - B1 * W1
+*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
+*> col1_(6) square A1: = A1 - W1
+*> end if
+*> end if
+*>
+*> Case (B), when V1 is an identity matrix, i.e. IDENT == 'I',
+*> requires fewer computations than Case (A).
+*>
+*> if( IDENT == 'I' ) then
+*> if ( N > K ) then
+*> (First Step - column block 2)
+*> col2_(1) W2: = A2
+*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
+*> col2_(4) W2: = T * W2
+*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*> col2_(7) A2: = A2 - W2
+*> else
+*> (Second Step - column block 1)
+*> col1_(1) W1: = A1
+*> col1_(3) W1: = T * W1
+*> col1_(4) B1: = - V2 * W1 = - B1 * W1
+*> col1_(6) upper-triangular_of_(A1): = A1 - W1
+*> end if
+*> end if
+*>
+*> Combining these cases (A) and (B) together gives the resulting
+*> algorithm:
+*>
+*> if ( N > K ) then
+*>
+*> (First Step - column block 2)
+*>
+*> col2_(1) W2: = A2
+*> if( IDENT != 'I' ) then
+*> col2_(2) W2: = (V1**T) * W2
+*> = (unit_lower_tr_of_(A1)**T) * W2
+*> end if
+*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
+*> col2_(4) W2: = T * W2
+*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*> if( IDENT != 'I' ) then
+*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
+*> end if
+*> col2_(7) A2: = A2 - W2
+*>
+*> else
+*>
+*> (Second Step - column block 1)
+*>
+*> col1_(1) W1: = A1
+*> if( IDENT != 'I' ) then
+*> col1_(2) W1: = (V1**T) * W1
+*> = (unit_lower_tr_of_(A1)**T) * W1
+*> end if
+*> col1_(3) W1: = T * W1
+*> col1_(4) B1: = - V2 * W1 = - B1 * W1
+*> if( IDENT != 'I' ) then
+*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
+*> col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1)
+*> end if
+*> col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1)
+*>
+*> end if
+*>
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE DLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
+ $ WORK, LDWORK )
+ IMPLICIT NONE
+*
+* -- LAPACK auxiliary routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ CHARACTER IDENT
+ INTEGER K, LDA, LDB, LDT, LDWORK, M, N
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION A( LDA, * ), B( LDB, * ), T( LDT, * ),
+ $ WORK( LDWORK, * )
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION ONE, ZERO
+ PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LNOTIDENT
+ INTEGER I, J
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ EXTERNAL LSAME
+* ..
+* .. External Subroutines ..
+ EXTERNAL DCOPY, DGEMM, DTRMM
+* ..
+* .. Executable Statements ..
+*
+* Quick return if possible
+*
+ IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N )
+ $ RETURN
+*
+ LNOTIDENT = .NOT.LSAME( IDENT, 'I' )
+*
+* ------------------------------------------------------------------
+*
+* First Step. Computation of the Column Block 2:
+*
+* ( A2 ) := H * ( A2 )
+* ( B2 ) ( B2 )
+*
+* ------------------------------------------------------------------
+*
+ IF( N.GT.K ) THEN
+*
+* col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N)
+* into W2=WORK(1:K, 1:N-K) column-by-column.
+*
+ DO J = 1, N-K
+ CALL DCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 )
+ END DO
+
+ IF( LNOTIDENT ) THEN
+*
+* col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2,
+* V1 is not an identity matrix, but unit lower-triangular
+* V1 stored in A1 (diagonal ones are not stored).
+*
+*
+ CALL DTRMM( 'L', 'L', 'T', 'U', K, N-K, ONE, A, LDA,
+ $ WORK, LDWORK )
+ END IF
+*
+* col2_(3) Compute W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
+* V2 stored in B1.
+*
+ IF( M.GT.0 ) THEN
+ CALL DGEMM( 'T', 'N', K, N-K, M, ONE, B, LDB,
+ $ B( 1, K+1 ), LDB, ONE, WORK, LDWORK )
+ END IF
+*
+* col2_(4) Compute W2: = T * W2,
+* T is upper-triangular.
+*
+ CALL DTRMM( 'L', 'U', 'N', 'N', K, N-K, ONE, T, LDT,
+ $ WORK, LDWORK )
+*
+* col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2,
+* V2 stored in B1.
+*
+ IF( M.GT.0 ) THEN
+ CALL DGEMM( 'N', 'N', M, N-K, K, -ONE, B, LDB,
+ $ WORK, LDWORK, ONE, B( 1, K+1 ), LDB )
+ END IF
+*
+ IF( LNOTIDENT ) THEN
+*
+* col2_(6) Compute W2: = V1 * W2 = A1 * W2,
+* V1 is not an identity matrix, but unit lower-triangular,
+* V1 stored in A1 (diagonal ones are not stored).
+*
+ CALL DTRMM( 'L', 'L', 'N', 'U', K, N-K, ONE, A, LDA,
+ $ WORK, LDWORK )
+ END IF
+*
+* col2_(7) Compute A2: = A2 - W2 =
+* = A(1:K, K+1:N) - WORK(1:K, 1:N-K),
+* column-by-column.
+*
+ DO J = 1, N-K
+ DO I = 1, K
+ A( I, K+J ) = A( I, K+J ) - WORK( I, J )
+ END DO
+ END DO
+*
+ END IF
+*
+* ------------------------------------------------------------------
+*
+* Second Step. Computation of the Column Block 1:
+*
+* ( A1 ) := H * ( A1 )
+* ( B1 ) ( 0 )
+*
+* ------------------------------------------------------------------
+*
+* col1_(1) Compute W1: = A1. Copy the upper-triangular
+* A1 = A(1:K, 1:K) into the upper-triangular
+* W1 = WORK(1:K, 1:K) column-by-column.
+*
+ DO J = 1, K
+ CALL DCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 )
+ END DO
+*
+* Set the subdiagonal elements of W1 to zero column-by-column.
+*
+ DO J = 1, K - 1
+ DO I = J + 1, K
+ WORK( I, J ) = ZERO
+ END DO
+ END DO
+*
+ IF( LNOTIDENT ) THEN
+*
+* col1_(2) Compute W1: = (V1**T) * W1 = (A1**T) * W1,
+* V1 is not an identity matrix, but unit lower-triangular
+* V1 stored in A1 (diagonal ones are not stored),
+* W1 is upper-triangular with zeroes below the diagonal.
+*
+ CALL DTRMM( 'L', 'L', 'T', 'U', K, K, ONE, A, LDA,
+ $ WORK, LDWORK )
+ END IF
+*
+* col1_(3) Compute W1: = T * W1,
+* T is upper-triangular,
+* W1 is upper-triangular with zeroes below the diagonal.
+*
+ CALL DTRMM( 'L', 'U', 'N', 'N', K, K, ONE, T, LDT,
+ $ WORK, LDWORK )
+*
+* col1_(4) Compute B1: = - V2 * W1 = - B1 * W1,
+* V2 = B1, W1 is upper-triangular with zeroes below the diagonal.
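+* Since W1 is upper-triangular at this point, the product B1 * W1
+* is formed below with a single triangular multiply (TRMM) from the
+* right rather than a general matrix-matrix product.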
+*
+ IF( M.GT.0 ) THEN
+ CALL DTRMM( 'R', 'U', 'N', 'N', M, K, -ONE, WORK, LDWORK,
+ $ B, LDB )
+ END IF
+*
+ IF( LNOTIDENT ) THEN
+*
+* col1_(5) Compute W1: = V1 * W1 = A1 * W1,
+* V1 is not an identity matrix, but unit lower-triangular
+* V1 stored in A1 (diagonal ones are not stored),
+* W1 is upper-triangular on input with zeroes below the diagonal,
+* and square on output.
+*
+ CALL DTRMM( 'L', 'L', 'N', 'U', K, K, ONE, A, LDA,
+ $ WORK, LDWORK )
+*
+* col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K)
+* column-by-column. A1 is upper-triangular on input.
+* If NOT IDENT, A1 is square on output, and W1 is square,
+* if IDENT, A1 is upper-triangular on output,
+* W1 is upper-triangular.
+*
+* col1_(6)_a Compute elements of A1 below the diagonal.
+*
+ DO J = 1, K - 1
+ DO I = J + 1, K
+ A( I, J ) = - WORK( I, J )
+ END DO
+ END DO
+*
+ END IF
+*
+* col1_(6)_b Compute elements of A1 on and above the diagonal.
+*
+ DO J = 1, K
+ DO I = 1, J
+ A( I, J ) = A( I, J ) - WORK( I, J )
+ END DO
+ END DO
+*
+ RETURN
+*
+* End of DLARFB_GETT
+*
+ END
diff --git a/lapack-netlib/SRC/dorgtsqr_row.f b/lapack-netlib/SRC/dorgtsqr_row.f
new file mode 100644
index 000000000..94f8b0120
--- /dev/null
+++ b/lapack-netlib/SRC/dorgtsqr_row.f
@@ -0,0 +1,379 @@
+*> \brief \b DORGTSQR_ROW
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DORGTSQR_ROW + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
+* $ LWORK, INFO )
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION A( LDA, * ), T( LDT, * ), WORK( * )
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DORGTSQR_ROW generates an M-by-N real matrix Q_out with
+*> orthonormal columns from the output of DLATSQR. These N orthonormal
+*> columns are the first N columns of a product of real orthogonal
+*> matrices Q(k)_in of order M, which are returned by DLATSQR in
+*> a special format.
+*>
+*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+*>
+*> The input matrices Q(k)_in are stored in row and column blocks in A.
+*> See the documentation of DLATSQR for more details on the format of
+*> Q(k)_in, where each Q(k)_in is represented by block Householder
+*> transformations. This routine calls an auxiliary routine DLARFB_GETT,
+*> where the computation is performed on each individual block. The
+*> algorithm first sweeps NB-sized column blocks from the right to left
+*> starting in the bottom row block and continues to the top row block
+*> (hence _ROW in the routine name). This sweep is in reverse order of
+*> the order in which DLATSQR generates the output blocks.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB
+*> \verbatim
+*> MB is INTEGER
+*> The row block size used by DLATSQR to return
+*> arrays A and T. MB > N.
+*> (Note that if MB > M, then M is used instead of MB
+*> as the row block size).
+*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by DLATSQR to return +*> arrays A and T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by DLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See DLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is DOUBLE PRECISION array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See DLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. 
Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + DOUBLE PRECISION DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL DLARFB_GETT, DLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DORGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL DLASET('U', M, N, ZERO, ONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. +* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. +* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. +* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL DLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. 
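+*
+* (The block reflectors for the top row block occupy the first N
+* columns of T, which is why T( 1, KB ) below indexes T directly.)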
+*
+* KB is the column index of the current block reflector in
+* the matrices T and V.
+*
+ DO KB = KB_LAST, 1, -NBLOCAL
+*
+* Determine the size of the current column block KNB in
+* the matrices T and V.
+*
+ KNB = MIN( NBLOCAL, N - KB + 1 )
+*
+ IF( MB1-KB-KNB+1.EQ.0 ) THEN
+*
+* In DLARFB_GETT parameters, when M=0, then the matrix B
+* does not exist, hence we need to pass a dummy array
+* reference DUMMY(1,1) to B with LDDUMMY=1.
+*
+ CALL DLARFB_GETT( 'N', 0, N-KB+1, KNB,
+ $ T( 1, KB ), LDT, A( KB, KB ), LDA,
+ $ DUMMY( 1, 1 ), 1, WORK, KNB )
+ ELSE
+ CALL DLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB,
+ $ T( 1, KB ), LDT, A( KB, KB ), LDA,
+ $ A( KB+KNB, KB), LDA, WORK, KNB )
+
+ END IF
+*
+ END DO
+*
+ WORK( 1 ) = DBLE( LWORKOPT )
+ RETURN
+*
+* End of DORGTSQR_ROW
+*
+ END
diff --git a/lapack-netlib/SRC/sgetsqrhrt.f b/lapack-netlib/SRC/sgetsqrhrt.f
new file mode 100644
index 000000000..f9580da7b
--- /dev/null
+++ b/lapack-netlib/SRC/sgetsqrhrt.f
@@ -0,0 +1,349 @@
+*> \brief \b SGETSQRHRT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SGETSQRHRT + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
+* $ LWORK, INFO )
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
+* ..
+* .. Array Arguments ..
+* REAL A( LDA, * ), T( LDT, * ), WORK( * )
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SGETSQRHRT computes an NB2-sized column blocked QR-factorization
+*> of a real M-by-N matrix A with M >= N,
+*>
+*> A = Q * R.
+*>
+*> The routine uses internally an NB1-sized column blocked and MB1-sized
+*> row blocked TSQR-factorization and performs the reconstruction
+*> of the Householder vectors from the TSQR output. The routine also
+*> converts the R_tsqr factor from the TSQR-factorization output into
+*> the R factor that corresponds to the Householder QR-factorization,
+*>
+*> A = Q_tsqr * R_tsqr = Q * R.
+*>
+*> The output Q and R factors are stored in the same format as in SGEQRT
+*> (Q is in blocked compact WY-representation). See the documentation
+*> of SGEQRT for more details on the format.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB1
+*> \verbatim
+*> MB1 is INTEGER
+*> The row block size to be used in the blocked TSQR.
+*> MB1 > N.
+*> \endverbatim
+*>
+*> \param[in] NB1
+*> \verbatim
+*> NB1 is INTEGER
+*> The column block size to be used in the blocked TSQR.
+*> N >= NB1 >= 1.
+*> \endverbatim
+*>
+*> \param[in] NB2
+*> \verbatim
+*> NB2 is INTEGER
+*> The block size to be used in the blocked QR that is
+*> output. NB2 >= 1.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is REAL array, dimension (LDA,N)
+*>
+*> On entry: an M-by-N matrix A.
+*>
+*> On exit:
+*> a) the elements on and above the diagonal
+*> of the array contain the N-by-N upper-triangular
+*> matrix R corresponding to the Householder QR;
+*> b) the elements below the diagonal represent Q by
+*> the columns of blocked V (compact WY-representation).
+*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is REAL array, dimension (LDT,N)) +*> The upper triangular block reflectors stored in compact form +*> as a sequence of upper triangular blocks. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= NB2. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> where +*> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), +*> NB1LOCAL = MIN(NB1,N). +*> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, +*> LW1 = NB1LOCAL * N, +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup singleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. + REAL A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, + $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS +* .. +* .. External Subroutines .. + EXTERNAL SCOPY, SLATSQR, SORGTSQR_ROW, SORHR_COL, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB1.LE.N ) THEN + INFO = -3 + ELSE IF( NB1.LT.1 ) THEN + INFO = -4 + ELSE IF( NB2.LT.1 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + INFO = -9 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array: +* a) Matrix T and WORK for SLATSQR; +* b) N-by-N upper-triangular factor R_tsqr; +* c) Matrix T and array WORK for SORGTSQR_ROW; +* d) Diagonal D for SORHR_COL. +* + IF( LWORK.LT.N*N+1 .AND. 
.NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. +* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of SORGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL SLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL SCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL SORGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL SORHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. +* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the SORHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. +* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-ONE ) THEN + DO J = I, N + A( I, J ) = -ONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL SCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = REAL( LWORKOPT ) + RETURN +* +* End of SGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/slarfb_gett.f b/lapack-netlib/SRC/slarfb_gett.f new file mode 100644 index 000000000..7719f2965 --- /dev/null +++ b/lapack-netlib/SRC/slarfb_gett.f @@ -0,0 +1,596 @@ +*> \brief \b SLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. 
+* REAL A( LDA, * ), B( LDB, * ), T( LDT, * ),
+* $ WORK( LDWORK, * )
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SLARFB_GETT applies a real Householder block reflector H from the
+*> left to a real (K+M)-by-N "triangular-pentagonal" matrix
+*> composed of two block matrices: an upper trapezoidal K-by-N matrix A
+*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
+*> in the array B. The block reflector H is stored in a compact
+*> WY-representation, where the elementary reflectors are in the
+*> arrays A, B and T. See Further Details section.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] IDENT
+*> \verbatim
+*> IDENT is CHARACTER*1
+*> If IDENT is neither 'I' nor 'i', then V1 is unit
+*> lower-triangular and stored in the left K-by-K block of
+*> the input matrix A,
+*> If IDENT = 'I' or 'i', then V1 is an identity matrix and
+*> not stored.
+*> See Further Details section.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix B.
+*> M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrices A and B.
+*> N >= 0.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*> K is INTEGER
+*> The number of rows of the matrix A.
+*> K is also the order of the matrix T, i.e. the number of
+*> elementary reflectors whose product defines the block
+*> reflector. 0 <= K <= N.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*> T is REAL array, dimension (LDT,K)
+*> The upper-triangular K-by-K matrix T in the representation
+*> of the block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*> LDT is INTEGER
+*> The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is REAL array, dimension (LDA,N)
+*>
+*> On entry:
+*> a) In the K-by-N upper-trapezoidal part A: input matrix A.
+*> b) In the columns below the diagonal: columns of V1
+*> (ones are not stored on the diagonal).
+*>
+*> On exit:
+*> A is overwritten by the rectangular K-by-N product H*A.
+*>
+*> See Further Details section.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> The leading dimension of the array A. LDA >= max(1,K).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*> B is REAL array, dimension (LDB,N)
+*>
+*> On entry:
+*> a) In the M-by-(N-K) right block: input matrix B.
+*> b) In the M-by-K left block: columns of V2.
+*>
+*> On exit:
+*> B is overwritten by the rectangular M-by-N product H*B.
+*>
+*> See Further Details section.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*> LDB is INTEGER
+*> The leading dimension of the array B. LDB >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*> WORK is REAL array,
+*> dimension (LDWORK,max(K,N-K))
+*> \endverbatim
+*>
+*> \param[in] LDWORK
+*> \verbatim
+*> LDWORK is INTEGER
+*> The leading dimension of the array WORK. LDWORK>=max(1,K).
+*>
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup singleOTHERauxiliary
+*
+*> \par Contributors:
+* ==================
+*>
+*> \verbatim
+*>
+*> November 2020, Igor Kozachenko,
+*> Computer Science Division,
+*> University of California, Berkeley
+*>
+*> \endverbatim
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> (1) Description of the Algebraic Operation.
+*> +*> The matrix A is a K-by-N matrix composed of two column block +*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K): +*> A = ( A1, A2 ). +*> The matrix B is an M-by-N matrix composed of two column block +*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K): +*> B = ( B1, B2 ). +*> +*> Perform the operation: +*> +*> ( A_out ) := H * ( A_in ) = ( I - V * T * V**T ) * ( A_in ) = +*> ( B_out ) ( B_in ) ( B_in ) +*> = ( I - ( V1 ) * T * ( V1**T, V2**T ) ) * ( A_in ) +*> ( V2 ) ( B_in ) +*> On input: +*> +*> a) ( A_in ) consists of two block columns: +*> ( B_in ) +*> +*> ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in )) +*> ( B_in ) (( B1_in ) ( B2_in )) (( 0 ) ( B2_in )), +*> +*> where the column blocks are: +*> +*> ( A1_in ) is a K-by-K upper-triangular matrix stored in the +*> upper triangular part of the array A(1:K,1:K). +*> ( B1_in ) is an M-by-K rectangular ZERO matrix and not stored. +*> +*> ( A2_in ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_in ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> b) V = ( V1 ) +*> ( V2 ) +*> +*> where: +*> 1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored; +*> 2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix, +*> stored in the lower-triangular part of the array +*> A(1:K,1:K) (ones are not stored), +*> and V2 is an M-by-K rectangular stored the array B(1:M,1:K), +*> (because on input B1_in is a rectangular zero +*> matrix that is not stored and the space is +*> used to store V2). +*> +*> c) T is a K-by-K upper-triangular matrix stored +*> in the array T(1:K,1:K). +*> +*> On output: +*> +*> a) ( A_out ) consists of two block columns: +*> ( B_out ) +*> +*> ( A_out ) = (( A1_out ) ( A2_out )) +*> ( B_out ) (( B1_out ) ( B2_out )), +*> +*> where the column blocks are: +*> +*> ( A1_out ) is a K-by-K square matrix, or a K-by-K +*> upper-triangular matrix, if V1 is an +*> identity matrix. AiOut is stored in +*> the array A(1:K,1:K). +*> ( B1_out ) is an M-by-K rectangular matrix stored +*> in the array B(1:M,K:N). +*> +*> ( A2_out ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_out ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> +*> The operation above can be represented as the same operation +*> on each block column: +*> +*> ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**T ) * ( A1_in ) +*> ( B1_out ) ( 0 ) ( 0 ) +*> +*> ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**T ) * ( A2_in ) +*> ( B2_out ) ( B2_in ) ( B2_in ) +*> +*> If IDENT != 'I': +*> +*> The computation for column block 1: +*> +*> A1_out: = A1_in - V1*T*(V1**T)*A1_in +*> +*> B1_out: = - V2*T*(V1**T)*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - V1*T*( (V1**T)*A2_in + (V2**T)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( (V1**T)*A2_in + (V2**T)*B2_in ) +*> +*> If IDENT == 'I': +*> +*> The operation for column block 1: +*> +*> A1_out: = A1_in - V1*T**A1_in +*> +*> B1_out: = - V2*T**A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - T*( A2_in + (V2**T)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( A2_in + (V2**T)*B2_in ) +*> +*> (2) Description of the Algorithmic Computation. +*> +*> In the first step, we compute column block 2, i.e. A2 and B2. +*> Here, we need to use the K-by-(N-K) rectangular workspace +*> matrix W2 that is of the same size as the matrix A2. +*> W2 is stored in the array WORK(1:K,1:(N-K)). 
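+*>
+*> (Column block 2 is processed first because the second step
+*> overwrites A1 and B1, which still hold the reflector data V1
+*> and V2 needed while updating A2 and B2.)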
+*> +*> In the second step, we compute column block 1, i.e. A1 and B1. +*> Here, we need to use the K-by-K square workspace matrix W1 +*> that is of the same size as the as the matrix A1. +*> W1 is stored in the array WORK(1:K,1:K). +*> +*> NOTE: Hence, in this routine, we need the workspace array WORK +*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from +*> the first step and W1 from the second step. +*> +*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I', +*> more computations than in the Case (B). +*> +*> if( IDENT != 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(2) W2: = (V1**T) * W2 = (unit_lower_tr_of_(A1)**T) * W2 +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(2) W1: = (V1**T) * W1 = (unit_lower_tr_of_(A1)**T) * W1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6) square A1: = A1 - W1 +*> end if +*> end if +*> +*> Case (B), when V1 is an identity matrix, i.e. IDENT == 'I', +*> less computations than in the Case (A) +*> +*> if( IDENT == 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(6) upper-triangular_of_(A1): = A1 - W1 +*> end if +*> end if +*> +*> Combine these cases (A) and (B) together, this is the resulting +*> algorithm: +*> +*> if ( N > K ) then +*> +*> (First Step - column block 2) +*> +*> col2_(1) W2: = A2 +*> if( IDENT != 'I' ) then +*> col2_(2) W2: = (V1**T) * W2 +*> = (unit_lower_tr_of_(A1)**T) * W2 +*> end if +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2] +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> if( IDENT != 'I' ) then +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> end if +*> col2_(7) A2: = A2 - W2 +*> +*> else +*> +*> (Second Step - column block 1) +*> +*> col1_(1) W1: = A1 +*> if( IDENT != 'I' ) then +*> col1_(2) W1: = (V1**T) * W1 +*> = (unit_lower_tr_of_(A1)**T) * W1 +*> end if +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> if( IDENT != 'I' ) then +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1) +*> end if +*> col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1) +*> +*> end if +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE SLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, + $ WORK, LDWORK ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER IDENT + INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), T( LDT, * ), + $ WORK( LDWORK, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. 
+ REAL ONE, ZERO
+ PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+* ..
+* .. Local Scalars ..
+ LOGICAL LNOTIDENT
+ INTEGER I, J
+* ..
+* .. External Functions ..
+ LOGICAL LSAME
+ EXTERNAL LSAME
+* ..
+* .. External Subroutines ..
+ EXTERNAL SCOPY, SGEMM, STRMM
+* ..
+* .. Executable Statements ..
+*
+* Quick return if possible
+*
+ IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N )
+ $ RETURN
+*
+ LNOTIDENT = .NOT.LSAME( IDENT, 'I' )
+*
+* ------------------------------------------------------------------
+*
+* First Step. Computation of the Column Block 2:
+*
+* ( A2 ) := H * ( A2 )
+* ( B2 ) ( B2 )
+*
+* ------------------------------------------------------------------
+*
+ IF( N.GT.K ) THEN
+*
+* col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N)
+* into W2=WORK(1:K, 1:N-K) column-by-column.
+*
+ DO J = 1, N-K
+ CALL SCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 )
+ END DO
+
+ IF( LNOTIDENT ) THEN
+*
+* col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2,
+* V1 is not an identity matrix, but unit lower-triangular
+* V1 stored in A1 (diagonal ones are not stored).
+*
+*
+ CALL STRMM( 'L', 'L', 'T', 'U', K, N-K, ONE, A, LDA,
+ $ WORK, LDWORK )
+ END IF
+*
+* col2_(3) Compute W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
+* V2 stored in B1.
+*
+ IF( M.GT.0 ) THEN
+ CALL SGEMM( 'T', 'N', K, N-K, M, ONE, B, LDB,
+ $ B( 1, K+1 ), LDB, ONE, WORK, LDWORK )
+ END IF
+*
+* col2_(4) Compute W2: = T * W2,
+* T is upper-triangular.
+*
+ CALL STRMM( 'L', 'U', 'N', 'N', K, N-K, ONE, T, LDT,
+ $ WORK, LDWORK )
+*
+* col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2,
+* V2 stored in B1.
+*
+ IF( M.GT.0 ) THEN
+ CALL SGEMM( 'N', 'N', M, N-K, K, -ONE, B, LDB,
+ $ WORK, LDWORK, ONE, B( 1, K+1 ), LDB )
+ END IF
+*
+ IF( LNOTIDENT ) THEN
+*
+* col2_(6) Compute W2: = V1 * W2 = A1 * W2,
+* V1 is not an identity matrix, but unit lower-triangular,
+* V1 stored in A1 (diagonal ones are not stored).
+*
+ CALL STRMM( 'L', 'L', 'N', 'U', K, N-K, ONE, A, LDA,
+ $ WORK, LDWORK )
+ END IF
+*
+* col2_(7) Compute A2: = A2 - W2 =
+* = A(1:K, K+1:N) - WORK(1:K, 1:N-K),
+* column-by-column.
+*
+ DO J = 1, N-K
+ DO I = 1, K
+ A( I, K+J ) = A( I, K+J ) - WORK( I, J )
+ END DO
+ END DO
+*
+ END IF
+*
+* ------------------------------------------------------------------
+*
+* Second Step. Computation of the Column Block 1:
+*
+* ( A1 ) := H * ( A1 )
+* ( B1 ) ( 0 )
+*
+* ------------------------------------------------------------------
+*
+* col1_(1) Compute W1: = A1. Copy the upper-triangular
+* A1 = A(1:K, 1:K) into the upper-triangular
+* W1 = WORK(1:K, 1:K) column-by-column.
+*
+ DO J = 1, K
+ CALL SCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 )
+ END DO
+*
+* Set the subdiagonal elements of W1 to zero column-by-column.
+*
+ DO J = 1, K - 1
+ DO I = J + 1, K
+ WORK( I, J ) = ZERO
+ END DO
+ END DO
+*
+ IF( LNOTIDENT ) THEN
+*
+* col1_(2) Compute W1: = (V1**T) * W1 = (A1**T) * W1,
+* V1 is not an identity matrix, but unit lower-triangular
+* V1 stored in A1 (diagonal ones are not stored),
+* W1 is upper-triangular with zeroes below the diagonal.
+*
+ CALL STRMM( 'L', 'L', 'T', 'U', K, K, ONE, A, LDA,
+ $ WORK, LDWORK )
+ END IF
+*
+* col1_(3) Compute W1: = T * W1,
+* T is upper-triangular,
+* W1 is upper-triangular with zeroes below the diagonal.
+*
+ CALL STRMM( 'L', 'U', 'N', 'N', K, K, ONE, T, LDT,
+ $ WORK, LDWORK )
+*
+* col1_(4) Compute B1: = - V2 * W1 = - B1 * W1,
+* V2 = B1, W1 is upper-triangular with zeroes below the diagonal.
+*
+ IF( M.GT.0 ) THEN
+ CALL STRMM( 'R', 'U', 'N', 'N', M, K, -ONE, WORK, LDWORK,
+ $ B, LDB )
+ END IF
+*
+ IF( LNOTIDENT ) THEN
+*
+* col1_(5) Compute W1: = V1 * W1 = A1 * W1,
+* V1 is not an identity matrix, but unit lower-triangular
+* V1 stored in A1 (diagonal ones are not stored),
+* W1 is upper-triangular on input with zeroes below the diagonal,
+* and square on output.
+*
+ CALL STRMM( 'L', 'L', 'N', 'U', K, K, ONE, A, LDA,
+ $ WORK, LDWORK )
+*
+* col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K)
+* column-by-column. A1 is upper-triangular on input.
+* If NOT IDENT, A1 is square on output, and W1 is square,
+* if IDENT, A1 is upper-triangular on output,
+* W1 is upper-triangular.
+*
+* col1_(6)_a Compute elements of A1 below the diagonal.
+*
+ DO J = 1, K - 1
+ DO I = J + 1, K
+ A( I, J ) = - WORK( I, J )
+ END DO
+ END DO
+*
+ END IF
+*
+* col1_(6)_b Compute elements of A1 on and above the diagonal.
+*
+ DO J = 1, K
+ DO I = 1, J
+ A( I, J ) = A( I, J ) - WORK( I, J )
+ END DO
+ END DO
+*
+ RETURN
+*
+* End of SLARFB_GETT
+*
+ END
diff --git a/lapack-netlib/SRC/sorgtsqr_row.f b/lapack-netlib/SRC/sorgtsqr_row.f
new file mode 100644
index 000000000..d2a2150cd
--- /dev/null
+++ b/lapack-netlib/SRC/sorgtsqr_row.f
@@ -0,0 +1,379 @@
+*> \brief \b SORGTSQR_ROW
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SORGTSQR_ROW + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
+* $ LWORK, INFO )
+* IMPLICIT NONE
+*
+* .. Scalar Arguments ..
+* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB
+* ..
+* .. Array Arguments ..
+* REAL A( LDA, * ), T( LDT, * ), WORK( * )
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SORGTSQR_ROW generates an M-by-N real matrix Q_out with
+*> orthonormal columns from the output of SLATSQR. These N orthonormal
+*> columns are the first N columns of a product of real orthogonal
+*> matrices Q(k)_in of order M, which are returned by SLATSQR in
+*> a special format.
+*>
+*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+*>
+*> The input matrices Q(k)_in are stored in row and column blocks in A.
+*> See the documentation of SLATSQR for more details on the format of
+*> Q(k)_in, where each Q(k)_in is represented by block Householder
+*> transformations. This routine calls an auxiliary routine SLARFB_GETT,
+*> where the computation is performed on each individual block. The
+*> algorithm first sweeps NB-sized column blocks from the right to left
+*> starting in the bottom row block and continues to the top row block
+*> (hence _ROW in the routine name). This sweep is in reverse order of
+*> the order in which SLATSQR generates the output blocks.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB
+*> \verbatim
+*> MB is INTEGER
+*> The row block size used by SLATSQR to return
+*> arrays A and T. MB > N.
+*> (Note that if MB > M, then M is used instead of MB
+*> as the row block size).
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*> NB is INTEGER
+*> The column block size used by SLATSQR to return
+*> arrays A and T. NB >= 1.
+*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by SLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See SLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is REAL array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See SLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup sigleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE SORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + REAL A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Local Scalars .. 
+ LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + REAL DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL SLARFB_GETT, SLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SORGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL SLASET('U', M, N, ZERO, ONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. +* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. +* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. +* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL SLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. +* +* KB is the column index of the current block reflector in +* the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. 
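+* (The trailing column block, which is processed first, is smaller
+* than NBLOCAL when NBLOCAL does not divide N.)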
+* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + IF( MB1-KB-KNB+1.EQ.0 ) THEN +* +* In SLARFB_GETT parameters, when M=0, then the matrix B +* does not exist, hence we need to pass a dummy array +* reference DUMMY(1,1) to B with LDDUMMY=1. +* + CALL SLARFB_GETT( 'N', 0, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ DUMMY( 1, 1 ), 1, WORK, KNB ) + ELSE + CALL SLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ A( KB+KNB, KB), LDA, WORK, KNB ) + + END IF +* + END DO +* + WORK( 1 ) = REAL( LWORKOPT ) + RETURN +* +* End of SORGTSQR_ROW +* + END diff --git a/lapack-netlib/SRC/zgetsqrhrt.f b/lapack-netlib/SRC/zgetsqrhrt.f new file mode 100644 index 000000000..5f0167937 --- /dev/null +++ b/lapack-netlib/SRC/zgetsqrhrt.f @@ -0,0 +1,349 @@ +*> \brief \b ZGETSQRHRT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGETSQRHRT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGETSQRHRT computes a NB2-sized column blocked QR-factorization +*> of a complex M-by-N matrix A with M >= N, +*> +*> A = Q * R. +*> +*> The routine uses internally a NB1-sized column blocked and MB1-sized +*> row blocked TSQR-factorization and perfors the reconstruction +*> of the Householder vectors from the TSQR output. The routine also +*> converts the R_tsqr factor from the TSQR-factorization output into +*> the R factor that corresponds to the Householder QR-factorization, +*> +*> A = Q_tsqr * R_tsqr = Q * R. +*> +*> The output Q and R factors are stored in the same format as in ZGEQRT +*> (Q is in blocked compact WY-representation). See the documentation +*> of ZGEQRT for more details on the format. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> The row block size to be used in the blocked TSQR. +*> MB1 > N. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> The column block size to be used in the blocked TSQR. +*> N >= NB1 >= 1. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> The block size to be used in the blocked QR that is +*> output. NB2 >= 1. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: an M-by-N matrix A. +*> +*> On exit: +*> a) the elements on and above the diagonal +*> of the array contain the N-by-N upper-triangular +*> matrix R corresponding to the Householder QR; +*> b) the elements below the diagonal represent Q by +*> the columns of blocked V (compact WY-representation). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). 
+*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is COMPLEX*16 array, dimension (LDT,N)) +*> The upper triangular block reflectors stored in compact form +*> as a sequence of upper triangular blocks. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= NB2. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> where +*> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), +*> NB1LOCAL = MIN(NB1,N). +*> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, +*> LW1 = NB1LOCAL * N, +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup comlpex16OTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, + $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS +* .. +* .. External Subroutines .. + EXTERNAL ZCOPY, ZLATSQR, ZUNGTSQR_ROW, ZUNHR_COL, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, DCMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB1.LE.N ) THEN + INFO = -3 + ELSE IF( NB1.LT.1 ) THEN + INFO = -4 + ELSE IF( NB2.LT.1 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + INFO = -9 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array: +* a) Matrix T and WORK for ZLATSQR; +* b) N-by-N upper-triangular factor R_tsqr; +* c) Matrix T and array WORK for ZUNGTSQR_ROW; +* d) Diagonal D for ZUNHR_COL. +* + IF( LWORK.LT.N*N+1 .AND. 
.NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. +* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of ZUNGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL ZLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL ZCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL ZUNGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL ZUNHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. +* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the ZUNHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. +* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-CONE ) THEN + DO J = I, N + A( I, J ) = -CONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL ZCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN +* +* End of ZGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/zlarfb_gett.f b/lapack-netlib/SRC/zlarfb_gett.f new file mode 100644 index 000000000..4a3c4dcf1 --- /dev/null +++ b/lapack-netlib/SRC/zlarfb_gett.f @@ -0,0 +1,597 @@ +*> \brief \b ZLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. 
+*       COMPLEX*16        A( LDA, * ), B( LDB, * ), T( LDT, * ),
+*      $                  WORK( LDWORK, * )
+*       ..
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLARFB_GETT applies a complex Householder block reflector H from the
+*> left to a complex (K+M)-by-N "triangular-pentagonal" matrix
+*> composed of two block matrices: an upper trapezoidal K-by-N matrix A
+*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
+*> in the array B. The block reflector H is stored in a compact
+*> WY-representation, where the elementary reflectors are in the
+*> arrays A, B and T. See Further Details section.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] IDENT
+*> \verbatim
+*>          IDENT is CHARACTER*1
+*>          If IDENT is not 'I' and not 'i', then V1 is unit
+*>             lower-triangular and stored in the left K-by-K block of
+*>             the input matrix A,
+*>          If IDENT = 'I' or 'i', then V1 is an identity matrix and
+*>             not stored.
+*>          See Further Details section.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix B.
+*>          M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrices A and B.
+*>          N >= 0.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The number of rows of the matrix A.
+*>          K is also the order of the matrix T, i.e. the number of
+*>          elementary reflectors whose product defines the block
+*>          reflector. 0 <= K <= N.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is COMPLEX*16 array, dimension (LDT,K)
+*>          The upper-triangular K-by-K matrix T in the representation
+*>          of the block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N)
+*>
+*>          On entry:
+*>           a) In the K-by-N upper-trapezoidal part A: input matrix A.
+*>           b) In the columns below the diagonal: columns of V1
+*>              (ones are not stored on the diagonal).
+*>
+*>          On exit:
+*>            A is overwritten by rectangular K-by-N product H*A.
+*>
+*>          See Further Details section.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,K).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is COMPLEX*16 array, dimension (LDB,N)
+*>
+*>          On entry:
+*>           a) In the M-by-(N-K) right block: input matrix B.
+*>           b) In the M-by-K left block: columns of V2.
+*>
+*>          On exit:
+*>            B is overwritten by rectangular M-by-N product H*B.
+*>
+*>          See Further Details section.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B. LDB >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array,
+*>          dimension (LDWORK,max(K,N-K))
+*> \endverbatim
+*>
+*> \param[in] LDWORK
+*> \verbatim
+*>          LDWORK is INTEGER
+*>          The leading dimension of the array WORK. LDWORK>=max(1,K).
+*>
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
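+*
+*> \par Example:
+*  ==============
+*>
+*> \verbatim
+*>
+*>  A minimal illustrative call sequence (a sketch only, not part of
+*>  the reference code; the sizes M, N, K and the leading dimensions
+*>  here are hypothetical values chosen to satisfy the constraints
+*>  stated above, with V1 taken as the identity):
+*>
+*>     INTEGER            M, N, K
+*>     PARAMETER          ( M = 6, N = 4, K = 2 )
+*>     COMPLEX*16         A( K, N ), B( M, N ), T( K, K )
+*>     COMPLEX*16         WORK( K, N-K )
+*>     ...
+*>     (fill A(1:K,1:N), B(1:M,1:N) and T(1:K,1:K) as described in
+*>      the Further Details section, e.g. from ZLATSQR output)
+*>     ...
+*>     CALL ZLARFB_GETT( 'I', M, N, K, T, K, A, K, B, M, WORK, K )
+*>
+*> \endverbatim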
+*
+*> \ingroup complex16OTHERauxiliary
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2020, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*> (1) Description of the Algebraic Operation.
+*>
+*>     The matrix A is a K-by-N matrix composed of two column block
+*>     matrices, A1, which is K-by-K, and A2, which is K-by-(N-K):
+*>     A = ( A1, A2 ).
+*>     The matrix B is an M-by-N matrix composed of two column block
+*>     matrices, B1, which is M-by-K, and B2, which is M-by-(N-K):
+*>     B = ( B1, B2 ).
+*>
+*>     Perform the operation:
+*>
+*>        ( A_out ) := H * ( A_in ) = ( I - V * T * V**H ) * ( A_in ) =
+*>        ( B_out )        ( B_in )                          ( B_in )
+*>                 = ( I - ( V1 ) * T * ( V1**H, V2**H ) ) * ( A_in )
+*>                         ( V2 )                            ( B_in )
+*>     On input:
+*>
+*>      a) ( A_in ) consists of two block columns:
+*>         ( B_in )
+*>
+*>         ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in ))
+*>         ( B_in )   (( B1_in ) ( B2_in ))   ((  0    ) ( B2_in )),
+*>
+*>         where the column blocks are:
+*>
+*>         ( A1_in )  is a K-by-K upper-triangular matrix stored in the
+*>                    upper triangular part of the array A(1:K,1:K).
+*>         ( B1_in )  is an M-by-K rectangular ZERO matrix and not stored.
+*>
+*>         ( A2_in )  is a K-by-(N-K) rectangular matrix stored
+*>                    in the array A(1:K,K+1:N).
+*>         ( B2_in )  is an M-by-(N-K) rectangular matrix stored
+*>                    in the array B(1:M,K+1:N).
+*>
+*>      b) V = ( V1 )
+*>             ( V2 )
+*>
+*>         where:
+*>         1) if IDENT == 'I', V1 is a K-by-K identity matrix, not stored;
+*>         2) if IDENT != 'I', V1 is a K-by-K unit lower-triangular matrix,
+*>            stored in the lower-triangular part of the array
+*>            A(1:K,1:K) (ones are not stored),
+*>         and V2 is an M-by-K rectangular matrix stored in the array
+*>         B(1:M,1:K), (because on input B1_in is a rectangular zero
+*>         matrix that is not stored and the space is
+*>         used to store V2).
+*>
+*>      c) T is a K-by-K upper-triangular matrix stored
+*>         in the array T(1:K,1:K).
+*>
+*>     On output:
+*>
+*>      a) ( A_out ) consists of two block columns:
+*>         ( B_out )
+*>
+*>         ( A_out ) = (( A1_out ) ( A2_out ))
+*>         ( B_out )   (( B1_out ) ( B2_out )),
+*>
+*>         where the column blocks are:
+*>
+*>         ( A1_out )  is a K-by-K square matrix, or a K-by-K
+*>                     upper-triangular matrix, if V1 is an
+*>                     identity matrix. A1_out is stored in
+*>                     the array A(1:K,1:K).
+*>         ( B1_out )  is an M-by-K rectangular matrix stored
+*>                     in the array B(1:M,1:K).
+*>
+*>         ( A2_out )  is a K-by-(N-K) rectangular matrix stored
+*>                     in the array A(1:K,K+1:N).
+*>         ( B2_out )  is an M-by-(N-K) rectangular matrix stored
+*>                     in the array B(1:M,K+1:N).
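+*>
+*>     For instance, with K=2, N=4 and M=3 (an illustrative layout,
+*>     not taken from the reference text), the array A holds A1 in
+*>     the upper triangle of A(1:2,1:2) with the column of V1 below
+*>     its diagonal, and A2 in A(1:2,3:4); the array B holds V2 in
+*>     B(1:3,1:2) and B2 in B(1:3,3:4), and on exit B1_out overwrites
+*>     V2 in B(1:3,1:2).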
+*>
+*>
+*>     The operation above can be represented as the same operation
+*>     on each block column:
+*>
+*>        ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**H ) * ( A1_in )
+*>        ( B1_out )        (  0    )                          (  0    )
+*>
+*>        ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**H ) * ( A2_in )
+*>        ( B2_out )        ( B2_in )                          ( B2_in )
+*>
+*>     If IDENT != 'I':
+*>
+*>     The computation for column block 1:
+*>
+*>        A1_out: = A1_in - V1*T*(V1**H)*A1_in
+*>
+*>        B1_out: = - V2*T*(V1**H)*A1_in
+*>
+*>     The computation for column block 2, which exists if N > K:
+*>
+*>        A2_out: = A2_in - V1*T*( (V1**H)*A2_in + (V2**H)*B2_in )
+*>
+*>        B2_out: = B2_in - V2*T*( (V1**H)*A2_in + (V2**H)*B2_in )
+*>
+*>     If IDENT == 'I':
+*>
+*>     The operation for column block 1:
+*>
+*>        A1_out: = A1_in - V1*T*A1_in
+*>
+*>        B1_out: = - V2*T*A1_in
+*>
+*>     The computation for column block 2, which exists if N > K:
+*>
+*>        A2_out: = A2_in - T*( A2_in + (V2**H)*B2_in )
+*>
+*>        B2_out: = B2_in - V2*T*( A2_in + (V2**H)*B2_in )
+*>
+*> (2) Description of the Algorithmic Computation.
+*>
+*>     In the first step, we compute column block 2, i.e. A2 and B2.
+*>     Here, we need to use the K-by-(N-K) rectangular workspace
+*>     matrix W2 that is of the same size as the matrix A2.
+*>     W2 is stored in the array WORK(1:K,1:(N-K)).
+*>
+*>     In the second step, we compute column block 1, i.e. A1 and B1.
+*>     Here, we need to use the K-by-K square workspace matrix W1
+*>     that is of the same size as the matrix A1.
+*>     W1 is stored in the array WORK(1:K,1:K).
+*>
+*>     NOTE: Hence, in this routine, we need the workspace array WORK
+*>     only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from
+*>     the first step and W1 from the second step.
+*>
+*>     Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I',
+*>     requires more computations than Case (B).
+*>
+*>     if( IDENT != 'I' ) then
+*>      if ( N > K ) then
+*>        (First Step - column block 2)
+*>        col2_(1) W2: = A2
+*>        col2_(2) W2: = (V1**H) * W2 = (unit_lower_tr_of_(A1)**H) * W2
+*>        col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
+*>        col2_(4) W2: = T * W2
+*>        col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*>        col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
+*>        col2_(7) A2: = A2 - W2
+*>      else
+*>        (Second Step - column block 1)
+*>        col1_(1) W1: = A1
+*>        col1_(2) W1: = (V1**H) * W1 = (unit_lower_tr_of_(A1)**H) * W1
+*>        col1_(3) W1: = T * W1
+*>        col1_(4) B1: = - V2 * W1 = - B1 * W1
+*>        col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
+*>        col1_(6) square A1: = A1 - W1
+*>      end if
+*>     end if
+*>
+*>     Case (B), when V1 is an identity matrix, i.e.
IDENT == 'I',
+*>     requires fewer computations than Case (A).
+*>
+*>     if( IDENT == 'I' ) then
+*>      if ( N > K ) then
+*>        (First Step - column block 2)
+*>        col2_(1) W2: = A2
+*>        col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
+*>        col2_(4) W2: = T * W2
+*>        col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*>        col2_(7) A2: = A2 - W2
+*>      else
+*>        (Second Step - column block 1)
+*>        col1_(1) W1: = A1
+*>        col1_(3) W1: = T * W1
+*>        col1_(4) B1: = - V2 * W1 = - B1 * W1
+*>        col1_(6) upper-triangular_of_(A1): = A1 - W1
+*>      end if
+*>     end if
+*>
+*>     Combining cases (A) and (B), this is the resulting
+*>     algorithm:
+*>
+*>     if ( N > K ) then
+*>
+*>      (First Step - column block 2)
+*>
+*>       col2_(1) W2: = A2
+*>       if( IDENT != 'I' ) then
+*>         col2_(2) W2: = (V1**H) * W2
+*>                      = (unit_lower_tr_of_(A1)**H) * W2
+*>       end if
+*>       col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
+*>       col2_(4) W2: = T * W2
+*>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*>       if( IDENT != 'I' ) then
+*>         col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
+*>       end if
+*>       col2_(7) A2: = A2 - W2
+*>
+*>     else
+*>
+*>      (Second Step - column block 1)
+*>
+*>       col1_(1) W1: = A1
+*>       if( IDENT != 'I' ) then
+*>         col1_(2) W1: = (V1**H) * W1
+*>                      = (unit_lower_tr_of_(A1)**H) * W1
+*>       end if
+*>       col1_(3) W1: = T * W1
+*>       col1_(4) B1: = - V2 * W1 = - B1 * W1
+*>       if( IDENT != 'I' ) then
+*>         col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
+*>         col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1)
+*>       end if
+*>       col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1)
+*>
+*>     end if
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE ZLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
+     $                        WORK, LDWORK )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          IDENT
+      INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), B( LDB, * ), T( LDT, * ),
+     $                   WORK( LDWORK, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX*16         CONE, CZERO
+      PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ),
+     $                     CZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LNOTIDENT
+      INTEGER            I, J
+*     ..
+*     .. EXTERNAL FUNCTIONS ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZCOPY, ZGEMM, ZTRMM
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N )
+     $   RETURN
+*
+      LNOTIDENT = .NOT.LSAME( IDENT, 'I' )
+*
+*     ------------------------------------------------------------------
+*
+*     First Step. Computation of the Column Block 2:
+*
+*        ( A2 ) := H * ( A2 )
+*        ( B2 )        ( B2 )
+*
+*     ------------------------------------------------------------------
+*
+      IF( N.GT.K ) THEN
+*
+*        col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N)
+*        into W2=WORK(1:K, 1:N-K) column-by-column.
+*
+         DO J = 1, N-K
+            CALL ZCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 )
+         END DO
+
+         IF( LNOTIDENT ) THEN
+*
+*           col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2,
+*           V1 is not an identity matrix, but unit lower-triangular
+*           V1 stored in A1 (diagonal ones are not stored).
+*
+*
+            CALL ZTRMM( 'L', 'L', 'C', 'U', K, N-K, CONE, A, LDA,
+     $                  WORK, LDWORK )
+         END IF
+*
+*        col2_(3) Compute W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
+*        V2 stored in B1.
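+*
+*        (The ZGEMM call below accumulates the K-by-(N-K) update
+*         W2 := (B1**H) * B2 + W2, with B1 = B(1:M,1:K) passed as B
+*         and B2 = B(1:M,K+1:N) passed as B(1,K+1); it is skipped
+*         when M = 0 because B is then empty.)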
+*
+      IF( M.GT.0 ) THEN
+         CALL ZGEMM( 'C', 'N', K, N-K, M, CONE, B, LDB,
+     $               B( 1, K+1 ), LDB, CONE, WORK, LDWORK )
+      END IF
+*
+*        col2_(4) Compute W2: = T * W2,
+*        T is upper-triangular.
+*
+         CALL ZTRMM( 'L', 'U', 'N', 'N', K, N-K, CONE, T, LDT,
+     $               WORK, LDWORK )
+*
+*        col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2,
+*        V2 stored in B1.
+*
+         IF( M.GT.0 ) THEN
+            CALL ZGEMM( 'N', 'N', M, N-K, K, -CONE, B, LDB,
+     $                  WORK, LDWORK, CONE, B( 1, K+1 ), LDB )
+         END IF
+*
+         IF( LNOTIDENT ) THEN
+*
+*           col2_(6) Compute W2: = V1 * W2 = A1 * W2,
+*           V1 is not an identity matrix, but unit lower-triangular,
+*           V1 stored in A1 (diagonal ones are not stored).
+*
+            CALL ZTRMM( 'L', 'L', 'N', 'U', K, N-K, CONE, A, LDA,
+     $                  WORK, LDWORK )
+         END IF
+*
+*        col2_(7) Compute A2: = A2 - W2 =
+*                 = A(1:K, K+1:N) - WORK(1:K, 1:N-K),
+*        column-by-column.
+*
+         DO J = 1, N-K
+            DO I = 1, K
+               A( I, K+J ) = A( I, K+J ) - WORK( I, J )
+            END DO
+         END DO
+*
+      END IF
+*
+*     ------------------------------------------------------------------
+*
+*     Second Step. Computation of the Column Block 1:
+*
+*        ( A1 ) := H * ( A1 )
+*        ( B1 )        (  0 )
+*
+*     ------------------------------------------------------------------
+*
+*     col1_(1) Compute W1: = A1. Copy the upper-triangular
+*     A1 = A(1:K, 1:K) into the upper-triangular
+*     W1 = WORK(1:K, 1:K) column-by-column.
+*
+      DO J = 1, K
+         CALL ZCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 )
+      END DO
+*
+*     Set the subdiagonal elements of W1 to zero column-by-column.
+*
+      DO J = 1, K - 1
+         DO I = J + 1, K
+            WORK( I, J ) = CZERO
+         END DO
+      END DO
+*
+      IF( LNOTIDENT ) THEN
+*
+*        col1_(2) Compute W1: = (V1**H) * W1 = (A1**H) * W1,
+*        V1 is not an identity matrix, but unit lower-triangular
+*        V1 stored in A1 (diagonal ones are not stored),
+*        W1 is upper-triangular with zeroes below the diagonal.
+*
+         CALL ZTRMM( 'L', 'L', 'C', 'U', K, K, CONE, A, LDA,
+     $               WORK, LDWORK )
+      END IF
+*
+*     col1_(3) Compute W1: = T * W1,
+*     T is upper-triangular,
+*     W1 is upper-triangular with zeroes below the diagonal.
+*
+      CALL ZTRMM( 'L', 'U', 'N', 'N', K, K, CONE, T, LDT,
+     $            WORK, LDWORK )
+*
+*     col1_(4) Compute B1: = - V2 * W1 = - B1 * W1,
+*     V2 = B1, W1 is upper-triangular with zeroes below the diagonal.
+*
+      IF( M.GT.0 ) THEN
+         CALL ZTRMM( 'R', 'U', 'N', 'N', M, K, -CONE, WORK, LDWORK,
+     $               B, LDB )
+      END IF
+*
+      IF( LNOTIDENT ) THEN
+*
+*        col1_(5) Compute W1: = V1 * W1 = A1 * W1,
+*        V1 is not an identity matrix, but unit lower-triangular
+*        V1 stored in A1 (diagonal ones are not stored),
+*        W1 is upper-triangular on input with zeroes below the diagonal,
+*        and square on output.
+*
+         CALL ZTRMM( 'L', 'L', 'N', 'U', K, K, CONE, A, LDA,
+     $               WORK, LDWORK )
+*
+*        col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K)
+*        column-by-column. A1 is upper-triangular on input.
+*        If NOT IDENT, A1 is square on output, and W1 is square,
+*        if IDENT, A1 is upper-triangular on output,
+*        W1 is upper-triangular.
+*
+*        col1_(6)_a Compute elements of A1 below the diagonal.
+*
+         DO J = 1, K - 1
+            DO I = J + 1, K
+               A( I, J ) = - WORK( I, J )
+            END DO
+         END DO
+*
+      END IF
+*
+*     col1_(6)_b Compute elements of A1 on and above the diagonal.
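+*     (This final loop is shared by both the IDENT and the NOT IDENT
+*      cases: only the upper-triangular parts of A1 and W1 are
+*      subtracted here, while the strictly lower part of A1 was
+*      already set in col1_(6)_a when V1 is stored.)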
+* + DO J = 1, K + DO I = 1, J + A( I, J ) = A( I, J ) - WORK( I, J ) + END DO + END DO +* + RETURN +* +* End of ZLARFB_GETT +* + END diff --git a/lapack-netlib/SRC/zungtsqr_row.f b/lapack-netlib/SRC/zungtsqr_row.f new file mode 100644 index 000000000..0d32ad6ce --- /dev/null +++ b/lapack-netlib/SRC/zungtsqr_row.f @@ -0,0 +1,380 @@ +*> \brief \b ZUNGTSQR_ROW +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZUNGTSQR_ROW + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZUNGTSQR_ROW generates an M-by-N complex matrix Q_out with +*> orthonormal columns from the output of ZLATSQR. These N orthonormal +*> columns are the first N columns of a product of complex unitary +*> matrices Q(k)_in of order M, which are returned by ZLATSQR in +*> a special format. +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). +*> +*> The input matrices Q(k)_in are stored in row and column blocks in A. +*> See the documentation of ZLATSQR for more details on the format of +*> Q(k)_in, where each Q(k)_in is represented by block Householder +*> transformations. This routine calls an auxiliary routine ZLARFB_GETT, +*> where the computation is performed on each individual block. The +*> algorithm first sweeps NB-sized column blocks from the right to left +*> starting in the bottom row block and continues to the top row block +*> (hence _ROW in the routine name). This sweep is in reverse order of +*> the order in which ZLATSQR generates the output blocks. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by ZLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size). +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by ZLATSQR to return +*> arrays A and T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by ZLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See ZLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). 
+*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX*16 array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See ZLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16OTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE, CZERO + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), + $ CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + COMPLEX*16 DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL ZLARFB_GETT, ZLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. 
.NOT.LQUERY ) THEN
+         INFO = -10
+      END IF
+*
+      NBLOCAL = MIN( NB, N )
+*
+*     Determine the workspace size.
+*
+      IF( INFO.EQ.0 ) THEN
+         LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) )
+      END IF
+*
+*     Handle error in the input parameters and handle the workspace query.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZUNGTSQR_ROW', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+         WORK( 1 ) = DCMPLX( LWORKOPT )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 ) THEN
+         WORK( 1 ) = DCMPLX( LWORKOPT )
+         RETURN
+      END IF
+*
+*     (0) Set the upper-triangular part of the matrix A to zero and
+*     its diagonal elements to one.
+*
+      CALL ZLASET('U', M, N, CZERO, CONE, A, LDA )
+*
+*     KB_LAST is the column index of the last column block reflector
+*     in the matrices T and V.
+*
+      KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1
+*
+*
+*     (1) Bottom-up loop over row blocks of A, except the top row block.
+*     NOTE: If MB>=M, then the loop is never executed.
+*
+      IF ( MB.LT.M ) THEN
+*
+*        MB2 is the row blocking size for the row blocks before the
+*        first top row block in the matrix A. IB is the row index for
+*        the row blocks in the matrix A before the first top row block.
+*        IB_BOTTOM is the row index for the last bottom row block
+*        in the matrix A. JB_T is the column index of the corresponding
+*        column block in the matrix T.
+*
+*        Initialize variables.
+*
+*        NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A
+*        including the first row block.
+*
+         MB2 = MB - N
+         M_PLUS_ONE = M + 1
+         ITMP = ( M - MB - 1 ) / MB2
+         IB_BOTTOM = ITMP * MB2 + MB + 1
+         NUM_ALL_ROW_BLOCKS = ITMP + 2
+         JB_T = NUM_ALL_ROW_BLOCKS * N + 1
+*
+         DO IB = IB_BOTTOM, MB+1, -MB2
+*
+*           Determine the block size IMB for the current row block
+*           in the matrix A.
+*
+            IMB = MIN( M_PLUS_ONE - IB, MB2 )
+*
+*           Determine the column index JB_T for the current column block
+*           in the matrix T.
+*
+            JB_T = JB_T - N
+*
+*           Apply column blocks of H in the row block from right to left.
+*
+*           KB is the column index of the current column block reflector
+*           in the matrices T and V.
+*
+            DO KB = KB_LAST, 1, -NBLOCAL
+*
+*              Determine the size of the current column block KNB in
+*              the matrices T and V.
+*
+               KNB = MIN( NBLOCAL, N - KB + 1 )
+*
+               CALL ZLARFB_GETT( 'I', IMB, N-KB+1, KNB,
+     $                        T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA,
+     $                        A( IB, KB ), LDA, WORK, KNB )
+*
+            END DO
+*
+         END DO
+*
+      END IF
+*
+*     (2) Top row block of A.
+*     NOTE: If MB>=M, then we have only one row block of A of size M
+*     and we work on the entire matrix A.
+*
+      MB1 = MIN( MB, M )
+*
+*     Apply column blocks of H in the top row block from right to left.
+*
+*     KB is the column index of the current block reflector in
+*     the matrices T and V.
+*
+      DO KB = KB_LAST, 1, -NBLOCAL
+*
+*        Determine the size of the current column block KNB in
+*        the matrices T and V.
+*
+         KNB = MIN( NBLOCAL, N - KB + 1 )
+*
+         IF( MB1-KB-KNB+1.EQ.0 ) THEN
+*
+*           In ZLARFB_GETT parameters, when M=0, then the matrix B
+*           does not exist, hence we need to pass a dummy array
+*           reference DUMMY(1,1) to B with LDDUMMY=1.
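+*
+*           (Here MB1-KB-KNB+1 is the number of rows of A below the
+*            current KNB-by-KNB diagonal block within the top row
+*            block; it can be zero for the trailing column block.)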
+* + CALL ZLARFB_GETT( 'N', 0, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ DUMMY( 1, 1 ), 1, WORK, KNB ) + ELSE + CALL ZLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ A( KB+KNB, KB), LDA, WORK, KNB ) + + END IF +* + END DO +* + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN +* +* End of ZUNGTSQR_ROW +* + END From 88b70fba3e5a655ea56e7019b614fc4d3c2881bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 May 2021 19:28:21 +0200 Subject: [PATCH 210/681] Add new tests for Householder reconstruction functions from 3.9.1 --- lapack-netlib/TESTING/LIN/CMakeLists.txt | 8 +- lapack-netlib/TESTING/LIN/Makefile | 8 +- lapack-netlib/TESTING/LIN/cchkunhr_col.f | 97 +++++- lapack-netlib/TESTING/LIN/cunhr_col01.f | 67 ++-- lapack-netlib/TESTING/LIN/cunhr_col02.f | 381 +++++++++++++++++++++++ lapack-netlib/TESTING/LIN/dchkorhr_col.f | 97 +++++- lapack-netlib/TESTING/LIN/dorhr_col01.f | 57 +++- lapack-netlib/TESTING/LIN/dorhr_col02.f | 377 ++++++++++++++++++++++ lapack-netlib/TESTING/LIN/schkorhr_col.f | 95 +++++- lapack-netlib/TESTING/LIN/sorhr_col01.f | 65 ++-- lapack-netlib/TESTING/LIN/sorhr_col02.f | 376 ++++++++++++++++++++++ lapack-netlib/TESTING/LIN/zchkunhr_col.f | 97 +++++- lapack-netlib/TESTING/LIN/zunhr_col01.f | 61 +++- lapack-netlib/TESTING/LIN/zunhr_col02.f | 381 +++++++++++++++++++++++ 14 files changed, 2033 insertions(+), 134 deletions(-) create mode 100644 lapack-netlib/TESTING/LIN/cunhr_col02.f create mode 100644 lapack-netlib/TESTING/LIN/dorhr_col02.f create mode 100644 lapack-netlib/TESTING/LIN/sorhr_col02.f create mode 100644 lapack-netlib/TESTING/LIN/zunhr_col02.f diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index 0d0bb5418..309ed7e77 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -40,7 +40,7 @@ set(SLINTST schkaa.f sgennd.f sqrt04.f sqrt05.f schkqrt.f serrqrt.f schkqrtp.f serrqrtp.f schklqt.f schklqtp.f schktsqr.f serrlqt.f serrlqtp.f serrtsqr.f stsqr01.f slqt04.f slqt05.f - schkorhr_col.f serrorhr_col.f sorhr_col01.f) + schkorhr_col.f serrorhr_col.f sorhr_col01.f sorhr_col02.f) if(USE_XBLAS) list(APPEND SLINTST sdrvgbx.f sdrvgex.f sdrvsyx.f sdrvpox.f @@ -96,7 +96,7 @@ set(CLINTST cchkaa.f cqrt04.f cqrt05.f cchkqrt.f cerrqrt.f cchkqrtp.f cerrqrtp.f cchklqt.f cchklqtp.f cchktsqr.f cerrlqt.f cerrlqtp.f cerrtsqr.f ctsqr01.f clqt04.f clqt05.f - cchkunhr_col.f cerrunhr_col.f cunhr_col01.f) + cchkunhr_col.f cerrunhr_col.f cunhr_col01.f cunhr_col02.f) if(USE_XBLAS) list(APPEND CLINTST cdrvgbx.f cdrvgex.f cdrvhex.f cdrvsyx.f cdrvpox.f @@ -142,7 +142,7 @@ set(DLINTST dchkaa.f dqrt04.f dqrt05.f dchkqrt.f derrqrt.f dchkqrtp.f derrqrtp.f dchklq.f dchklqt.f dchklqtp.f dchktsqr.f derrlqt.f derrlqtp.f derrtsqr.f dtsqr01.f dlqt04.f dlqt05.f - dchkorhr_col.f derrorhr_col.f dorhr_col01.f) + dchkorhr_col.f derrorhr_col.f dorhr_col01.f dorhr_col02.f) if(USE_XBLAS) list(APPEND DLINTST ddrvgbx.f ddrvgex.f ddrvsyx.f ddrvpox.f @@ -198,7 +198,7 @@ set(ZLINTST zchkaa.f zqrt04.f zqrt05.f zchkqrt.f zerrqrt.f zchkqrtp.f zerrqrtp.f zchklqt.f zchklqtp.f zchktsqr.f zerrlqt.f zerrlqtp.f zerrtsqr.f ztsqr01.f zlqt04.f zlqt05.f - zchkunhr_col.f zerrunhr_col.f zunhr_col01.f) + zchkunhr_col.f zerrunhr_col.f zunhr_col01.f zunhr_col02.f) if(USE_XBLAS) list(APPEND ZLINTST zdrvgbx.f zdrvgex.f zdrvhex.f zdrvsyx.f zdrvpox.f diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile index 6e790aa93..674265816 100644 --- 
a/lapack-netlib/TESTING/LIN/Makefile +++ b/lapack-netlib/TESTING/LIN/Makefile @@ -74,7 +74,7 @@ SLINTST = schkaa.o \ sgennd.o sqrt04.o sqrt05.o schkqrt.o serrqrt.o schkqrtp.o serrqrtp.o \ schklqt.o schklqtp.o schktsqr.o \ serrlqt.o serrlqtp.o serrtsqr.o stsqr01.o slqt04.o slqt05.o \ - schkorhr_col.o serrorhr_col.o sorhr_col01.o + schkorhr_col.o serrorhr_col.o sorhr_col01.o sorhr_col02.o ifdef USEXBLAS SLINTST += sdrvgbx.o sdrvgex.o sdrvsyx.o sdrvpox.o \ @@ -123,7 +123,7 @@ CLINTST = cchkaa.o \ cqrt04.o cqrt05.o cchkqrt.o cerrqrt.o cchkqrtp.o cerrqrtp.o \ cchklqt.o cchklqtp.o cchktsqr.o \ cerrlqt.o cerrlqtp.o cerrtsqr.o ctsqr01.o clqt04.o clqt05.o \ - cchkunhr_col.o cerrunhr_col.o cunhr_col01.o + cchkunhr_col.o cerrunhr_col.o cunhr_col01.o cunhr_col02.o ifdef USEXBLAS CLINTST += cdrvgbx.o cdrvgex.o cdrvhex.o cdrvsyx.o cdrvpox.o \ @@ -167,7 +167,7 @@ DLINTST = dchkaa.o \ dqrt04.o dqrt05.o dchkqrt.o derrqrt.o dchkqrtp.o derrqrtp.o \ dchklq.o dchklqt.o dchklqtp.o dchktsqr.o \ derrlqt.o derrlqtp.o derrtsqr.o dtsqr01.o dlqt04.o dlqt05.o \ - dchkorhr_col.o derrorhr_col.o dorhr_col01.o + dchkorhr_col.o derrorhr_col.o dorhr_col01.o dorhr_col02.o ifdef USEXBLAS DLINTST += ddrvgbx.o ddrvgex.o ddrvsyx.o ddrvpox.o \ @@ -215,7 +215,7 @@ ZLINTST = zchkaa.o \ zqrt04.o zqrt05.o zchkqrt.o zerrqrt.o zchkqrtp.o zerrqrtp.o \ zchklqt.o zchklqtp.o zchktsqr.o \ zerrlqt.o zerrlqtp.o zerrtsqr.o ztsqr01.o zlqt04.o zlqt05.o \ - zchkunhr_col.o zerrunhr_col.o zunhr_col01.o + zchkunhr_col.o zerrunhr_col.o zunhr_col01.o zunhr_col02.o ifdef USEXBLAS ZLINTST += zdrvgbx.o zdrvgex.o zdrvhex.o zdrvsyx.o zdrvpox.o \ diff --git a/lapack-netlib/TESTING/LIN/cchkunhr_col.f b/lapack-netlib/TESTING/LIN/cchkunhr_col.f index 00077ddd9..0d6a9063d 100644 --- a/lapack-netlib/TESTING/LIN/cchkunhr_col.f +++ b/lapack-netlib/TESTING/LIN/cchkunhr_col.f @@ -24,9 +24,12 @@ *> *> \verbatim *> -*> CCHKUNHR_COL tests CUNHR_COL using CLATSQR and CGEMQRT. Therefore, CLATSQR -*> (used in CGEQR) and CGEMQRT (used in CGEMQR) have to be tested -*> before this test. +*> CCHKUNHR_COL tests: +*> 1) CUNGTSQR and CUNHR_COL using CLATSQR, CGEMQRT, +*> 2) CUNGTSQR_ROW and CUNHR_COL inside CGETSQRHRT +*> (which calls CLATSQR, CUNGTSQR_ROW and CUNHR_COL) using CGEMQRT. +*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR) +*> have to be tested before this test. *> *> \endverbatim * @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex_lin * * ===================================================================== - SUBROUTINE CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,10 +135,11 @@ REAL RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, CERRUNHR_COL, CUNHR_COL01 + EXTERNAL ALAHD, ALASUM, CERRUNHR_COL, CUNHR_COL01, + $ CUNHR_COL02 * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX, MIN * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -201,8 +202,8 @@ * * Test CUNHR_COL * - CALL CUNHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL CUNHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test CUNHR_COL +* + CALL CUNHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. * CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'CUNGTSQR and CUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'CUNGTSQR_ROW and CUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of CCHKUNHR_COL diff --git a/lapack-netlib/TESTING/LIN/cunhr_col01.f b/lapack-netlib/TESTING/LIN/cunhr_col01.f index d760caba5..d77d60b1a 100644 --- a/lapack-netlib/TESTING/LIN/cunhr_col01.f +++ b/lapack-netlib/TESTING/LIN/cunhr_col01.f @@ -13,7 +13,7 @@ * .. Scalar Arguments .. * INTEGER M, N, MB1, NB1, NB2 * .. Return values .. -* REAL RESULT(6) +* DOUBLE PRECISION RESULT(6) * * *> \par Purpose: @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> CUNHR_COL01 tests CUNHR_COL using CLATSQR, CGEMQRT and CUNGTSQR. -*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part CGEMQR), CUNGTSQR +*> CUNHR_COL01 tests CUNGTSQR and CUNHR_COL using CLATSQR, CGEMQRT. +*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is REAL array, dimension (6) *> Results of each of the six tests below. -*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in CGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. 
+*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using CGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using CGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* -*> \ingroup complex16_lin +*> \ingroup complex_lin * * ===================================================================== SUBROUTINE CUNHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 @@ -102,10 +131,10 @@ * * .. * .. Local allocatable arrays - COMPLEX, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + COMPLEX , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), $ C(:,:), CF(:,:), D(:,:), DF(:,:) - REAL, ALLOCATABLE :: RWORK(:) + REAL , ALLOCATABLE :: RWORK(:) * * .. Parameters .. REAL ZERO @@ -218,7 +247,7 @@ * Copy the factor R into the array R. * SRNAMT = 'CLACPY' - CALL CLACPY( 'U', M, N, AF, M, R, M ) + CALL CLACPY( 'U', N, N, AF, M, R, M ) * * Reconstruct the orthogonal matrix Q. * @@ -240,7 +269,7 @@ * matrix S. * SRNAMT = 'CLACPY' - CALL CLACPY( 'U', M, N, R, M, AF, M ) + CALL CLACPY( 'U', N, N, R, M, AF, M ) * DO I = 1, N IF( DIAG( I ).EQ.-CONE ) THEN diff --git a/lapack-netlib/TESTING/LIN/cunhr_col02.f b/lapack-netlib/TESTING/LIN/cunhr_col02.f new file mode 100644 index 000000000..001f291da --- /dev/null +++ b/lapack-netlib/TESTING/LIN/cunhr_col02.f @@ -0,0 +1,381 @@ +*> \brief \b CUNHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. +* REAL RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CUNHR_COL02 tests CUNGTSQR_ROW and CUNHR_COL inside CGETSQRHRT +*> (which calls CLATSQR, CUNGTSQR_ROW and CUNHR_COL) using CGEMQRT. +*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. 
+*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is REAL array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in CGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using CGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using CGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex_lin +* +* ===================================================================== + SUBROUTINE CUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + REAL RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + COMPLEX , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) + REAL , ALLOCATABLE :: RWORK(:) +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) + COMPLEX CONE, CZERO + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), + $ CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + REAL ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + COMPLEX WORKQUERY( 1 ) +* .. +* .. External Functions .. + REAL SLAMCH, CLANGE, CLANSY + EXTERNAL SLAMCH, CLANGE, CLANSY +* .. +* .. External Subroutines .. + EXTERNAL CLACPY, CLARNV, CLASET, CGETSQRHRT, + $ CSCAL, CGEMM, CGEMQRT, CHERK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, REAL, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. 
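+*
+*     (TESTZEROS is a local switch: when set to .TRUE. above, part of
+*      the test matrix A is overwritten with freshly generated entries
+*      after the initial fill, which exercises inputs with a different
+*      nonzero pattern. It is left disabled by default.)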
+* + EPS = SLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL CLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL CLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL CLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in CLATSQR +* + NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* CGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* +* + CALL CGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In CGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'CGETSQRHRT' + CALL CGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. +* +* +* Generate the m-by-m matrix Q +* + CALL CLASET( 'Full', M, M, CZERO, CONE, Q, M ) +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL CLASET( 'Full', M, N, CZERO, CZERO, R, M ) +* + CALL CLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL CGEMM( 'C', 'N', M, N, M, -CONE, Q, M, A, M, CONE, R, M ) +* + ANORM = CLANGE( '1', M, N, A, M, RWORK ) + RESID = CLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL CLASET( 'Full', M, M, CZERO, CONE, R, M ) + CALL CHERK( 'U', 'C', M, M, -CONE, Q, M, CONE, R, M ) + RESID = CLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL CLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = CLANGE( '1', M, N, C, M, RWORK ) + CALL CLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL CGEMM( 'N', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = CLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL CLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'C', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL CGEMM( 'C', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = CLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL CLARNV( 2, 
ISEED, N, D( 1, J ) ) + END DO + DNORM = CLANGE( '1', N, M, D, N, RWORK ) + CALL CLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL CGEMM( 'N', 'N', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = CLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL CLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'R', 'C', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL CGEMM( 'N', 'C', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = CLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of CUNHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/LIN/dchkorhr_col.f b/lapack-netlib/TESTING/LIN/dchkorhr_col.f index 3b3e421eb..0e2d44d8d 100644 --- a/lapack-netlib/TESTING/LIN/dchkorhr_col.f +++ b/lapack-netlib/TESTING/LIN/dchkorhr_col.f @@ -24,9 +24,12 @@ *> *> \verbatim *> -*> DCHKORHR_COL tests DORHR_COL using DLATSQR and DGEMQRT. Therefore, DLATSQR -*> (used in DGEQR) and DGEMQRT (used in DGEMQR) have to be tested -*> before this test. +*> DCHKORHR_COL tests: +*> 1) DORGTSQR and DORHR_COL using DLATSQR, DGEMQRT, +*> 2) DORGTSQR_ROW and DORHR_COL inside DGETSQRHRT +*> (which calls DLATSQR, DORGTSQR_ROW and DORHR_COL) using DGEMQRT. +*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR) +*> have to be tested before this test. *> *> \endverbatim * @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup double_lin * * ===================================================================== - SUBROUTINE DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,10 +135,11 @@ DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, DERRORHR_COL, DORHR_COL01 + EXTERNAL ALAHD, ALASUM, DERRORHR_COL, DORHR_COL01, + $ DORHR_COL02 * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX, MIN * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -201,8 +202,8 @@ * * Test DORHR_COL * - CALL DORHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL DORHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. 
M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test DORHR_COL +* + CALL DORHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. * CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'DORGTSQR and DORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'DORGTSQR_ROW and DORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of DCHKORHR_COL diff --git a/lapack-netlib/TESTING/LIN/dorhr_col01.f b/lapack-netlib/TESTING/LIN/dorhr_col01.f index 3e48de37f..979255ca9 100644 --- a/lapack-netlib/TESTING/LIN/dorhr_col01.f +++ b/lapack-netlib/TESTING/LIN/dorhr_col01.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> DORHR_COL01 tests DORHR_COL using DLATSQR, DGEMQRT and DORGTSQR. -*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part DGEMQR), DORGTSQR +*> DORHR_COL01 tests DORGTSQR and DORHR_COL using DLATSQR, DGEMQRT. +*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is DOUBLE PRECISION array, dimension (6) *> Results of each of the six tests below. -*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. 
+*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using DGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using DGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* -*> \ingroup single_lin +*> \ingroup double_lin * * ===================================================================== SUBROUTINE DORHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 diff --git a/lapack-netlib/TESTING/LIN/dorhr_col02.f b/lapack-netlib/TESTING/LIN/dorhr_col02.f new file mode 100644 index 000000000..d4c438edb --- /dev/null +++ b/lapack-netlib/TESTING/LIN/dorhr_col02.f @@ -0,0 +1,377 @@ +*> \brief \b DORHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. +* DOUBLE PRECISION RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DORHR_COL02 tests DORGTSQR_ROW and DORHR_COL inside DGETSQRHRT +*> (which calls DLATSQR, DORGTSQR_ROW and DORHR_COL) using DGEMQRT. +*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is DOUBLE PRECISION array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. 
+*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using DGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using DGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup double_lin +* +* ===================================================================== + SUBROUTINE DORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + DOUBLE PRECISION RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + DOUBLE PRECISION, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + DOUBLE PRECISION ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + DOUBLE PRECISION WORKQUERY( 1 ) +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, DLANGE, DLANSY + EXTERNAL DLAMCH, DLANGE, DLANSY +* .. +* .. External Subroutines .. + EXTERNAL DLACPY, DLARNV, DLASET, DGETSQRHRT, + $ DSCAL, DGEMM, DGEMQRT, DSYRK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = DLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL DLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL DLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL DLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in DLATSQR +* + NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* DGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* +* + CALL DGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In DGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. 
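+* The same WORK array is then reused for every DGEMQRT call below
+* (generating Q and applying it from the left and the right), hence
+* the maximum over both bounds and the workspace-query result.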
+* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'DGETSQRHRT' + CALL DGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. +* +* +* Generate the m-by-m matrix Q +* + CALL DLASET( 'Full', M, M, ZERO, ONE, Q, M ) +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL DLASET( 'Full', M, N, ZERO, ZERO, R, M ) +* + CALL DLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL DGEMM( 'T', 'N', M, N, M, -ONE, Q, M, A, M, ONE, R, M ) +* + ANORM = DLANGE( '1', M, N, A, M, RWORK ) + RESID = DLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL DLASET( 'Full', M, M, ZERO, ONE, R, M ) + CALL DSYRK( 'U', 'T', M, M, -ONE, Q, M, ONE, R, M ) + RESID = DLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL DLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = DLANGE( '1', M, N, C, M, RWORK ) + CALL DLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL DGEMM( 'N', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = DLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL DLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'L', 'T', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL DGEMM( 'T', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = DLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL DLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = DLANGE( '1', N, M, D, N, RWORK ) + CALL DLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL DGEMM( 'N', 'N', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = DLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL DLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'R', 'T', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL DGEMM( 'N', 'T', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = DLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* 
Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of DORHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/LIN/schkorhr_col.f b/lapack-netlib/TESTING/LIN/schkorhr_col.f index cf6d2d323..f61b74902 100644 --- a/lapack-netlib/TESTING/LIN/schkorhr_col.f +++ b/lapack-netlib/TESTING/LIN/schkorhr_col.f @@ -24,8 +24,11 @@ *> *> \verbatim *> -*> SCHKORHR_COL tests SORHR_COL using SLATSQR, SGEMQRT and SORGTSQR. -*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part SGEMQR), SORGTSQR +*> SCHKORHR_COL tests: +*> 1) SORGTSQR and SORHR_COL using SLATSQR, SGEMQRT, +*> 2) SORGTSQR_ROW and SORHR_COL inside DGETSQRHRT +*> (which calls SLATSQR, SORGTSQR_ROW and SORHR_COL) using SGEMQRT. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* -*> \ingroup sigle_lin +*> \ingroup single_lin * * ===================================================================== - SUBROUTINE SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* June 2019 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,7 +135,8 @@ REAL RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, SERRORHR_COL, SORHR_COL01 + EXTERNAL ALAHD, ALASUM, SERRORHR_COL, SORHR_COL01, + $ SORHR_COL02 * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -201,8 +202,8 @@ * * Test SORHR_COL * - CALL SORHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL SORHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test SORHR_COL +* + CALL SORHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. 
* CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'SORGTSQR and SORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'SORGTSQR_ROW and SORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of SCHKORHR_COL diff --git a/lapack-netlib/TESTING/LIN/sorhr_col01.f b/lapack-netlib/TESTING/LIN/sorhr_col01.f index 02429041b..dcc2c1cae 100644 --- a/lapack-netlib/TESTING/LIN/sorhr_col01.f +++ b/lapack-netlib/TESTING/LIN/sorhr_col01.f @@ -8,12 +8,12 @@ * Definition: * =========== * -* SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT) +* SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT ) * * .. Scalar Arguments .. * INTEGER M, N, MB1, NB1, NB2 * .. Return values .. -* REAL RESULT(6) +* REAL RESULT(6) * * *> \par Purpose: @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> SORHR_COL01 tests SORHR_COL using SLATSQR, SGEMQRT and SORGTSQR. -*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part SGEMQR), SORGTSQR +*> SORHR_COL01 tests SORGTSQR and SORHR_COL using SLATSQR, SGEMQRT. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is REAL array, dimension (6) *> Results of each of the six tests below. -*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in SGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using SGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using SGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup single_lin * * ===================================================================== SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. 
INTEGER M, N, MB1, NB1, NB2 @@ -102,7 +131,7 @@ * * .. * .. Local allocatable arrays - REAL, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + REAL , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), $ RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:), $ C(:,:), CF(:,:), D(:,:), DF(:,:) * @@ -128,7 +157,7 @@ $ SORGTSQR, SSCAL, SGEMM, SGEMQRT, SSYRK * .. * .. Intrinsic Functions .. - INTRINSIC CEILING, MAX, MIN, REAL + INTRINSIC CEILING, REAL, MAX, MIN * .. * .. Scalars in Common .. CHARACTER(LEN=32) SRNAMT @@ -230,7 +259,7 @@ * * Compute the factor R_hr corresponding to the Householder * reconstructed Q_hr and place it in the upper triangle of AF to -* match the Q storage format in DGEQRT. R_hr = R_tsqr * S, +* match the Q storage format in SGEQRT. R_hr = R_tsqr * S, * this means changing the sign of I-th row of the matrix R_tsqr * according to sign of of I-th diagonal element DIAG(I) of the * matrix S. diff --git a/lapack-netlib/TESTING/LIN/sorhr_col02.f b/lapack-netlib/TESTING/LIN/sorhr_col02.f new file mode 100644 index 000000000..1cbe40577 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/sorhr_col02.f @@ -0,0 +1,376 @@ +*> \brief \b SORHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. +* REAL RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SORHR_COL02 tests SORGTSQR_ROW and SORHR_COL inside SGETSQRHRT +*> (which calls SLATSQR, SORGTSQR_ROW and SORHR_COL) using SGEMQRT. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is REAL array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in SGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. 
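+*>
+*>          As in DORHR_COL02, the factorization and reconstruction
+*>          are performed by a single call to SGETSQRHRT.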
+*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using SGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using SGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup single_lin +* +* ===================================================================== + SUBROUTINE SORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + REAL RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + REAL , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + REAL ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + REAL WORKQUERY( 1 ) +* .. +* .. External Functions .. + REAL SLAMCH, SLANGE, SLANSY + EXTERNAL SLAMCH, SLANGE, SLANSY +* .. +* .. External Subroutines .. + EXTERNAL SLACPY, SLARNV, SLASET, SGETSQRHRT, + $ SSCAL, SGEMM, SGEMQRT, SSYRK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, REAL, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = SLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL SLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL SLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL SLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in SLATSQR +* + NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* SGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* + CALL SGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In SGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. 
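+*
+* (T1 and DIAG are allocated only to mirror SORHR_COL01; SGETSQRHRT
+* takes just T2 and performs the reconstruction internally.)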
+* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'SGETSQRHRT' + CALL SGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. +* +* +* Generate the m-by-m matrix Q +* + CALL SLASET( 'Full', M, M, ZERO, ONE, Q, M ) +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL SLASET( 'Full', M, N, ZERO, ZERO, R, M ) +* + CALL SLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL SGEMM( 'T', 'N', M, N, M, -ONE, Q, M, A, M, ONE, R, M ) +* + ANORM = SLANGE( '1', M, N, A, M, RWORK ) + RESID = SLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL SLASET( 'Full', M, M, ZERO, ONE, R, M ) + CALL SSYRK( 'U', 'T', M, M, -ONE, Q, M, ONE, R, M ) + RESID = SLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL SLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = SLANGE( '1', M, N, C, M, RWORK ) + CALL SLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL SGEMM( 'N', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = SLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL SLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'L', 'T', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL SGEMM( 'T', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = SLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL SLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = SLANGE( '1', N, M, D, N, RWORK ) + CALL SLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL SGEMM( 'N', 'N', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = SLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL SLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'R', 'T', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL SGEMM( 'N', 'T', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = SLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End 
of SORHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/LIN/zchkunhr_col.f b/lapack-netlib/TESTING/LIN/zchkunhr_col.f index ef8f8bcc4..395ea178a 100644 --- a/lapack-netlib/TESTING/LIN/zchkunhr_col.f +++ b/lapack-netlib/TESTING/LIN/zchkunhr_col.f @@ -24,9 +24,12 @@ *> *> \verbatim *> -*> ZCHKUNHR_COL tests ZUNHR_COL using ZLATSQR and ZGEMQRT. Therefore, ZLATSQR -*> (used in ZGEQR) and ZGEMQRT (used in ZGEMQR) have to be tested -*> before this test. +*> ZCHKUNHR_COL tests: +*> 1) ZUNGTSQR and ZUNHR_COL using ZLATSQR, ZGEMQRT, +*> 2) ZUNGTSQR_ROW and ZUNHR_COL inside ZGETSQRHRT +*> (which calls ZLATSQR, ZUNGTSQR_ROW and ZUNHR_COL) using ZGEMQRT. +*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR) +*> have to be tested before this test. *> *> \endverbatim * @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex16_lin * * ===================================================================== - SUBROUTINE ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,10 +135,11 @@ DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, ZERRUNHR_COL, ZUNHR_COL01 + EXTERNAL ALAHD, ALASUM, ZERRUNHR_COL, ZUNHR_COL01, + $ ZUNHR_COL02 * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX, MIN * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -201,8 +202,8 @@ * * Test ZUNHR_COL * - CALL ZUNHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL ZUNHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test ZUNHR_COL +* + CALL ZUNHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. 
* CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'ZUNGTSQR and ZUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'ZUNGTSQR_ROW and ZUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of ZCHKUNHR_COL diff --git a/lapack-netlib/TESTING/LIN/zunhr_col01.f b/lapack-netlib/TESTING/LIN/zunhr_col01.f index 9fb3bf352..b7590a8ea 100644 --- a/lapack-netlib/TESTING/LIN/zunhr_col01.f +++ b/lapack-netlib/TESTING/LIN/zunhr_col01.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> ZUNHR_COL01 tests ZUNHR_COL using ZLATSQR, ZGEMQRT and ZUNGTSQR. -*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part ZGEMQR), ZUNGTSQR +*> ZUNHR_COL01 tests ZUNGTSQR and ZUNHR_COL using ZLATSQR, ZGEMQRT. +*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is DOUBLE PRECISION array, dimension (6) *> Results of each of the six tests below. -*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using ZGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using ZGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex16_lin * * ===================================================================== SUBROUTINE ZUNHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 @@ -102,7 +131,7 @@ * * .. * .. 
Local allocatable arrays - COMPLEX*16, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + COMPLEX*16 , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), $ C(:,:), CF(:,:), D(:,:), DF(:,:) DOUBLE PRECISION, ALLOCATABLE :: RWORK(:) @@ -218,7 +247,7 @@ * Copy the factor R into the array R. * SRNAMT = 'ZLACPY' - CALL ZLACPY( 'U', M, N, AF, M, R, M ) + CALL ZLACPY( 'U', N, N, AF, M, R, M ) * * Reconstruct the orthogonal matrix Q. * @@ -240,7 +269,7 @@ * matrix S. * SRNAMT = 'ZLACPY' - CALL ZLACPY( 'U', M, N, R, M, AF, M ) + CALL ZLACPY( 'U', N, N, R, M, AF, M ) * DO I = 1, N IF( DIAG( I ).EQ.-CONE ) THEN diff --git a/lapack-netlib/TESTING/LIN/zunhr_col02.f b/lapack-netlib/TESTING/LIN/zunhr_col02.f new file mode 100644 index 000000000..c6e7f80cd --- /dev/null +++ b/lapack-netlib/TESTING/LIN/zunhr_col02.f @@ -0,0 +1,381 @@ +*> \brief \b ZUNHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. +* DOUBLE PRECISION RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZUNHR_COL02 tests ZUNGTSQR_ROW and ZUNHR_COL inside ZGETSQRHRT +*> (which calls ZLATSQR, ZUNGTSQR_ROW and ZUNHR_COL) using ZGEMQRT. +*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is DOUBLE PRECISION array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using ZGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using ZGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. 
of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16_lin +* +* ===================================================================== + SUBROUTINE ZUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + DOUBLE PRECISION RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + COMPLEX*16 , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) + DOUBLE PRECISION, ALLOCATABLE :: RWORK(:) +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) + COMPLEX*16 CONE, CZERO + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), + $ CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + DOUBLE PRECISION ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + COMPLEX*16 WORKQUERY( 1 ) +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, ZLANGE, ZLANSY + EXTERNAL DLAMCH, ZLANGE, ZLANSY +* .. +* .. External Subroutines .. + EXTERNAL ZLACPY, ZLARNV, ZLASET, ZGETSQRHRT, + $ ZSCAL, ZGEMM, ZGEMQRT, ZHERK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = DLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL ZLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL ZLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL ZLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in ZLATSQR +* + NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* ZGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* +* + CALL ZGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In ZGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'ZGETSQRHRT' + CALL ZGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
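+*
+* (On exit from ZGETSQRHRT, AF holds the Householder vectors with R
+* in its upper triangle, in ZGEQRT-compatible format, ready for the
+* ZGEMQRT applications below.)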
+* +* +* Generate the m-by-m matrix Q +* + CALL ZLASET( 'Full', M, M, CZERO, CONE, Q, M ) +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL ZLASET( 'Full', M, N, CZERO, CZERO, R, M ) +* + CALL ZLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL ZGEMM( 'C', 'N', M, N, M, -CONE, Q, M, A, M, CONE, R, M ) +* + ANORM = ZLANGE( '1', M, N, A, M, RWORK ) + RESID = ZLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL ZLASET( 'Full', M, M, CZERO, CONE, R, M ) + CALL ZHERK( 'U', 'C', M, M, -CONE, Q, M, CONE, R, M ) + RESID = ZLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL ZLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = ZLANGE( '1', M, N, C, M, RWORK ) + CALL ZLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL ZGEMM( 'N', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = ZLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL ZLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'C', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL ZGEMM( 'C', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = ZLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL ZLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = ZLANGE( '1', N, M, D, N, RWORK ) + CALL ZLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL ZGEMM( 'N', 'N', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = ZLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL ZLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'R', 'C', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL ZGEMM( 'N', 'C', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = ZLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of ZUNHR_COL02 +* + END From db50b24a4a1c7d3a3e21953c81d1774cf3a24e21 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 May 2021 19:55:15 +0200 Subject: [PATCH 211/681] Add entries for the new Householder Reconstruction 
functions from 3.9.1 --- cmake/lapacke.cmake | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index 54a583887..340ea6d6c 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -114,6 +114,8 @@ set(CSRC lapacke_cgetrs_work.c lapacke_cgetsls.c lapacke_cgetsls_work.c + lapacke_cgetsqrhrt.c + lapacke_cgetsqrhrt_work.c lapacke_cggbak.c lapacke_cggbak_work.c lapacke_cggbal.c @@ -590,6 +592,8 @@ set(CSRC lapacke_cungrq_work.c lapacke_cungtr.c lapacke_cungtr_work.c + lapacke_cungtsqr_row.c + lapacke_cungtsqr_row_work.c lapacke_cunmbr.c lapacke_cunmbr_work.c lapacke_cunmhr.c @@ -735,6 +739,8 @@ set(DSRC lapacke_dgetrs_work.c lapacke_dgetsls.c lapacke_dgetsls_work.c + lapacke_dgetsqrhrt.c + lapacke_dgetsqrhrt_work.c lapacke_dggbak.c lapacke_dggbak_work.c lapacke_dggbal.c @@ -862,6 +868,8 @@ set(DSRC lapacke_dorgrq_work.c lapacke_dorgtr.c lapacke_dorgtr_work.c + lapacke_dorgtsqr_row.c + lapacke_dorgtsqr_row_work.c lapacke_dormbr.c lapacke_dormbr_work.c lapacke_dormhr.c @@ -1309,6 +1317,8 @@ set(SSRC lapacke_sgetrs_work.c lapacke_sgetsls.c lapacke_sgetsls_work.c + lapacke_sgetsqrhrt.c + lapacke_sgetsqrhrt_work.c lapacke_sggbak.c lapacke_sggbak_work.c lapacke_sggbal.c @@ -1435,6 +1445,8 @@ set(SSRC lapacke_sorgrq_work.c lapacke_sorgtr.c lapacke_sorgtr_work.c + lapacke_sorgtsqr_row.c + lapacke_sorgtsqr_row_work.c lapacke_sormbr.c lapacke_sormbr_work.c lapacke_sormhr.c @@ -1877,6 +1889,8 @@ set(ZSRC lapacke_zgetrs_work.c lapacke_zgetsls.c lapacke_zgetsls_work.c + lapacke_zgetsqrhrt.c + lapacke_zgetsqrhrt_work.c lapacke_zggbak.c lapacke_zggbak_work.c lapacke_zggbal.c @@ -2351,6 +2365,8 @@ set(ZSRC lapacke_zungrq_work.c lapacke_zungtr.c lapacke_zungtr_work.c + lapacke_zungtsqr_row.c + lapacke_zungtsqr_row_work.c lapacke_zunmbr.c lapacke_zunmbr_work.c lapacke_zunmhr.c From fb7308b9b53dfeaf158af62f4d1f9d74440a9d26 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 May 2021 19:56:11 +0200 Subject: [PATCH 212/681] Add entries for the new Householder Reconstruction functions from 3.9.1 --- lapack-netlib/LAPACKE/include/lapack.h | 72 ++++++++++++++++++++++ lapack-netlib/LAPACKE/include/lapacke.h | 82 +++++++++++++++++++++++++ 2 files changed, 154 insertions(+) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 341efabda..ada1944b2 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -2941,6 +2941,42 @@ void LAPACK_zgetsls( lapack_complex_double* work, lapack_int const* lwork, lapack_int* info ); +#define LAPACK_cgetsqrhrt LAPACK_GLOBAL(cgetsqrhrt,CGETSQRHRT) +void LAPACK_cgetsqrhrt( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* T, lapack_int const* ldt, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgetsqrhrt LAPACK_GLOBAL(dgetsqrhrt,DGETSQRHRT) +void LAPACK_dgetsqrhrt( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2, + double* A, lapack_int const* lda, + double* T, lapack_int const* ldt, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgetsqrhrt LAPACK_GLOBAL(sgetsqrhrt,SGETSQRHRT) +void LAPACK_sgetsqrhrt( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2, + float* A, lapack_int const* lda, + 
float* T, lapack_int const* ldt, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgetsqrhrt LAPACK_GLOBAL(zgetsqrhrt,ZGETSQRHRT) +void LAPACK_zgetsqrhrt( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* T, lapack_int const* ldt, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + #define LAPACK_cggbak LAPACK_GLOBAL(cggbak,CGGBAK) void LAPACK_cggbak( char const* job, char const* side, @@ -7251,6 +7287,24 @@ void LAPACK_sorgtr( float* work, lapack_int const* lwork, lapack_int* info ); +#define LAPACK_dorgtsqr_row LAPACK_GLOBAL(dorgtsqr_row,DORGTSQR_ROW) +void LAPACK_dorgtsqr_row( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb, lapack_int const* nb, + double* A, lapack_int const* lda, + double const* T, lapack_int const* ldt, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sorgtsqr_row LAPACK_GLOBAL(sorgtsqr_row,SORGTSQR_ROW) +void LAPACK_sorgtsqr_row( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb, lapack_int const* nb, + float* A, lapack_int const* lda, + float const* T, lapack_int const* ldt, + float* work, lapack_int const* lwork, + lapack_int* info ); + #define LAPACK_dormbr LAPACK_GLOBAL(dormbr,DORMBR) void LAPACK_dormbr( char const* vect, char const* side, char const* trans, @@ -13540,6 +13594,24 @@ void LAPACK_zungtr( lapack_complex_double* work, lapack_int const* lwork, lapack_int* info ); +#define LAPACK_cungtsqr_row LAPACK_GLOBAL(cungtsqr_row,CUNGTSQR_ROW) +void LAPACK_cungtsqr_row( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb, lapack_int const* nb, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float const* T, lapack_int const* ldt, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zungtsqr_row LAPACK_GLOBAL(zungtsqr_row,ZUNGTSQR_ROW) +void LAPACK_zungtsqr_row( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb, lapack_int const* nb, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double const* T, lapack_int const* ldt, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + #define LAPACK_cunmbr LAPACK_GLOBAL(cunmbr,CUNMBR) void LAPACK_cunmbr( char const* vect, char const* side, char const* trans, diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index b280dde0a..5c129db91 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -2598,6 +2598,15 @@ lapack_int LAPACKE_sorgtr( int matrix_layout, char uplo, lapack_int n, float* a, lapack_int LAPACKE_dorgtr( int matrix_layout, char uplo, lapack_int n, double* a, lapack_int lda, const double* tau ); +lapack_int LAPACKE_sorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + float* a, lapack_int lda, + const float* t, lapack_int ldt ); +lapack_int LAPACKE_dorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + double* a, lapack_int lda, + const double* t, lapack_int ldt ); + lapack_int LAPACKE_sormbr( int matrix_layout, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const float* a, lapack_int lda, const float* tau, @@ -4577,6 +4586,15 @@ lapack_int LAPACKE_zungtr( int matrix_layout, char uplo, lapack_int n, 
lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau ); +lapack_int LAPACKE_cungtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* t, lapack_int ldt ); +lapack_int LAPACKE_zungtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* t, lapack_int ldt ); + lapack_int LAPACKE_cunmbr( int matrix_layout, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const lapack_complex_float* a, lapack_int lda, @@ -7880,6 +7898,19 @@ lapack_int LAPACKE_dorgtr_work( int matrix_layout, char uplo, lapack_int n, double* a, lapack_int lda, const double* tau, double* work, lapack_int lwork ); +lapack_int LAPACKE_sorgtsqr_row_work( int matrix_layout, + lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + float* a, lapack_int lda, + const float* t, lapack_int ldt, + float* work, lapack_int lwork ); +lapack_int LAPACKE_dorgtsqr_row_work( int matrix_layout, + lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + double* a, lapack_int lda, + const double* t, lapack_int ldt, + double* work, lapack_int lwork ); + lapack_int LAPACKE_sormbr_work( int matrix_layout, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const float* a, lapack_int lda, @@ -10281,6 +10312,19 @@ lapack_int LAPACKE_zungtr_work( int matrix_layout, char uplo, lapack_int n, const lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork ); +lapack_int LAPACKE_cungtsqr_row_work( int matrix_layout, + lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* t, lapack_int ldt, + lapack_complex_float* work, lapack_int lwork ); +lapack_int LAPACKE_zungtsqr_row_work( int matrix_layout, + lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* t, lapack_int ldt, + lapack_complex_double* work, lapack_int lwork ); + lapack_int LAPACKE_cunmbr_work( int matrix_layout, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const lapack_complex_float* a, @@ -12026,6 +12070,44 @@ lapack_int LAPACKE_zgetsls_work( int matrix_layout, char trans, lapack_int m, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work, lapack_int lwork ); +lapack_int LAPACKE_sgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + float* a, lapack_int lda, + float* t, lapack_int ldt ); +lapack_int LAPACKE_dgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + double* a, lapack_int lda, + double* t, lapack_int ldt ); +lapack_int LAPACKE_cgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_float* a, lapack_int lda, + lapack_complex_float* t, lapack_int ldt ); +lapack_int LAPACKE_zgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_double* a, lapack_int lda, + lapack_complex_double* t, lapack_int ldt ); + +lapack_int LAPACKE_sgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + float* a, lapack_int lda, + float* t, lapack_int ldt, + float* work, lapack_int lwork ); 
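+/* A minimal column-major usage sketch for the new high-level interface
+ * (sizes here are illustrative; the routine requires M >= N and MB1 > N):
+ *
+ *   lapack_int m = 6, n = 2, mb1 = 4, nb1 = 2, nb2 = 2;
+ *   double a[12] = { 1, 2, 3, 4, 5, 6,  6, 5, 4, 3, 2, 1 };
+ *   double t[4];
+ *   lapack_int info = LAPACKE_dgetsqrhrt( LAPACK_COL_MAJOR, m, n,
+ *                                         mb1, nb1, nb2, a, m, t, nb2 );
+ *
+ * On success, a holds the Householder vectors with R in its upper
+ * triangle and t holds the NB2-by-N block reflector, as after DGEQRT.
+ */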
+lapack_int LAPACKE_dgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + double* a, lapack_int lda, + double* t, lapack_int ldt, + double* work, lapack_int lwork ); +lapack_int LAPACKE_cgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_float* a, lapack_int lda, + lapack_complex_float* t, lapack_int ldt, + lapack_complex_float* work, lapack_int lwork ); +lapack_int LAPACKE_zgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_double* a, lapack_int lda, + lapack_complex_double* t, lapack_int ldt, + lapack_complex_double* work, lapack_int lwork ); + lapack_int LAPACKE_ssyev_2stage( int matrix_layout, char jobz, char uplo, lapack_int n, float* a, lapack_int lda, float* w ); lapack_int LAPACKE_dsyev_2stage( int matrix_layout, char jobz, char uplo, lapack_int n, From d44434449798ecc9f02691668696c096a4877941 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 May 2021 19:57:47 +0200 Subject: [PATCH 213/681] Add LAPACKE interfaces for the new Householder Reconstruction functions from 3.9.1 --- lapack-netlib/LAPACKE/src/Makefile | 16 +++ .../LAPACKE/src/lapacke_cgetsqrhrt.c | 80 +++++++++++++ .../LAPACKE/src/lapacke_cgetsqrhrt_work.c | 108 +++++++++++++++++ .../LAPACKE/src/lapacke_cungtsqr_row.c | 83 +++++++++++++ .../LAPACKE/src/lapacke_cungtsqr_row_work.c | 109 ++++++++++++++++++ .../LAPACKE/src/lapacke_dgetsqrhrt.c | 79 +++++++++++++ .../LAPACKE/src/lapacke_dgetsqrhrt_work.c | 106 +++++++++++++++++ .../LAPACKE/src/lapacke_dorgtsqr_row.c | 82 +++++++++++++ .../LAPACKE/src/lapacke_dorgtsqr_row_work.c | 108 +++++++++++++++++ .../LAPACKE/src/lapacke_sgetsqrhrt.c | 79 +++++++++++++ .../LAPACKE/src/lapacke_sgetsqrhrt_work.c | 106 +++++++++++++++++ .../LAPACKE/src/lapacke_sorgtsqr_row.c | 82 +++++++++++++ .../LAPACKE/src/lapacke_sorgtsqr_row_work.c | 108 +++++++++++++++++ .../LAPACKE/src/lapacke_zgetsqrhrt.c | 80 +++++++++++++ .../LAPACKE/src/lapacke_zgetsqrhrt_work.c | 108 +++++++++++++++++ .../LAPACKE/src/lapacke_zungtsqr_row.c | 83 +++++++++++++ .../LAPACKE/src/lapacke_zungtsqr_row_work.c | 109 ++++++++++++++++++ 17 files changed, 1526 insertions(+) create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index a602dd7a0..7f827e1c9 100644 --- 
a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -162,6 +162,8 @@ lapacke_cgetrs.o \ lapacke_cgetrs_work.o \ lapacke_cgetsls.o \ lapacke_cgetsls_work.o \ +lapacke_cgetsqrhrt.o \ +lapacke_cgetsqrhrt_work.o \ lapacke_cggbak.o \ lapacke_cggbak_work.o \ lapacke_cggbal.o \ @@ -634,6 +636,8 @@ lapacke_cungrq.o \ lapacke_cungrq_work.o \ lapacke_cungtr.o \ lapacke_cungtr_work.o \ +lapacke_cungtsqr_row.o \ +lapacke_cungtsqr_row_work.o \ lapacke_cunmbr.o \ lapacke_cunmbr_work.o \ lapacke_cunmhr.o \ @@ -778,6 +782,8 @@ lapacke_dgetrs.o \ lapacke_dgetrs_work.o \ lapacke_dgetsls.o \ lapacke_dgetsls_work.o \ +lapacke_dgetsqrhrt.o \ +lapacke_dgetsqrhrt_work.o \ lapacke_dggbak.o \ lapacke_dggbak_work.o \ lapacke_dggbal.o \ @@ -900,6 +906,8 @@ lapacke_dorgrq.o \ lapacke_dorgrq_work.o \ lapacke_dorgtr.o \ lapacke_dorgtr_work.o \ +lapacke_dorgtsqr_row.o \ +lapacke_dorgtsqr_row_work.o \ lapacke_dormbr.o \ lapacke_dormbr_work.o \ lapacke_dormhr.o \ @@ -1348,6 +1356,8 @@ lapacke_sgetrs.o \ lapacke_sgetrs_work.o \ lapacke_sgetsls.o \ lapacke_sgetsls_work.o \ +lapacke_sgetsqrhrt.o \ +lapacke_sgetsqrhrt_work.o \ lapacke_sggbak.o \ lapacke_sggbak_work.o \ lapacke_sggbal.o \ @@ -1468,6 +1478,8 @@ lapacke_sorgrq.o \ lapacke_sorgrq_work.o \ lapacke_sorgtr.o \ lapacke_sorgtr_work.o \ +lapacke_sorgtsqr_row.o \ +lapacke_sorgtsqr_row_work.o \ lapacke_sormbr.o \ lapacke_sormbr_work.o \ lapacke_sormhr.o \ @@ -1908,6 +1920,8 @@ lapacke_zgetrs.o \ lapacke_zgetrs_work.o \ lapacke_zgetsls.o \ lapacke_zgetsls_work.o \ +lapacke_zgetsqrhrt.o \ +lapacke_zgetsqrhrt_work.o \ lapacke_zggbak.o \ lapacke_zggbak_work.o \ lapacke_zggbal.o \ @@ -2380,6 +2394,8 @@ lapacke_zungrq.o \ lapacke_zungrq_work.o \ lapacke_zungtr.o \ lapacke_zungtr_work.o \ +lapacke_zungtsqr_row.o \ +lapacke_zungtsqr_row_work.o \ lapacke_zunmbr.o \ lapacke_zunmbr_work.o \ lapacke_zunmhr.o \ diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c new file mode 100644 index 000000000..0e67e0b83 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c @@ -0,0 +1,80 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function cgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_float* a, lapack_int lda, + lapack_complex_float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_float* work = NULL; + lapack_complex_float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_cgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_C2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_cgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c new file mode 100644 index 000000000..598f193e6 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_float* a, lapack_int lda, + lapack_complex_float* t, lapack_int ldt, + lapack_complex_float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_complex_float* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + lapack_complex_float* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c new file mode 100644 index 000000000..bb551fcbc --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function cungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cungtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_float* work = NULL; + lapack_complex_float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cungtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_cge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_cungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_C2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_cungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cungtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c new file mode 100644 index 000000000..96b18ab13 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c @@ -0,0 +1,109 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cungtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* t, lapack_int ldt, + lapack_complex_float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + lapack_complex_float* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + lapack_complex_float* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (lapack_complex_float*)
+            LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (lapack_complex_float*)
+            LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices (t is the second input, so it is copied
+           from t/ldt, not from a/lda) */
+        LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        LAPACKE_cge_trans( matrix_layout, nb, n, t, ldt, t_t, ldt_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t,
+                             work, &lwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info );
+    }
+    return info;
+}
\ No newline at end of file
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c
new file mode 100644
index 000000000..cf0e3200c
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c
@@ -0,0 +1,79 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + double* a, lapack_int lda, + double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + double* work = NULL; + double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_dgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c new file mode 100644 index 000000000..f91887ffe --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c @@ -0,0 +1,106 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + double* a, lapack_int lda, + double* t, lapack_int ldt, + double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + double* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + double* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c new file mode 100644 index 000000000..1da3405a8 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c @@ -0,0 +1,82 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + double* a, lapack_int lda, + const double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + double* work = NULL; + double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_dge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_dorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c new file mode 100644 index 000000000..e16467f3a --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dorgtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + double* a, lapack_int lda, + const double* t, lapack_int ldt, + double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + double* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + double* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices (t is the second input, so it is copied
+           from t/ldt, not from a/lda) */
+        LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        LAPACKE_dge_trans( matrix_layout, nb, n, t, ldt, t_t, ldt_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t,
+                             work, &lwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info );
+    }
+    return info;
+}
\ No newline at end of file
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c
new file mode 100644
index 000000000..759afce48
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c
@@ -0,0 +1,79 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + float* a, lapack_int lda, + float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + float* work = NULL; + float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_sgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c new file mode 100644 index 000000000..40193008d --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c @@ -0,0 +1,106 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + float* a, lapack_int lda, + float* t, lapack_int ldt, + float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + float* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + float* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c new file mode 100644 index 000000000..350783a78 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c @@ -0,0 +1,82 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + float* a, lapack_int lda, + const float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + float* work = NULL; + float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_sge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_sorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c new file mode 100644 index 000000000..a66f70b52 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sorgtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + float* a, lapack_int lda, + const float* t, lapack_int ldt, + float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + float* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + float* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices (t is the second input, so it is copied
+           from t/ldt, not from a/lda) */
+        LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        LAPACKE_sge_trans( matrix_layout, nb, n, t, ldt, t_t, ldt_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t,
+                             work, &lwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info );
+    }
+    return info;
+}
\ No newline at end of file
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c
new file mode 100644
index 000000000..53557c92d
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c
@@ -0,0 +1,80 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_double* a, lapack_int lda, + lapack_complex_double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_double* work = NULL; + lapack_complex_double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_Z2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_zgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c new file mode 100644 index 000000000..a6825df56 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_double* a, lapack_int lda, + lapack_complex_double* t, lapack_int ldt, + lapack_complex_double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_complex_double* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + lapack_complex_double* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c new file mode 100644 index 000000000..71418fb84 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zungtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_double* work = NULL; + lapack_complex_double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zungtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_zge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_Z2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_zungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zungtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c new file mode 100644 index 000000000..909855864 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c @@ -0,0 +1,109 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
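The high-level wrapper above is mostly a shim around LAPACK's two-call workspace negotiation: a first call with lwork = -1 asks the routine to deposit its optimal workspace size in work[0] (read back with LAPACK_Z2INT, since complex routines return it in the real part), which is then allocated before the real call. Below is a minimal standalone sketch of that protocol; the solver stub is hypothetical and stands in for any LAPACK-style routine.

#include <stdio.h>
#include <stdlib.h>

/* Stub that follows the LAPACK workspace convention: when *lwork == -1 it
 * only deposits the optimal workspace size in work[0] and returns. */
static void solver(int n, double *work, int *lwork, int *info)
{
    *info = 0;
    if (*lwork == -1) {
        work[0] = (double)(3 * n);      /* pretend 3*n is optimal */
        return;
    }
    if (*lwork < 3 * n) {
        *info = -3;                     /* workspace too small */
        return;
    }
    /* ... the real computation would use work[0 .. *lwork-1] here ... */
}

int main(void)
{
    int n = 100, info = 0, lwork = -1;
    double work_query;
    double *work;

    /* first call: size query only */
    solver(n, &work_query, &lwork, &info);
    if (info != 0)
        return 1;

    /* like lwork = LAPACK_Z2INT( work_query ) in the wrapper above */
    lwork = (int)work_query;
    work = malloc(sizeof(double) * (size_t)lwork);
    if (work == NULL)
        return 1;                       /* LAPACK_WORK_MEMORY_ERROR path */

    /* second call: the actual computation */
    solver(n, work, &lwork, &info);
    free(work);
    printf("info = %d, lwork = %d\n", info, lwork);
    return info;
}

The same convention explains the early return in the _work wrappers when lwork == -1: a size query must not touch the matrix data, so no transposed copies are allocated for it.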
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zungtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* t, lapack_int ldt, + lapack_complex_double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + lapack_complex_double* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + lapack_complex_double* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info;
+    }
+    /* Allocate memory for temporary array(s) */
+    a_t = (lapack_complex_double*)
+        LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) );
+    if( a_t == NULL ) {
+        info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    t_t = (lapack_complex_double*)
+        LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,n) );
+    if( t_t == NULL ) {
+        info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    /* Transpose input matrices */
+    LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+    LAPACKE_zge_trans( matrix_layout, nb, n, t, ldt, t_t, ldt_t );
+    /* Call LAPACK function and adjust info */
+    LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t,
+                         work, &lwork, &info );
+    if( info < 0 ) {
+        info = info - 1;
+    }
+    /* Transpose output matrices */
+    LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+    /* Release memory and exit */
+    LAPACKE_free( t_t );
+exit_level_1:
+    LAPACKE_free( a_t );
+exit_level_0:
+    if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info );
+    }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info );
+    }
+    return info;
+}
\ No newline at end of file

From d00709e0164e3fc0b92d347c6dc3b8d4cf8a5aec Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 2 May 2021 20:47:58 +0200
Subject: [PATCH 214/681] Add files via upload

---
 lapack-netlib/SRC/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile
index 470b5326e..d1ee96667 100644
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@@ -289,7 +289,7 @@ CLASRC_O = \
    cgeqrt.o cgeqrt2.o cgeqrt3.o cgemqrt.o \
    ctpqrt.o ctpqrt2.o ctpmqrt.o ctprfb.o \
    cgelqt.o cgelqt3.o cgemlqt.o \
-   cgetsls.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \
+   cgetsls.o cgetsqrhrt.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \
    cgelq.o claswlq.o clamswlq.o cgemlq.o \
    ctplqt.o ctplqt2.o ctpmlqt.o \
    cunhr_col.o claunhr_col_getrfnp.o claunhr_col_getrfnp2.o \

From d8d7bd33cb93575c8a00be055fe58095fb1ead78 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 2 May 2021 23:46:55 +0200
Subject: [PATCH 215/681] Update Changelog for 0.3.15

---
 Changelog.txt | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/Changelog.txt b/Changelog.txt
index 5662bc5c6..6c5cf573e 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,54 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.15
+ 2-May-2021
+
+common:
+ - imported improvements and bugfixes from Reference-LAPACK 3.9.1
+ - imported LAPACKE interface fixes from Reference-LAPACK PRs 534 + 537
+ - fixed a problem in the cpu detection of 0.3.14 that prevented cross-compilation
+ - fixed a sequence problem in the generation of softlinks to the library in GMAKE
+
+RISC V:
+ - fixed compilation on RISCV (missing entry in getarch)
+ - fixed a potential division by zero in CROTG and ZROTG
+
+POWER:
+ - fixed LAPACK testsuite failures seen with the NVIDIA HPC compiler
+ - improved CGEMM, DGEMM and ZGEMM performance on POWER10
+ - added an optimized ZGEMV kernel for POWER10
+ - fixed a potential division by zero in CROTG and ZROTG
+
+x86_64:
+ - added support for Intel Control-flow Enforcement Technology (CET)
+ - reverted the DOMATCOPY_RT code to the generic C version
+ - fixed a bug in the AVX512 SGEMM kernel introduced in 0.3.14
+ - fixed misapplication of -msse flag to non-SSE cpus in
DYNAMIC_ARCH
+ - added support for compilation of the benchmarks on older OSX versions
+ - fixed propagation of the NO_AVX512 option in CMAKE builds
+ - fixed compilation of the AVX512 SGEMM kernel with clang-cl on Windows
+ - fixed compilation of the CTESTs with INTERFACE64=1 (random faults on OSX)
+ - corrected the Haswell DROT kernel to require AVX2/FMA3 rather than AVX512
+
+ARM:
+ - fixed a potential division by zero in CROTG and ZROTG
+ - fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs
+
+ARM64:
+ - fixed spurious reads outside the array in the SGEMM tcopy macro
+ - fixed a potential division by zero in CROTG and ZROTG
+ - fixed a segmentation fault in DYNAMIC_ARCH builds (reappeared in 0.3.14)
+
+MIPS:
+ - fixed a potential division by zero in CROTG and ZROTG
+ - fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs
+
+MIPS64:
+ - fixed a potential division by zero in CROTG and ZROTG
+
+SPARC:
+ - fixed a potential division by zero in CROTG and ZROTG
+
 ====================================================================
 Version 0.3.14
 17-Mar-2021

From 4c033730bb02860cf9a9cd57af4f2cabb43111fe Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 2 May 2021 23:49:49 +0200
Subject: [PATCH 216/681] Update version to 0.3.15

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d0313c842..15f6ba2c2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 14.dev)
+set(OpenBLAS_PATCH_VERSION 15)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

 # Adhere to GNU filesystem layout conventions

From 904f9a267dddb30e9f187e57231ed160ab2f2704 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 2 May 2021 23:50:22 +0200
Subject: [PATCH 217/681] Update version to 0.3.15

---
 Makefile.rule | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.rule b/Makefile.rule
index 38d0161a3..0c138331e 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #

 # This library's version
-VERSION = 0.3.14.dev
+VERSION = 0.3.15

 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so.
Meanwhile, the soname in shared library From 380f955078eee43d729453f011388ce51e5dc675 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 3 May 2021 00:00:29 +0200 Subject: [PATCH 218/681] Update version to 0.3.15.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 15f6ba2c2..0863163c1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 15) +set(OpenBLAS_PATCH_VERSION 15.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 9721b57ecfd194f1a4aaa08d715735cd9e8ad8b6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 3 May 2021 00:01:08 +0200 Subject: [PATCH 219/681] Update version to 0.3.15.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 0c138331e..64c8ff778 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.15 +VERSION = 0.3.15.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 8b599836db17451bf28e3ad74b0e26474af0c1b4 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Tue, 4 May 2021 13:55:02 -0500 Subject: [PATCH 220/681] Add error message token for SBGEMM in gemm.c --- interface/gemm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/interface/gemm.c b/interface/gemm.c index 6fde69049..cd5d00589 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -49,6 +49,8 @@ #define ERROR_NAME "QGEMM " #elif defined(DOUBLE) #define ERROR_NAME "DGEMM " +#elif defined(BFLOAT16) +#define ERROR_NAME "SBGEMM " #else #define ERROR_NAME "SGEMM " #endif From 206e03fdaca9f412e8a48963816f3a46e13d45b2 Mon Sep 17 00:00:00 2001 From: drhpc Date: Tue, 4 May 2021 21:02:07 +0200 Subject: [PATCH 221/681] Delete lapack_wrappers.c.orig This looks like a leftover from patching and confuses further patching;-) --- relapack/src/lapack_wrappers.c.orig | 607 ---------------------------- 1 file changed, 607 deletions(-) delete mode 100644 relapack/src/lapack_wrappers.c.orig diff --git a/relapack/src/lapack_wrappers.c.orig b/relapack/src/lapack_wrappers.c.orig deleted file mode 100644 index d89d2fe2f..000000000 --- a/relapack/src/lapack_wrappers.c.orig +++ /dev/null @@ -1,607 +0,0 @@ -#include "relapack.h" - -//////////// -// XLAUUM // -//////////// - -#if INCLUDE_SLAUUM -void LAPACK(slauum)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_slauum(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_DLAUUM -void LAPACK(dlauum)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_dlauum(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_CLAUUM -void LAPACK(clauum)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_clauum(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_ZLAUUM -void LAPACK(zlauum)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_zlauum(uplo, n, A, ldA, info); -} -#endif - - -//////////// -// XSYGST // -//////////// - -#if INCLUDE_SSYGST -void LAPACK(ssygst)( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const 
int *ldB, - int *info -) { - RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info); -} -#endif - -#if INCLUDE_DSYGST -void LAPACK(dsygst)( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - int *info -) { - RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info); -} -#endif - -#if INCLUDE_CSYGST -void LAPACK(csygst)( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info -) { - RELAPACK_csygst(itype, uplo, n, A, ldA, B, ldB, info); -} -#endif - -#if INCLUDE_ZSYGST -void LAPACK(zsygst)( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - int *info -) { - RELAPACK_zsygst(itype, uplo, n, A, ldA, B, ldB, info); -} -#endif - - -//////////// -// XTRTRI // -//////////// - -#if INCLUDE_STRTRI -void LAPACK(strtri)( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_strtri(uplo, diag, n, A, ldA, info); -} -#endif - -#if INCLUDE_DTRTRI -void LAPACK(dtrtri)( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_dtrtri(uplo, diag, n, A, ldA, info); -} -#endif - -#if INCLUDE_CTRTRI -void LAPACK(ctrtri)( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_ctrtri(uplo, diag, n, A, ldA, info); -} -#endif - -#if INCLUDE_ZTRTRI -void LAPACK(ztrtri)( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_ztrtri(uplo, diag, n, A, ldA, info); -} -#endif - - -//////////// -// XPOTRF // -//////////// - -#if INCLUDE_SPOTRF -void LAPACK(spotrf)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_spotrf(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_DPOTRF -void LAPACK(dpotrf)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_dpotrf(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_CPOTRF -void LAPACK(cpotrf)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_cpotrf(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_ZPOTRF -void LAPACK(zpotrf)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_zpotrf(uplo, n, A, ldA, info); -} -#endif - - -//////////// -// XPBTRF // -//////////// - -#if INCLUDE_SPBTRF -void LAPACK(spbtrf)( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info -) { - RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info); -} -#endif - -#if INCLUDE_DPBTRF -void LAPACK(dpbtrf)( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info -) { - RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info); -} -#endif - -#if INCLUDE_CPBTRF -void LAPACK(cpbtrf)( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info -) { - RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info); -} -#endif - -#if INCLUDE_ZPBTRF -void LAPACK(zpbtrf)( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info -) { - RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info); -} -#endif - - -//////////// -// XSYTRF // -//////////// - -#if INCLUDE_SSYTRF -void LAPACK(ssytrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - 
-#if INCLUDE_DSYTRF -void LAPACK(dsytrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_CSYTRF -void LAPACK(csytrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_ZSYTRF -void LAPACK(zsytrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_CHETRF -void LAPACK(chetrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_ZHETRF -void LAPACK(zhetrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_SSYTRF_ROOK -void LAPACK(ssytrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_DSYTRF_ROOK -void LAPACK(dsytrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_CSYTRF_ROOK -void LAPACK(csytrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_ZSYTRF_ROOK -void LAPACK(zsytrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_CHETRF_ROOK -void LAPACK(chetrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_ZHETRF_ROOK -void LAPACK(zhetrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - - -//////////// -// XGETRF // -//////////// - -#if INCLUDE_SGETRF -void LAPACK(sgetrf)( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info -) { - RELAPACK_sgetrf(m, n, A, ldA, ipiv, info); -} -#endif - -#if INCLUDE_DGETRF -void LAPACK(dgetrf)( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info -) { - RELAPACK_dgetrf(m, n, A, ldA, ipiv, info); -} -#endif - -#if INCLUDE_CGETRF -void LAPACK(cgetrf)( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info -) { - RELAPACK_cgetrf(m, n, A, ldA, ipiv, info); -} -#endif - -#if INCLUDE_ZGETRF -void LAPACK(zgetrf)( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info -) { - RELAPACK_zgetrf(m, n, A, ldA, ipiv, info); -} -#endif - - -//////////// -// XGBTRF // -//////////// - -#if 
INCLUDE_SGBTRF -void LAPACK(sgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info -) { - RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); -} -#endif - -#if INCLUDE_DGBTRF -void LAPACK(dgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info -) { - RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); -} -#endif - -#if INCLUDE_CGBTRF -void LAPACK(cgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info -) { - RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); -} -#endif - -#if INCLUDE_ZGBTRF -void LAPACK(zgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info -) { - RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); -} -#endif - - -//////////// -// XTRSYL // -//////////// - -#if INCLUDE_STRSYL -void LAPACK(strsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info -) { - RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); -} -#endif - -#if INCLUDE_DTRSYL -void LAPACK(dtrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info -) { - RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); -} -#endif - -#if INCLUDE_CTRSYL -void LAPACK(ctrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info -) { - RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); -} -#endif - -#if INCLUDE_ZTRSYL -void LAPACK(ztrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info -) { - RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); -} -#endif - - -//////////// -// XTGSYL // -//////////// - -#if INCLUDE_STGSYL -void LAPACK(stgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, - float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info -) { - RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); -} -#endif - -#if INCLUDE_DTGSYL -void LAPACK(dtgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, - double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int *info -) { - RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); -} -#endif - -#if INCLUDE_CTGSYL -void LAPACK(ctgsyl)( - const char *trans, const int *ijob, const int *m, 
const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, - float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info -) { - RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); -} -#endif - -#if INCLUDE_ZTGSYL -void LAPACK(ztgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, - double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int *info -) { - RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); -} -#endif - - -//////////// -// XGEMMT // -//////////// - -#if INCLUDE_SGEMMT -void LAPACK(sgemmt)( - const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC -) { - RELAPACK_sgemmt(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_DGEMMT -void LAPACK(dgemmt)( - const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC -) { - RELAPACK_dgemmt(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_CGEMMT -void LAPACK(cgemmt)( - const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC -) { - RELAPACK_cgemmt(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_ZGEMMT -void LAPACK(zgemmt)( - const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC -) { - RELAPACK_zgemmt(uplo, n, A, ldA, info); -} -#endif From c0ca63ea4672c3b013136ef54a69e5ab967be270 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 5 May 2021 14:55:36 +0200 Subject: [PATCH 222/681] Fix missing conditionals for non-SKX kernels --- kernel/x86_64/sgemv_n_4.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index bc006bf3c..06de28d97 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -417,7 +417,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n2 & 2 ) { +#ifdef HAVE_SGEMV_N_SKYLAKE_KERNEL sgemv_kernel_n_64(NB, 2, alpha, a_ptr, lda, x_ptr, ybuffer); +#else + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); +#endif a_ptr += lda*2; x_ptr += 2; } @@ -425,7 +429,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n2 & 1 ) { +#ifdef HAVE_SGEMV_N_SKYLAKE_KERNEL sgemv_kernel_n_64(NB, 1, alpha, a_ptr, lda, x_ptr, ybuffer); +#else + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); +#endif /* a_ptr += lda; x_ptr += 1a; */ From bda8820da73193d4115016c571f7898d53047f7a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 May 2021 20:20:08 +0200 Subject: [PATCH 223/681] Use percent instead of 
at sign as placeholder for substitutions

---
 f_check | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/f_check b/f_check
index 2c0d7fcb9..4825fb09a 100644
--- a/f_check
+++ b/f_check
@@ -314,11 +314,11 @@ if ($link ne "") {

         $link =~ s/\-Y\sP\,/\-Y/g;

-        $link =~ s/\-R\s*/\-rpath\@/g;
+        $link =~ s/\-R\s*/\-rpath\%/g;

-        $link =~ s/\-rpath\s+/\-rpath\@/g;
+        $link =~ s/\-rpath\s+/\-rpath\%/g;

-        $link =~ s/\-rpath-link\s+/\-rpath-link\@/g;
+        $link =~ s/\-rpath-link\s+/\-rpath-link\%/g;

         @flags = split(/[\s\,\n]/, $link);
         # remove leading and trailing quotes from each flag.
@@ -344,13 +344,13 @@ if ($link ne "") {
         }

-        if ($flags =~ /^\-rpath\@/) {
-            $flags =~ s/\@/\,/g;
+        if ($flags =~ /^\-rpath\%/) {
+            $flags =~ s/\%/\,/g;
             $linker_L .= "-Wl,". $flags . " " ;
         }

-        if ($flags =~ /^\-rpath-link\@/) {
-            $flags =~ s/\@/\,/g;
+        if ($flags =~ /^\-rpath-link\%/) {
+            $flags =~ s/\%/\,/g;
             $linker_L .= "-Wl,". $flags . " " ;
         }
         if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) {

From ec7d6c02bcdbd8d0f2986136a21f79f70417efe0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Mon, 10 May 2021 08:02:01 +0200
Subject: [PATCH 224/681] Add an Android crossbuild on OSX to Azure CI (#3224)

* Add an Android crossbuild on OSX
---
 azure-pipelines.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 56a3fd4ae..4b6b2b0e6 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -137,3 +137,13 @@ jobs:
       source /opt/intel/oneapi/setvars.sh
       make CC=/usr/local/opt/llvm/bin/clang FC=ifort

+- job: OSX_NDK_ARMV7
+  pool:
+    vmImage: 'macOS-10.15'
+  steps:
+  - script: |
+      brew update
+      brew install --cask android-ndk
+      export ANDROID_NDK_HOME=/usr/local/share/android-ndk
+      make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4
+

From bd60fb6ffc9d14834ed03bed0f7e6e44126c6c05 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 13 May 2021 23:05:00 +0200
Subject: [PATCH 225/681] filter out -mavx flag on zgemm kernels as it can
 cause problems with older gcc

---
 kernel/Makefile.L3 | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index d8d739965..be10ee018 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -819,7 +819,7 @@ ifeq ($(OS), AIX)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
	rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
 else
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
+	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
 endif

 $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
@@ -829,7 +829,7 @@ ifeq ($(OS), AIX)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
	rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
 else
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
+	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
 endif

 $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
@@ -839,7 +839,7 @@ ifeq ($(OS), AIX)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
	rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
 else
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $<
-o $@ endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) @@ -849,7 +849,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s else - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ endif $(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) @@ -1045,7 +1045,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1055,7 +1055,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1065,7 +1065,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1075,7 +1075,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1085,7 +1085,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1095,7 +1095,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1105,7 +1105,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ rm 
ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1115,7 +1115,7 @@ ifeq ($(OS), AIX) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s else - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ endif else @@ -1187,28 +1187,28 @@ $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) - $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ endif From 8b90e5f2029f21eecbcf961164516cd69da16e98 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 15:06:44 +0200 Subject: [PATCH 226/681] Drop redundant inclusion of complex.h --- kernel/x86_64/cdot.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index f2bf19dcd..654cd351a 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -27,7 +27,6 @@ USE OF THIS 
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include "common.h"
-#include <complex.h>

 #if defined(BULLDOZER)

From 73f637e5848ae19b90f522222f03df875f21468f Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Fri, 14 May 2021 15:08:12 +0200
Subject: [PATCH 227/681] Support compilation with pre-C99 versions of MSVC

---
 utest/ctest.h | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/utest/ctest.h b/utest/ctest.h
index d316b1494..037f7f28d 100644
--- a/utest/ctest.h
+++ b/utest/ctest.h
@@ -28,7 +28,10 @@
 #define WEAK
 #endif

+#ifndef _MSC_VER
 #include <inttypes.h> /* intmax_t, uintmax_t, PRI* */
+#endif
+
 #include <stddef.h> /* size_t */

 typedef void (*SetupFunc)(void*);
@@ -72,6 +75,13 @@ struct ctest {
 #define __CTEST_NO_TIME
 #define CTEST_NO_COLORS

+#if _MSC_VER >= 1500
+#include <inttypes.h>
+#else
+#include <stdint.h>
+#define CTEST_NO_INTTYPES
+#endif
+
 #ifndef CTEST_ADD_TESTS_MANUALLY
 #pragma section(".ctest$a")
 #pragma section(".ctest$u")
@@ -480,11 +490,19 @@ void assert_data(const unsigned char* exp, size_t expsize,
                  const char* caller, int line) {
     size_t i;
     if (expsize != realsize) {
+#ifndef CTEST_NO_INTTYPES
         CTEST_ERR("%s:%d expected %" PRIuMAX " bytes, got %" PRIuMAX, caller, line, (uintmax_t) expsize, (uintmax_t) realsize);
+#else
+        CTEST_ERR("%s:%d expected %u bytes, got %u", caller, line, (unsigned) expsize, (unsigned) realsize);
+#endif
     }
     for (i=0; i<expsize; i++) {
     if (real < exp1 || real > exp2) {
+#ifndef CTEST_NO_INTTYPES
         CTEST_ERR("%s:%d expected %" PRIdMAX "-%" PRIdMAX ", got %" PRIdMAX, caller, line, exp1, exp2, real);
+#else
+        CTEST_ERR("%s:%d expected %d-%d, got %d", caller, line, (int) exp1, (int) exp2, (int) real);
+#endif
     }
 }

From eef1c42f03693da6d4f5be91865500fef6803dcf Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Fri, 14 May 2021 19:53:03 +0200
Subject: [PATCH 228/681] Convert ?chkaa to use dynamic allocation for the
 larger arrays

---
 lapack-netlib/TESTING/LIN/CMakeLists.txt |    8 +-
 lapack-netlib/TESTING/LIN/Makefile       |    8 +-
 lapack-netlib/TESTING/LIN/cchkaa.F       | 1237 +++++++++++++++++++++
 lapack-netlib/TESTING/LIN/dchkaa.F       | 1080 ++++++++++++++++++
 lapack-netlib/TESTING/LIN/schkaa.F       | 1074 ++++++++++++++++++
 lapack-netlib/TESTING/LIN/zchkaa.F       | 1271 ++++++++++++++++++++++
 6 files changed, 4670 insertions(+), 8 deletions(-)
 create mode 100644 lapack-netlib/TESTING/LIN/cchkaa.F
 create mode 100644 lapack-netlib/TESTING/LIN/dchkaa.F
 create mode 100644 lapack-netlib/TESTING/LIN/schkaa.F
 create mode 100644 lapack-netlib/TESTING/LIN/zchkaa.F

diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt
index 309ed7e77..fc55b8a96 100644
--- a/lapack-netlib/TESTING/LIN/CMakeLists.txt
+++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt
@@ -6,7 +6,7 @@ set(SCLNTST slaord.f)

 set(DZLNTST dlaord.f)

-set(SLINTST schkaa.f
+set(SLINTST schkaa.F
    schkeq.f schkgb.f schkge.f schkgt.f
    schklq.f schkpb.f schkpo.f schkps.f schkpp.f schkpt.f schkq3.f
    schkql.f schkqr.f schkrq.f
@@ -51,7 +51,7 @@ else()
     serrvx.f serrge.f serrsy.f serrpo.f)
 endif()

-set(CLINTST cchkaa.f
+set(CLINTST cchkaa.F
    cchkeq.f cchkgb.f cchkge.f cchkgt.f cchkhe.f cchkhe_rook.f
    cchkhe_rk.f cchkhe_aa.f cchkhe_aa_2stage.f
@@ -107,7 +107,7 @@ else()
     cerrvx.f cerrge.f cerrhe.f cerrsy.f cerrpo.f)
 endif()

-set(DLINTST dchkaa.f
+set(DLINTST dchkaa.F
    dchkeq.f dchkgb.f dchkge.f dchkgt.f
    dchklq.f dchkpb.f dchkpo.f dchkps.f dchkpp.f dchkpt.f dchkq3.f
    dchkql.f dchkqr.f dchkrq.f
@@ -153,7 +153,7 @@ else()
     derrvx.f derrge.f derrsy.f derrpo.f)
 endif()

-set(ZLINTST zchkaa.f
+set(ZLINTST zchkaa.F
    zchkeq.f zchkgb.f zchkge.f zchkgt.f
    zchkhe.f zchkhe_rook.f
zchkhe_rk.f zchkhe_aa.f zchkhe_aa_2stage.f diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile index 674265816..54b26455e 100644 --- a/lapack-netlib/TESTING/LIN/Makefile +++ b/lapack-netlib/TESTING/LIN/Makefile @@ -317,13 +317,13 @@ cleanobj: cleanexe: rm -f xlintst* -schkaa.o: schkaa.f +schkaa.o: schkaa.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -dchkaa.o: dchkaa.f +dchkaa.o: dchkaa.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -cchkaa.o: cchkaa.f +cchkaa.o: cchkaa.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -zchkaa.o: zchkaa.f +zchkaa.o: zchkaa.F $(FC) $(FFLAGS_DRV) -c -o $@ $< .NOTPARALLEL: diff --git a/lapack-netlib/TESTING/LIN/cchkaa.F b/lapack-netlib/TESTING/LIN/cchkaa.F new file mode 100644 index 000000000..ec1534ed4 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/cchkaa.F @@ -0,0 +1,1237 @@ +*> \brief \b CCHKAA +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM CCHKAA +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CCHKAA is the main test program for the COMPLEX linear equation +*> routines. +*> +*> The program must be driven by a short data file. The first 15 records +*> (not including the first comment line) specify problem dimensions +*> and program options using list-directed input. The remaining lines +*> specify the LAPACK test paths and the number of matrix types to use +*> in testing. An annotated example of a data file can be obtained by +*> deleting the first 3 characters from the following 42 lines: +*> Data file for testing COMPLEX LAPACK linear equation routines +*> 7 Number of values of M +*> 0 1 2 3 5 10 16 Values of M (row dimension) +*> 7 Number of values of N +*> 0 1 2 3 5 10 16 Values of N (column dimension) +*> 1 Number of values of NRHS +*> 2 Values of NRHS (number of right hand sides) +*> 5 Number of values of NB +*> 1 3 3 3 20 Values of NB (the blocksize) +*> 1 0 5 9 1 Values of NX (crossover point) +*> 3 Number of values of RANK +*> 30 50 90 Values of rank (as a % of N) +*> 30.0 Threshold value of test ratio +*> T Put T to test the LAPACK routines +*> T Put T to test the driver routines +*> T Put T to test the error exits +*> CGE 11 List types on next line if 0 < NTYPES < 11 +*> CGB 8 List types on next line if 0 < NTYPES < 8 +*> CGT 12 List types on next line if 0 < NTYPES < 12 +*> CPO 9 List types on next line if 0 < NTYPES < 9 +*> CPO 9 List types on next line if 0 < NTYPES < 9 +*> CPP 9 List types on next line if 0 < NTYPES < 9 +*> CPB 8 List types on next line if 0 < NTYPES < 8 +*> CPT 12 List types on next line if 0 < NTYPES < 12 +*> CHE 10 List types on next line if 0 < NTYPES < 10 +*> CHR 10 List types on next line if 0 < NTYPES < 10 +*> CHK 10 List types on next line if 0 < NTYPES < 10 +*> CHA 10 List types on next line if 0 < NTYPES < 10 +*> CH2 10 List types on next line if 0 < NTYPES < 10 +*> CSA 11 List types on next line if 0 < NTYPES < 10 +*> CS2 11 List types on next line if 0 < NTYPES < 10 +*> CHP 10 List types on next line if 0 < NTYPES < 10 +*> CSY 11 List types on next line if 0 < NTYPES < 11 +*> CSK 11 List types on next line if 0 < NTYPES < 11 +*> CSR 11 List types on next line if 0 < NTYPES < 11 +*> CSP 11 List types on next line if 0 < NTYPES < 11 +*> CTR 18 List types on next line if 0 < NTYPES < 18 +*> CTP 18 List types on next line if 0 < NTYPES < 18 +*> CTB 17 List types on next line if 0 < NTYPES < 17 +*> CQR 8 List types on next line if 0 < NTYPES < 8 +*> CRQ 8 List types on next 
line if 0 < NTYPES < 8 +*> CLQ 8 List types on next line if 0 < NTYPES < 8 +*> CQL 8 List types on next line if 0 < NTYPES < 8 +*> CQP 6 List types on next line if 0 < NTYPES < 6 +*> CTZ 3 List types on next line if 0 < NTYPES < 3 +*> CLS 6 List types on next line if 0 < NTYPES < 6 +*> CEQ +*> CQT +*> CQX +*> CTS +*> CHH +*> \endverbatim +* +* Parameters: +* ========== +* +*> \verbatim +*> NMAX INTEGER +*> The maximum allowable value for M and N. +*> +*> MAXIN INTEGER +*> The number of different values that can be used for each of +*> M, N, NRHS, NB, NX and RANK +*> +*> MAXRHS INTEGER +*> The maximum number of right hand sides +*> +*> MATMAX INTEGER +*> The maximum number of matrix types to use for testing +*> +*> NIN INTEGER +*> The unit number for input +*> +*> NOUT INTEGER +*> The unit number for output +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex_lin +* +* ===================================================================== + PROGRAM CCHKAA +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NMAX + PARAMETER ( NMAX = 132 ) + INTEGER MAXIN + PARAMETER ( MAXIN = 12 ) + INTEGER MAXRHS + PARAMETER ( MAXRHS = 16 ) + INTEGER MATMAX + PARAMETER ( MATMAX = 30 ) + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER KDMAX + PARAMETER ( KDMAX = NMAX+( NMAX+1 ) / 4 ) +* .. +* .. Local Scalars .. + LOGICAL FATAL, TSTCHK, TSTDRV, TSTERR + CHARACTER C1 + CHARACTER*2 C2 + CHARACTER*3 PATH + CHARACTER*10 INTSTR + CHARACTER*72 ALINE + INTEGER I, IC, J, K, LA, LAFAC, LDA, NB, NM, NMATS, NN, + $ NNB, NNB2, NNS, NRHS, NTYPES, NRANK, + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + REAL EPS, S1, S2, THREQ, THRESH +* .. +* .. Local Arrays .. + LOGICAL DOTYPE( MATMAX ) + INTEGER IWORK( 25*NMAX ), MVAL( MAXIN ), + $ NBVAL( MAXIN ), NBVAL2( MAXIN ), + $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), + $ RANKVAL( MAXIN ), PIV( NMAX ) + REAL S( 2*NMAX ) + COMPLEX E( NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: RWORK + COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK +* .. +* .. External Functions .. + LOGICAL LSAME, LSAMEN + REAL SECOND, SLAMCH + EXTERNAL LSAME, LSAMEN, SECOND, SLAMCH +* .. +* .. External Subroutines .. + EXTERNAL ALAREQ, CCHKEQ, CCHKGB, CCHKGE, CCHKGT, CCHKHE, + $ CCHKHE_ROOK, CCHKHE_RK, CCHKHE_AA, CCHKHP, + $ CCHKLQ, CCHKUNHR_COL, CCHKPB, CCHKPO, CCHKPS, + $ CCHKPP, CCHKPT, CCHKQ3, CCHKQL, CCHKQR, CCHKRQ, + $ CCHKSP, CCHKSY, CCHKSY_ROOK, CCHKSY_RK, + $ CCHKSY_AA, CCHKTB, CCHKTP, CCHKTR, CCHKTZ, + $ CDRVGB, CDRVGE, CDRVGT, CDRVHE, CDRVHE_ROOK, + $ CDRVHE_RK, CDRVHE_AA, CDRVHP, CDRVLS, CDRVPB, + $ CDRVPO, CDRVPP, CDRVPT, CDRVSP, CDRVSY, + $ CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA, ILAVER, + $ CCHKQRT, CCHKQRTP +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, NUNIT +* .. +* .. Arrays in Common .. + INTEGER IPARMS( 100 ) +* .. +* .. Common blocks .. + COMMON / CLAENV / IPARMS + COMMON / INFOC / INFOT, NUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA THREQ / 2.0 / , INTSTR / '0123456789' / +* .. +* .. Allocate memory dynamically .. 
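+*     The large arrays A, B, WORK and RWORK are now ALLOCATABLE and
+*     acquired at run time below instead of being declared with fixed
+*     worst-case dimensions; each STAT= result is checked so that a
+*     failed allocation stops the run with a clear message.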
+* + ALLOCATE ( A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. +* .. Executable Statements .. +* + S1 = SECOND( ) + LDA = NMAX + FATAL = .FALSE. +* +* Read a dummy line. +* + READ( NIN, FMT = * ) +* +* Report values of parameters. +* + CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) + WRITE( NOUT, FMT = 9994 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH +* +* Read the values of M +* + READ( NIN, FMT = * )NM + IF( NM.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NM ', NM, 1 + NM = 0 + FATAL = .TRUE. + ELSE IF( NM.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NM ', NM, MAXIN + NM = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( MVAL( I ), I = 1, NM ) + DO 10 I = 1, NM + IF( MVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' M ', MVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( MVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9995 )' M ', MVAL( I ), NMAX + FATAL = .TRUE. + END IF + 10 CONTINUE + IF( NM.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'M ', ( MVAL( I ), I = 1, NM ) +* +* Read the values of N +* + READ( NIN, FMT = * )NN + IF( NN.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NN ', NN, 1 + NN = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NN ', NN, MAXIN + NN = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) + DO 20 I = 1, NN + IF( NVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' N ', NVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9995 )' N ', NVAL( I ), NMAX + FATAL = .TRUE. + END IF + 20 CONTINUE + IF( NN.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'N ', ( NVAL( I ), I = 1, NN ) +* +* Read the values of NRHS +* + READ( NIN, FMT = * )NNS + IF( NNS.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NNS', NNS, 1 + NNS = 0 + FATAL = .TRUE. + ELSE IF( NNS.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NNS', NNS, MAXIN + NNS = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NSVAL( I ), I = 1, NNS ) + DO 30 I = 1, NNS + IF( NSVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )'NRHS', NSVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NSVAL( I ).GT.MAXRHS ) THEN + WRITE( NOUT, FMT = 9995 )'NRHS', NSVAL( I ), MAXRHS + FATAL = .TRUE. + END IF + 30 CONTINUE + IF( NNS.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NRHS', ( NSVAL( I ), I = 1, NNS ) +* +* Read the values of NB +* + READ( NIN, FMT = * )NNB + IF( NNB.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )'NNB ', NNB, 1 + NNB = 0 + FATAL = .TRUE. + ELSE IF( NNB.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )'NNB ', NNB, MAXIN + NNB = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) + DO 40 I = 1, NNB + IF( NBVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' NB ', NBVAL( I ), 0 + FATAL = .TRUE. 
+ END IF + 40 CONTINUE + IF( NNB.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NB ', ( NBVAL( I ), I = 1, NNB ) +* +* Set NBVAL2 to be the set of unique values of NB +* + NNB2 = 0 + DO 60 I = 1, NNB + NB = NBVAL( I ) + DO 50 J = 1, NNB2 + IF( NB.EQ.NBVAL2( J ) ) + $ GO TO 60 + 50 CONTINUE + NNB2 = NNB2 + 1 + NBVAL2( NNB2 ) = NB + 60 CONTINUE +* +* Read the values of NX +* + READ( NIN, FMT = * )( NXVAL( I ), I = 1, NNB ) + DO 70 I = 1, NNB + IF( NXVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' NX ', NXVAL( I ), 0 + FATAL = .TRUE. + END IF + 70 CONTINUE + IF( NNB.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NX ', ( NXVAL( I ), I = 1, NNB ) +* +* Read the values of RANKVAL +* + READ( NIN, FMT = * )NRANK + IF( NN.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NRANK ', NRANK, 1 + NRANK = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NRANK ', NRANK, MAXIN + NRANK = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( RANKVAL( I ), I = 1, NRANK ) + DO I = 1, NRANK + IF( RANKVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' RANK ', RANKVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( RANKVAL( I ).GT.100 ) THEN + WRITE( NOUT, FMT = 9995 )' RANK ', RANKVAL( I ), 100 + FATAL = .TRUE. + END IF + END DO + IF( NRANK.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'RANK % OF N', + $ ( RANKVAL( I ), I = 1, NRANK ) +* +* Read the threshold value for the test ratios. +* + READ( NIN, FMT = * )THRESH + WRITE( NOUT, FMT = 9992 )THRESH +* +* Read the flag that indicates whether to test the LAPACK routines. +* + READ( NIN, FMT = * )TSTCHK +* +* Read the flag that indicates whether to test the driver routines. +* + READ( NIN, FMT = * )TSTDRV +* +* Read the flag that indicates whether to test the error exits. +* + READ( NIN, FMT = * )TSTERR +* + IF( FATAL ) THEN + WRITE( NOUT, FMT = 9999 ) + STOP + END IF +* +* Calculate and print the machine dependent constants. +* + EPS = SLAMCH( 'Underflow threshold' ) + WRITE( NOUT, FMT = 9991 )'underflow', EPS + EPS = SLAMCH( 'Overflow threshold' ) + WRITE( NOUT, FMT = 9991 )'overflow ', EPS + EPS = SLAMCH( 'Epsilon' ) + WRITE( NOUT, FMT = 9991 )'precision', EPS + WRITE( NOUT, FMT = * ) + NRHS = NSVAL( 1 ) +* + 80 CONTINUE +* +* Read a test path and the number of matrix types to use. +* + READ( NIN, FMT = '(A72)', END = 140 )ALINE + PATH = ALINE( 1: 3 ) + NMATS = MATMAX + I = 3 + 90 CONTINUE + I = I + 1 + IF( I.GT.72 ) + $ GO TO 130 + IF( ALINE( I: I ).EQ.' ' ) + $ GO TO 90 + NMATS = 0 + 100 CONTINUE + C1 = ALINE( I: I ) + DO 110 K = 1, 10 + IF( C1.EQ.INTSTR( K: K ) ) THEN + IC = K - 1 + GO TO 120 + END IF + 110 CONTINUE + GO TO 130 + 120 CONTINUE + NMATS = NMATS*10 + IC + I = I + 1 + IF( I.GT.72 ) + $ GO TO 130 + GO TO 100 + 130 CONTINUE + C1 = PATH( 1: 1 ) + C2 = PATH( 2: 3 ) +* +* Check first character for correct precision. +* + IF( .NOT.LSAME( C1, 'Complex precision' ) ) THEN + WRITE( NOUT, FMT = 9990 )PATH +* + ELSE IF( NMATS.LE.0 ) THEN +* +* Check for a positive number of tests requested. 
+* + WRITE( NOUT, FMT = 9989 )PATH +* + ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN +* +* GE: general matrices +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKGE( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVGE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN +* +* GB: general banded matrices +* + LA = ( 2*KDMAX+1 )*NMAX + LAFAC = ( 3*KDMAX+1 )*NMAX + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKGB( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, A( 1, 1 ), LA, + $ A( 1, 3 ), LAFAC, B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVGB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), LA, A( 1, 3 ), LAFAC, A( 1, 6 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'GT' ) ) THEN +* +* GT: general tridiagonal matrices +* + NTYPES = 12 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKGT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PO' ) ) THEN +* +* PO: positive definite matrices +* + NTYPES = 9 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKPO( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVPO( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PS' ) ) THEN +* +* PS: positive semi-definite matrices +* + NTYPES = 9 +* + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKPS( DOTYPE, NN, NVAL, NNB2, NBVAL2, NRANK, + $ RANKVAL, THRESH, TSTERR, LDA, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), PIV, WORK, RWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PP' ) ) THEN +* +* PP: positive definite packed matrices +* + NTYPES = 9 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKPP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVPP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 
1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PB' ) ) THEN +* +* PB: positive definite banded matrices +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKPB( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVPB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PT' ) ) THEN +* +* PT: positive definite tridiagonal matrices +* + NTYPES = 12 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ A( 1, 1 ), S, A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), S, A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HE' ) ) THEN +* +* HE: Hermitian indefinite matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKHE( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVHE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HR' ) ) THEN +* +* HR: Hermitian indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKHE_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVHE_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HK' ) ) THEN +* +* HK: Hermitian indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm, +* different matrix storage format than HR path version. 
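Every path arm above (GE, PO, HE, HR, and the rest) follows the same template: set NTYPES for the path, let ALAREQ fill DOTYPE with the requested matrix types, then run the check routine only when TSTCHK is set and the driver routine only when TSTDRV is set, printing the 9989/9988 "not tested" messages otherwise. Below is a minimal standalone sketch of that gating; the program name PDEMO is illustrative, the matching is exact-case for brevity (the real driver uses LSAMEN), and the WRITE statements merely stand in for the CCHKHE/CDRVHE calls.

      PROGRAM PDEMO
*     Skeleton of one dispatch arm: match the two-character path
*     code, then run checks and drivers only when requested.
      LOGICAL TSTCHK, TSTDRV
      CHARACTER*3 PATH
      CHARACTER*2 C2
      PATH = 'CHE'
      TSTCHK = .TRUE.
      TSTDRV = .FALSE.
      C2 = PATH( 2: 3 )
      IF( C2.EQ.'HE' ) THEN
         IF( TSTCHK ) THEN
            WRITE( *, * ) 'CALL CCHKHE( ... ) would run here'
         ELSE
            WRITE( *, 9989 ) PATH
         END IF
         IF( TSTDRV ) THEN
            WRITE( *, * ) 'CALL CDRVHE( ... ) would run here'
         ELSE
            WRITE( *, 9988 ) PATH
         END IF
      END IF
 9989 FORMAT( / 1X, A3, ' routines were not tested' )
 9988 FORMAT( / 1X, A3, ' driver routines were not tested' )
      END

The one-arm-per-path IF ladder keeps each factorization's workspace wiring in a single place, which is why the driver can reuse the same A and B column slices across every arm.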
+* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKHE_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVHE_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HA' ) ) THEN +* +* HA: Hermitian matrices, +* Aasen Algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKHE_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVHE_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN +* +* H2: Hermitian matrices, +* with partial (Aasen's) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, + $ NNS, NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVHE_AA_2STAGE( + $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN +* +* HP: Hermitian indefinite packed matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKHP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVHP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SY' ) ) THEN +* +* SY: symmetric indefinite matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKSY( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVSY( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SR' ) ) THEN +* +* SR: symmetric indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, 
NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKSY_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVSY_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SK' ) ) THEN +* +* SK: symmetric indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm, +* different matrix storage format than SR path version. +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKSY_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVSY_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN +* +* SA: symmetric indefinite matrices with Aasen's algorithm, +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKSY_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVSY_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN +* +* S2: symmetric indefinite matrices with Aasen's algorithm +* 2 stage +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVSY_AA_2STAGE( + $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN +* +* SP: symmetric indefinite packed matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKSP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL CDRVSP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN +* +* TR: triangular matrices +* + NTYPES = 18 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKTR( 
DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN +* +* TP: triangular packed matrices +* + NTYPES = 18 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKTP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN +* +* TB: triangular banded matrices +* + NTYPES = 17 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKTB( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QR' ) ) THEN +* +* QR: QR factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKQR( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'LQ' ) ) THEN +* +* LQ: LQ factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKLQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QL' ) ) THEN +* +* QL: QL factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKQL( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'RQ' ) ) THEN +* +* RQ: RQ factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKRQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'EQ' ) ) THEN +* +* EQ: Equilibration routines for general and positive definite +* matrices (THREQ should be between 2 and 10) +* + IF( TSTCHK ) THEN + CALL CCHKEQ( THREQ, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN +* +* TZ: Trapezoidal matrix +* + NTYPES = 3 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKTZ( DOTYPE, NM, MVAL, NN, NVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), S( 1 ), + $ B( 1, 1 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QP' ) ) THEN +* +* QP: QR factorization with pivoting +* + NTYPES = 6 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL CCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + 
$ THRESH, A( 1, 1 ), A( 1, 2 ), S( 1 ), + $ B( 1, 1 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN +* +* LS: Least squares drivers +* + NTYPES = 6 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTDRV ) THEN + CALL CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, + $ NBVAL, NXVAL, THRESH, TSTERR, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ S( 1 ), S( NMAX+1 ), NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QT' ) ) THEN +* +* QT: QRT routines for general matrices +* + IF( TSTCHK ) THEN + CALL CCHKQRT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QX' ) ) THEN +* +* QX: QRT routines for triangular-pentagonal matrices +* + IF( TSTCHK ) THEN + CALL CCHKQRTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN +* +* TQ: LQT routines for general matrices +* + IF( TSTCHK ) THEN + CALL CCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN +* +* XQ: LQT routines for triangular-pentagonal matrices +* + IF( TSTCHK ) THEN + CALL CCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN +* +* TS: QR routines for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL CCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN +* +* HH: Householder reconstruction for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 ) PATH + END IF +* + ELSE +* + WRITE( NOUT, FMT = 9990 )PATH + END IF +* +* Go back to get another input line. +* + GO TO 80 +* +* Branch to this line when the last record is read. 
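The control flow just above is classic fixed-form Fortran: GO TO 80 jumps back to re-read the next test path, and the END= specifier on the READ transfers control to label 140 once the input file is exhausted. A self-contained sketch of that read-dispatch loop follows; RDEMO is an illustrative name, it reads from standard input, and the WRITE stands in for the path dispatch.

      PROGRAM RDEMO
*     Read 72-column records until end-of-file (the END= branch),
*     dispatching on the first three characters of each one.
      CHARACTER*72 ALINE
      CHARACTER*3 PATH
   80 CONTINUE
      READ( *, FMT = '(A72)', END = 140 ) ALINE
      PATH = ALINE( 1: 3 )
      WRITE( *, * ) 'path: ', PATH
      GO TO 80
  140 CONTINUE
      WRITE( *, * ) 'End of tests'
      END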
+*
+  140 CONTINUE
+      CLOSE ( NIN )
+      S2 = SECOND( )
+      WRITE( NOUT, FMT = 9998 )
+      WRITE( NOUT, FMT = 9997 )S2 - S1
+*
+      DEALLOCATE (A, STAT = AllocateStatus)
+      DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (WORK, STAT = AllocateStatus)
+      DEALLOCATE (RWORK, STAT = AllocateStatus)
+*
+ 9999 FORMAT( / ' Execution not attempted due to input errors' )
+ 9998 FORMAT( / ' End of tests' )
+ 9997 FORMAT( ' Total time used = ', F12.2, ' seconds', / )
+ 9996 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be >=',
+     $      I6 )
+ 9995 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be <=',
+     $      I6 )
+ 9994 FORMAT( ' Tests of the COMPLEX LAPACK routines ',
+     $      / ' LAPACK VERSION ', I1, '.', I1, '.', I1,
+     $      / / ' The following parameter values will be used:' )
+ 9993 FORMAT( 4X, A4, ': ', 10I6, / 11X, 10I6 )
+ 9992 FORMAT( / ' Routines pass computational tests if test ratio is ',
+     $      'less than', F8.2, / )
+ 9991 FORMAT( ' Relative machine ', A, ' is taken to be', E16.6 )
+ 9990 FORMAT( / 1X, A3, ': Unrecognized path name' )
+ 9989 FORMAT( / 1X, A3, ' routines were not tested' )
+ 9988 FORMAT( / 1X, A3, ' driver routines were not tested' )
+*
+*     End of CCHKAA
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/dchkaa.F b/lapack-netlib/TESTING/LIN/dchkaa.F
new file mode 100644
index 000000000..ef9d7808c
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/dchkaa.F
@@ -0,0 +1,1080 @@
+*> \brief \b DCHKAA
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM DCHKAA
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DCHKAA is the main test program for the DOUBLE PRECISION LAPACK
+*> linear equation routines
+*>
+*> The program must be driven by a short data file. The first 15 records
+*> (not including the first comment line) specify problem dimensions
+*> and program options using list-directed input. The remaining lines
+*> specify the LAPACK test paths and the number of matrix types to use
+*> in testing. An annotated example of a data file can be obtained by
+*> deleting the first 3 characters from the following 40 lines:
+*> Data file for testing DOUBLE PRECISION LAPACK linear eqn.
routines +*> 7 Number of values of M +*> 0 1 2 3 5 10 16 Values of M (row dimension) +*> 7 Number of values of N +*> 0 1 2 3 5 10 16 Values of N (column dimension) +*> 1 Number of values of NRHS +*> 2 Values of NRHS (number of right hand sides) +*> 5 Number of values of NB +*> 1 3 3 3 20 Values of NB (the blocksize) +*> 1 0 5 9 1 Values of NX (crossover point) +*> 3 Number of values of RANK +*> 30 50 90 Values of rank (as a % of N) +*> 20.0 Threshold value of test ratio +*> T Put T to test the LAPACK routines +*> T Put T to test the driver routines +*> T Put T to test the error exits +*> DGE 11 List types on next line if 0 < NTYPES < 11 +*> DGB 8 List types on next line if 0 < NTYPES < 8 +*> DGT 12 List types on next line if 0 < NTYPES < 12 +*> DPO 9 List types on next line if 0 < NTYPES < 9 +*> DPS 9 List types on next line if 0 < NTYPES < 9 +*> DPP 9 List types on next line if 0 < NTYPES < 9 +*> DPB 8 List types on next line if 0 < NTYPES < 8 +*> DPT 12 List types on next line if 0 < NTYPES < 12 +*> DSY 10 List types on next line if 0 < NTYPES < 10 +*> DSR 10 List types on next line if 0 < NTYPES < 10 +*> DSK 10 List types on next line if 0 < NTYPES < 10 +*> DSA 10 List types on next line if 0 < NTYPES < 10 +*> DS2 10 List types on next line if 0 < NTYPES < 10 +*> DSP 10 List types on next line if 0 < NTYPES < 10 +*> DTR 18 List types on next line if 0 < NTYPES < 18 +*> DTP 18 List types on next line if 0 < NTYPES < 18 +*> DTB 17 List types on next line if 0 < NTYPES < 17 +*> DQR 8 List types on next line if 0 < NTYPES < 8 +*> DRQ 8 List types on next line if 0 < NTYPES < 8 +*> DLQ 8 List types on next line if 0 < NTYPES < 8 +*> DQL 8 List types on next line if 0 < NTYPES < 8 +*> DQP 6 List types on next line if 0 < NTYPES < 6 +*> DTZ 3 List types on next line if 0 < NTYPES < 3 +*> DLS 6 List types on next line if 0 < NTYPES < 6 +*> DEQ +*> DQT +*> DQX +*> DTQ +*> DXQ +*> DTS +*> DHH +*> \endverbatim +* +* Parameters: +* ========== +* +*> \verbatim +*> NMAX INTEGER +*> The maximum allowable value for M and N. +*> +*> MAXIN INTEGER +*> The number of different values that can be used for each of +*> M, N, NRHS, NB, NX and RANK +*> +*> MAXRHS INTEGER +*> The maximum number of right hand sides +*> +*> MATMAX INTEGER +*> The maximum number of matrix types to use for testing +*> +*> NIN INTEGER +*> The unit number for input +*> +*> NOUT INTEGER +*> The unit number for output +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup double_lin +* +* ===================================================================== + PROGRAM DCHKAA +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NMAX + PARAMETER ( NMAX = 132 ) + INTEGER MAXIN + PARAMETER ( MAXIN = 12 ) + INTEGER MAXRHS + PARAMETER ( MAXRHS = 16 ) + INTEGER MATMAX + PARAMETER ( MATMAX = 30 ) + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER KDMAX + PARAMETER ( KDMAX = NMAX+( NMAX+1 ) / 4 ) +* .. +* .. Local Scalars .. 
+ LOGICAL FATAL, TSTCHK, TSTDRV, TSTERR + CHARACTER C1 + CHARACTER*2 C2 + CHARACTER*3 PATH + CHARACTER*10 INTSTR + CHARACTER*72 ALINE + INTEGER I, IC, J, K, LA, LAFAC, LDA, NB, NM, NMATS, NN, + $ NNB, NNB2, NNS, NRHS, NTYPES, NRANK, + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + DOUBLE PRECISION EPS, S1, S2, THREQ, THRESH +* .. +* .. Local Arrays .. + LOGICAL DOTYPE( MATMAX ) + INTEGER IWORK( 25*NMAX ), MVAL( MAXIN ), + $ NBVAL( MAXIN ), NBVAL2( MAXIN ), + $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), + $ RANKVAL( MAXIN ), PIV( NMAX ) + DOUBLE PRECISION E( NMAX ), S( 2*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK +* .. +* .. External Functions .. + LOGICAL LSAME, LSAMEN + DOUBLE PRECISION DLAMCH, DSECND + EXTERNAL LSAME, LSAMEN, DLAMCH, DSECND +* .. +* .. External Subroutines .. + EXTERNAL ALAREQ, DCHKEQ, DCHKGB, DCHKGE, DCHKGT, DCHKLQ, + $ DCHKORHR_COL, DCHKPB, DCHKPO, DCHKPS, DCHKPP, + $ DCHKPT, DCHKQ3, DCHKQL, DCHKQR, DCHKRQ, DCHKSP, + $ DCHKSY, DCHKSY_ROOK, DCHKSY_RK, DCHKSY_AA, + $ DCHKTB, DCHKTP, DCHKTR, DCHKTZ, DDRVGB, DDRVGE, + $ DDRVGT, DDRVLS, DDRVPB, DDRVPO, DDRVPP, DDRVPT, + $ DDRVSP, DDRVSY, DDRVSY_ROOK, DDRVSY_RK, + $ DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT, DCHKQRTP, + $ DCHKLQT,DCHKTSQR +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, NUNIT +* .. +* .. Arrays in Common .. + INTEGER IPARMS( 100 ) +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, NUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT + COMMON / CLAENV / IPARMS +* .. +* .. Data statements .. + DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / +* .. +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* +* .. Executable Statements .. +* + S1 = DSECND( ) + LDA = NMAX + FATAL = .FALSE. +* +* Read a dummy line. +* + READ( NIN, FMT = * ) +* +* Report values of parameters. +* + CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) + WRITE( NOUT, FMT = 9994 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH +* +* Read the values of M +* + READ( NIN, FMT = * )NM + IF( NM.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NM ', NM, 1 + NM = 0 + FATAL = .TRUE. + ELSE IF( NM.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NM ', NM, MAXIN + NM = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( MVAL( I ), I = 1, NM ) + DO 10 I = 1, NM + IF( MVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' M ', MVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( MVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9995 )' M ', MVAL( I ), NMAX + FATAL = .TRUE. + END IF + 10 CONTINUE + IF( NM.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'M ', ( MVAL( I ), I = 1, NM ) +* +* Read the values of N +* + READ( NIN, FMT = * )NN + IF( NN.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NN ', NN, 1 + NN = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NN ', NN, MAXIN + NN = 0 + FATAL = .TRUE. 
+ END IF + READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) + DO 20 I = 1, NN + IF( NVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' N ', NVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9995 )' N ', NVAL( I ), NMAX + FATAL = .TRUE. + END IF + 20 CONTINUE + IF( NN.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'N ', ( NVAL( I ), I = 1, NN ) +* +* Read the values of NRHS +* + READ( NIN, FMT = * )NNS + IF( NNS.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NNS', NNS, 1 + NNS = 0 + FATAL = .TRUE. + ELSE IF( NNS.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NNS', NNS, MAXIN + NNS = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NSVAL( I ), I = 1, NNS ) + DO 30 I = 1, NNS + IF( NSVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )'NRHS', NSVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NSVAL( I ).GT.MAXRHS ) THEN + WRITE( NOUT, FMT = 9995 )'NRHS', NSVAL( I ), MAXRHS + FATAL = .TRUE. + END IF + 30 CONTINUE + IF( NNS.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NRHS', ( NSVAL( I ), I = 1, NNS ) +* +* Read the values of NB +* + READ( NIN, FMT = * )NNB + IF( NNB.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )'NNB ', NNB, 1 + NNB = 0 + FATAL = .TRUE. + ELSE IF( NNB.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )'NNB ', NNB, MAXIN + NNB = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) + DO 40 I = 1, NNB + IF( NBVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' NB ', NBVAL( I ), 0 + FATAL = .TRUE. + END IF + 40 CONTINUE + IF( NNB.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NB ', ( NBVAL( I ), I = 1, NNB ) +* +* Set NBVAL2 to be the set of unique values of NB +* + NNB2 = 0 + DO 60 I = 1, NNB + NB = NBVAL( I ) + DO 50 J = 1, NNB2 + IF( NB.EQ.NBVAL2( J ) ) + $ GO TO 60 + 50 CONTINUE + NNB2 = NNB2 + 1 + NBVAL2( NNB2 ) = NB + 60 CONTINUE +* +* Read the values of NX +* + READ( NIN, FMT = * )( NXVAL( I ), I = 1, NNB ) + DO 70 I = 1, NNB + IF( NXVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' NX ', NXVAL( I ), 0 + FATAL = .TRUE. + END IF + 70 CONTINUE + IF( NNB.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NX ', ( NXVAL( I ), I = 1, NNB ) +* +* Read the values of RANKVAL +* + READ( NIN, FMT = * )NRANK + IF( NN.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NRANK ', NRANK, 1 + NRANK = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NRANK ', NRANK, MAXIN + NRANK = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( RANKVAL( I ), I = 1, NRANK ) + DO I = 1, NRANK + IF( RANKVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' RANK ', RANKVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( RANKVAL( I ).GT.100 ) THEN + WRITE( NOUT, FMT = 9995 )' RANK ', RANKVAL( I ), 100 + FATAL = .TRUE. + END IF + END DO + IF( NRANK.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'RANK % OF N', + $ ( RANKVAL( I ), I = 1, NRANK ) +* +* Read the threshold value for the test ratios. +* + READ( NIN, FMT = * )THRESH + WRITE( NOUT, FMT = 9992 )THRESH +* +* Read the flag that indicates whether to test the LAPACK routines. +* + READ( NIN, FMT = * )TSTCHK +* +* Read the flag that indicates whether to test the driver routines. +* + READ( NIN, FMT = * )TSTDRV +* +* Read the flag that indicates whether to test the error exits. +* + READ( NIN, FMT = * )TSTERR +* + IF( FATAL ) THEN + WRITE( NOUT, FMT = 9999 ) + STOP + END IF +* +* Calculate and print the machine dependent constants. 
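One detail worth isolating before the machine constants are printed below: the NB de-duplication above builds NBVAL2 by scanning the values already kept and branching to the shared terminal statement of both DO loops whenever a duplicate is found. The same loop, lifted into a runnable sketch with the sample NB values from the annotated data file (UDEMO and the DATA values are illustrative only):

      PROGRAM UDEMO
*     Collect the unique values of NB, preserving first-seen order;
*     a duplicate jumps straight to the outer loop's terminal label.
      INTEGER NBVAL( 5 ), NBVAL2( 5 )
      INTEGER I, J, NB, NNB, NNB2
      DATA NBVAL / 1, 3, 3, 3, 20 /
      NNB = 5
      NNB2 = 0
      DO 60 I = 1, NNB
         NB = NBVAL( I )
         DO 50 J = 1, NNB2
            IF( NB.EQ.NBVAL2( J ) )
     $         GO TO 60
   50    CONTINUE
         NNB2 = NNB2 + 1
         NBVAL2( NNB2 ) = NB
   60 CONTINUE
      WRITE( *, * ) ( NBVAL2( J ), J = 1, NNB2 )
      END

With the sample input it prints 1, 3, 20, so each block size is exercised exactly once no matter how often the data file repeats it.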
+* + EPS = DLAMCH( 'Underflow threshold' ) + WRITE( NOUT, FMT = 9991 )'underflow', EPS + EPS = DLAMCH( 'Overflow threshold' ) + WRITE( NOUT, FMT = 9991 )'overflow ', EPS + EPS = DLAMCH( 'Epsilon' ) + WRITE( NOUT, FMT = 9991 )'precision', EPS + WRITE( NOUT, FMT = * ) +* + 80 CONTINUE +* +* Read a test path and the number of matrix types to use. +* + READ( NIN, FMT = '(A72)', END = 140 )ALINE + PATH = ALINE( 1: 3 ) + NMATS = MATMAX + I = 3 + 90 CONTINUE + I = I + 1 + IF( I.GT.72 ) THEN + NMATS = MATMAX + GO TO 130 + END IF + IF( ALINE( I: I ).EQ.' ' ) + $ GO TO 90 + NMATS = 0 + 100 CONTINUE + C1 = ALINE( I: I ) + DO 110 K = 1, 10 + IF( C1.EQ.INTSTR( K: K ) ) THEN + IC = K - 1 + GO TO 120 + END IF + 110 CONTINUE + GO TO 130 + 120 CONTINUE + NMATS = NMATS*10 + IC + I = I + 1 + IF( I.GT.72 ) + $ GO TO 130 + GO TO 100 + 130 CONTINUE + C1 = PATH( 1: 1 ) + C2 = PATH( 2: 3 ) + NRHS = NSVAL( 1 ) +* +* Check first character for correct precision. +* + IF( .NOT.LSAME( C1, 'Double precision' ) ) THEN + WRITE( NOUT, FMT = 9990 )PATH +* + ELSE IF( NMATS.LE.0 ) THEN +* +* Check for a positive number of tests requested. +* + WRITE( NOUT, FMT = 9989 )PATH +* + ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN +* +* GE: general matrices +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKGE( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVGE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN +* +* GB: general banded matrices +* + LA = ( 2*KDMAX+1 )*NMAX + LAFAC = ( 3*KDMAX+1 )*NMAX + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKGB( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, A( 1, 1 ), LA, + $ A( 1, 3 ), LAFAC, B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVGB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), LA, A( 1, 3 ), LAFAC, A( 1, 6 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'GT' ) ) THEN +* +* GT: general tridiagonal matrices +* + NTYPES = 12 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKGT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PO' ) ) THEN +* +* PO: positive definite matrices +* + NTYPES = 9 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKPO( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) 
THEN + CALL DDRVPO( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PS' ) ) THEN +* +* PS: positive semi-definite matrices +* + NTYPES = 9 +* + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKPS( DOTYPE, NN, NVAL, NNB2, NBVAL2, NRANK, + $ RANKVAL, THRESH, TSTERR, LDA, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), PIV, WORK, RWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PP' ) ) THEN +* +* PP: positive definite packed matrices +* + NTYPES = 9 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKPP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVPP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PB' ) ) THEN +* +* PB: positive definite banded matrices +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKPB( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVPB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PT' ) ) THEN +* +* PT: positive definite tridiagonal matrices +* + NTYPES = 12 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SY' ) ) THEN +* +* SY: symmetric indefinite matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKSY( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVSY( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SR' ) ) THEN +* +* SR: symmetric indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKSY_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, 
TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVSY_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SK' ) ) THEN +* +* SK: symmetric indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm, +* different matrix storage format than SR path version. +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKSY_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVSY_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN +* +* SA: symmetric indefinite matrices, +* with partial (Aasen's) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKSY_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVSY_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* +* + ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN +* +* SA: symmetric indefinite matrices, +* with partial (Aasen's) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, + $ NNS, NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVSY_AA_2STAGE( + $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* +* + ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN +* +* SP: symmetric indefinite packed matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKSP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL DDRVSP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN +* +* TR: triangular matrices +* + NTYPES = 18 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKTR( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, 
LDA, A( 1, 1 ), A( 1, 2 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN +* +* TP: triangular packed matrices +* + NTYPES = 18 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKTP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN +* +* TB: triangular banded matrices +* + NTYPES = 17 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKTB( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QR' ) ) THEN +* +* QR: QR factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKQR( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'LQ' ) ) THEN +* +* LQ: LQ factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKLQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QL' ) ) THEN +* +* QL: QL factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKQL( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'RQ' ) ) THEN +* +* RQ: RQ factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKRQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QP' ) ) THEN +* +* QP: QR factorization with pivoting +* + NTYPES = 6 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ THRESH, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 3 ), WORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN +* +* TZ: Trapezoidal matrix +* + NTYPES = 3 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL DCHKTZ( DOTYPE, NM, MVAL, NN, NVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 3 ), WORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN +* +* LS: Least squares drivers +* + NTYPES = 6 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTDRV ) THEN 
+ CALL DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, + $ NBVAL, NXVAL, THRESH, TSTERR, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ RWORK, RWORK( NMAX+1 ), NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'EQ' ) ) THEN +* +* EQ: Equilibration routines for general and positive definite +* matrices (THREQ should be between 2 and 10) +* + IF( TSTCHK ) THEN + CALL DCHKEQ( THREQ, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QT' ) ) THEN +* +* QT: QRT routines for general matrices +* + IF( TSTCHK ) THEN + CALL DCHKQRT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QX' ) ) THEN +* +* QX: QRT routines for triangular-pentagonal matrices +* + IF( TSTCHK ) THEN + CALL DCHKQRTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN +* +* TQ: LQT routines for general matrices +* + IF( TSTCHK ) THEN + CALL DCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN +* +* XQ: LQT routines for triangular-pentagonal matrices +* + IF( TSTCHK ) THEN + CALL DCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN +* +* TS: QR routines for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL DCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN +* +* HH: Householder reconstruction for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 ) PATH + END IF +* + ELSE + +* + WRITE( NOUT, FMT = 9990 )PATH + END IF +* +* Go back to get another input line. +* + GO TO 80 +* +* Branch to this line when the last record is read. 
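The epilogue that follows releases the arrays the driver ALLOCATEd at startup, and the pattern is worth noting: allocation failures abort immediately with a STOP message, while every DEALLOCATE gets a STAT argument so cleanup can never halt a finished run. A minimal sketch of the same pattern (ADEMO is an illustrative name and the array shape is arbitrary):

      PROGRAM ADEMO
*     STAT-checked allocate/deallocate, as used by the test drivers.
      DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A
      INTEGER AllocateStatus
      ALLOCATE ( A( 132, 7 ), STAT = AllocateStatus )
      IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
      A = 0.0D0
      WRITE( *, * ) 'allocated ', SIZE( A ), ' elements'
      DEALLOCATE (A, STAT = AllocateStatus)
      END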
+*
+  140 CONTINUE
+      CLOSE ( NIN )
+      S2 = DSECND( )
+      WRITE( NOUT, FMT = 9998 )
+      WRITE( NOUT, FMT = 9997 )S2 - S1
+*
+      DEALLOCATE (A, STAT = AllocateStatus)
+      DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (WORK, STAT = AllocateStatus)
+      DEALLOCATE (RWORK, STAT = AllocateStatus)
+*
+ 9999 FORMAT( / ' Execution not attempted due to input errors' )
+ 9998 FORMAT( / ' End of tests' )
+ 9997 FORMAT( ' Total time used = ', F12.2, ' seconds', / )
+ 9996 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be >=',
+     $      I6 )
+ 9995 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be <=',
+     $      I6 )
+ 9994 FORMAT( ' Tests of the DOUBLE PRECISION LAPACK routines ',
+     $      / ' LAPACK VERSION ', I1, '.', I1, '.', I1,
+     $      / / ' The following parameter values will be used:' )
+ 9993 FORMAT( 4X, A4, ': ', 10I6, / 11X, 10I6 )
+ 9992 FORMAT( / ' Routines pass computational tests if test ratio is ',
+     $      'less than', F8.2, / )
+ 9991 FORMAT( ' Relative machine ', A, ' is taken to be', D16.6 )
+ 9990 FORMAT( / 1X, A3, ': Unrecognized path name' )
+ 9989 FORMAT( / 1X, A3, ' routines were not tested' )
+ 9988 FORMAT( / 1X, A3, ' driver routines were not tested' )
+*
+*     End of DCHKAA
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/schkaa.F b/lapack-netlib/TESTING/LIN/schkaa.F
new file mode 100644
index 000000000..a5b826d06
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/schkaa.F
@@ -0,0 +1,1074 @@
+*> \brief \b SCHKAA
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       PROGRAM SCHKAA
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SCHKAA is the main test program for the REAL LAPACK
+*> linear equation routines
+*>
+*> The program must be driven by a short data file. The first 15 records
+*> (not including the first comment line) specify problem dimensions
+*> and program options using list-directed input. The remaining lines
+*> specify the LAPACK test paths and the number of matrix types to use
+*> in testing. An annotated example of a data file can be obtained by
+*> deleting the first 3 characters from the following 40 lines:
+*> Data file for testing REAL LAPACK linear eqn.
routines +*> 7 Number of values of M +*> 0 1 2 3 5 10 16 Values of M (row dimension) +*> 7 Number of values of N +*> 0 1 2 3 5 10 16 Values of N (column dimension) +*> 1 Number of values of NRHS +*> 2 Values of NRHS (number of right hand sides) +*> 5 Number of values of NB +*> 1 3 3 3 20 Values of NB (the blocksize) +*> 1 0 5 9 1 Values of NX (crossover point) +*> 3 Number of values of RANK +*> 30 50 90 Values of rank (as a % of N) +*> 20.0 Threshold value of test ratio +*> T Put T to test the LAPACK routines +*> T Put T to test the driver routines +*> T Put T to test the error exits +*> SGE 11 List types on next line if 0 < NTYPES < 11 +*> SGB 8 List types on next line if 0 < NTYPES < 8 +*> SGT 12 List types on next line if 0 < NTYPES < 12 +*> SPO 9 List types on next line if 0 < NTYPES < 9 +*> SPS 9 List types on next line if 0 < NTYPES < 9 +*> SPP 9 List types on next line if 0 < NTYPES < 9 +*> SPB 8 List types on next line if 0 < NTYPES < 8 +*> SPT 12 List types on next line if 0 < NTYPES < 12 +*> SSY 10 List types on next line if 0 < NTYPES < 10 +*> SSR 10 List types on next line if 0 < NTYPES < 10 +*> SSK 10 List types on next line if 0 < NTYPES < 10 +*> SSA 10 List types on next line if 0 < NTYPES < 10 +*> SS2 10 List types on next line if 0 < NTYPES < 10 +*> SSP 10 List types on next line if 0 < NTYPES < 10 +*> STR 18 List types on next line if 0 < NTYPES < 18 +*> STP 18 List types on next line if 0 < NTYPES < 18 +*> STB 17 List types on next line if 0 < NTYPES < 17 +*> SQR 8 List types on next line if 0 < NTYPES < 8 +*> SRQ 8 List types on next line if 0 < NTYPES < 8 +*> SLQ 8 List types on next line if 0 < NTYPES < 8 +*> SQL 8 List types on next line if 0 < NTYPES < 8 +*> SQP 6 List types on next line if 0 < NTYPES < 6 +*> STZ 3 List types on next line if 0 < NTYPES < 3 +*> SLS 6 List types on next line if 0 < NTYPES < 6 +*> SEQ +*> SQT +*> SQX +*> STS +*> SHH +*> \endverbatim +* +* Parameters: +* ========== +* +*> \verbatim +*> NMAX INTEGER +*> The maximum allowable value for M and N. +*> +*> MAXIN INTEGER +*> The number of different values that can be used for each of +*> M, N, NRHS, NB, NX and RANK +*> +*> MAXRHS INTEGER +*> The maximum number of right hand sides +*> +*> MATMAX INTEGER +*> The maximum number of matrix types to use for testing +*> +*> NIN INTEGER +*> The unit number for input +*> +*> NOUT INTEGER +*> The unit number for output +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup single_lin +* +* ===================================================================== + PROGRAM SCHKAA +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NMAX + PARAMETER ( NMAX = 132 ) + INTEGER MAXIN + PARAMETER ( MAXIN = 12 ) + INTEGER MAXRHS + PARAMETER ( MAXRHS = 16 ) + INTEGER MATMAX + PARAMETER ( MATMAX = 30 ) + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER KDMAX + PARAMETER ( KDMAX = NMAX+( NMAX+1 ) / 4 ) +* .. +* .. Local Scalars .. 
+ LOGICAL FATAL, TSTCHK, TSTDRV, TSTERR + CHARACTER C1 + CHARACTER*2 C2 + CHARACTER*3 PATH + CHARACTER*10 INTSTR + CHARACTER*72 ALINE + INTEGER I, IC, J, K, LA, LAFAC, LDA, NB, NM, NMATS, NN, + $ NNB, NNB2, NNS, NRHS, NTYPES, NRANK, + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + REAL EPS, S1, S2, THREQ, THRESH +* .. +* .. Local Arrays .. + LOGICAL DOTYPE( MATMAX ) + INTEGER IWORK( 25*NMAX ), MVAL( MAXIN ), + $ NBVAL( MAXIN ), NBVAL2( MAXIN ), + $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), + $ RANKVAL( MAXIN ), PIV( NMAX ) + REAL E( NMAX ), S( 2*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: RWORK + REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK +* .. +* .. External Functions .. + LOGICAL LSAME, LSAMEN + REAL SECOND, SLAMCH + EXTERNAL LSAME, LSAMEN, SECOND, SLAMCH +* .. +* .. External Subroutines .. + EXTERNAL ALAREQ, SCHKEQ, SCHKGB, SCHKGE, SCHKGT, SCHKLQ, + $ SCHKORHR_COL, SCHKPB, SCHKPO, SCHKPS, SCHKPP, + $ SCHKPT, SCHKQ3, SCHKQL, SCHKQR, SCHKRQ, SCHKSP, + $ SCHKSY, SCHKSY_ROOK, SCHKSY_RK, SCHKSY_AA, + $ SCHKTB, SCHKTP, SCHKTR, SCHKTZ, SDRVGB, SDRVGE, + $ SDRVGT, SDRVLS, SDRVPB, SDRVPO, SDRVPP, SDRVPT, + $ SDRVSP, SDRVSY, SDRVSY_ROOK, SDRVSY_RK, + $ SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT, SCHKQRTP, + $ SCHKLQT, SCHKTSQR +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, NUNIT +* .. +* .. Arrays in Common .. + INTEGER IPARMS( 100 ) +* .. +* .. Common blocks .. + COMMON / CLAENV / IPARMS + COMMON / INFOC / INFOT, NUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT +* .. +* .. Data statements .. + DATA THREQ / 2.0E0 / , INTSTR / '0123456789' / +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE (A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (WORK( NMAX, NMAX+MAXRHS+30 ) , STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. +* .. Executable Statements .. +* + S1 = SECOND( ) + LDA = NMAX + FATAL = .FALSE. +* +* Read a dummy line. +* + READ( NIN, FMT = * ) +* +* Report values of parameters. +* + CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) + WRITE( NOUT, FMT = 9994 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH +* +* Read the values of M +* + READ( NIN, FMT = * )NM + IF( NM.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NM ', NM, 1 + NM = 0 + FATAL = .TRUE. + ELSE IF( NM.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NM ', NM, MAXIN + NM = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( MVAL( I ), I = 1, NM ) + DO 10 I = 1, NM + IF( MVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' M ', MVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( MVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9995 )' M ', MVAL( I ), NMAX + FATAL = .TRUE. + END IF + 10 CONTINUE + IF( NM.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'M ', ( MVAL( I ), I = 1, NM ) +* +* Read the values of N +* + READ( NIN, FMT = * )NN + IF( NN.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NN ', NN, 1 + NN = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NN ', NN, MAXIN + NN = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) + DO 20 I = 1, NN + IF( NVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' N ', NVAL( I ), 0 + FATAL = .TRUE. 
+ ELSE IF( NVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9995 )' N ', NVAL( I ), NMAX + FATAL = .TRUE. + END IF + 20 CONTINUE + IF( NN.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'N ', ( NVAL( I ), I = 1, NN ) +* +* Read the values of NRHS +* + READ( NIN, FMT = * )NNS + IF( NNS.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NNS', NNS, 1 + NNS = 0 + FATAL = .TRUE. + ELSE IF( NNS.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NNS', NNS, MAXIN + NNS = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NSVAL( I ), I = 1, NNS ) + DO 30 I = 1, NNS + IF( NSVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )'NRHS', NSVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NSVAL( I ).GT.MAXRHS ) THEN + WRITE( NOUT, FMT = 9995 )'NRHS', NSVAL( I ), MAXRHS + FATAL = .TRUE. + END IF + 30 CONTINUE + IF( NNS.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NRHS', ( NSVAL( I ), I = 1, NNS ) +* +* Read the values of NB +* + READ( NIN, FMT = * )NNB + IF( NNB.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )'NNB ', NNB, 1 + NNB = 0 + FATAL = .TRUE. + ELSE IF( NNB.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )'NNB ', NNB, MAXIN + NNB = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) + DO 40 I = 1, NNB + IF( NBVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' NB ', NBVAL( I ), 0 + FATAL = .TRUE. + END IF + 40 CONTINUE + IF( NNB.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NB ', ( NBVAL( I ), I = 1, NNB ) +* +* Set NBVAL2 to be the set of unique values of NB +* + NNB2 = 0 + DO 60 I = 1, NNB + NB = NBVAL( I ) + DO 50 J = 1, NNB2 + IF( NB.EQ.NBVAL2( J ) ) + $ GO TO 60 + 50 CONTINUE + NNB2 = NNB2 + 1 + NBVAL2( NNB2 ) = NB + 60 CONTINUE +* +* Read the values of NX +* + READ( NIN, FMT = * )( NXVAL( I ), I = 1, NNB ) + DO 70 I = 1, NNB + IF( NXVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' NX ', NXVAL( I ), 0 + FATAL = .TRUE. + END IF + 70 CONTINUE + IF( NNB.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NX ', ( NXVAL( I ), I = 1, NNB ) +* +* Read the values of RANKVAL +* + READ( NIN, FMT = * )NRANK + IF( NN.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NRANK ', NRANK, 1 + NRANK = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NRANK ', NRANK, MAXIN + NRANK = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( RANKVAL( I ), I = 1, NRANK ) + DO I = 1, NRANK + IF( RANKVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' RANK ', RANKVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( RANKVAL( I ).GT.100 ) THEN + WRITE( NOUT, FMT = 9995 )' RANK ', RANKVAL( I ), 100 + FATAL = .TRUE. + END IF + END DO + IF( NRANK.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'RANK % OF N', + $ ( RANKVAL( I ), I = 1, NRANK ) +* +* Read the threshold value for the test ratios. +* + READ( NIN, FMT = * )THRESH + WRITE( NOUT, FMT = 9992 )THRESH +* +* Read the flag that indicates whether to test the LAPACK routines. +* + READ( NIN, FMT = * )TSTCHK +* +* Read the flag that indicates whether to test the driver routines. +* + READ( NIN, FMT = * )TSTDRV +* +* Read the flag that indicates whether to test the error exits. +* + READ( NIN, FMT = * )TSTERR +* + IF( FATAL ) THEN + WRITE( NOUT, FMT = 9999 ) + STOP + END IF +* +* Calculate and print the machine dependent constants. +* + EPS = SLAMCH( 'Underflow threshold' ) + WRITE( NOUT, FMT = 9991 )'underflow', EPS + EPS = SLAMCH( 'Overflow threshold' ) + WRITE( NOUT, FMT = 9991 )'overflow ', EPS + EPS = SLAMCH( 'Epsilon' ) + WRITE( NOUT, FMT = 9991 )'precision', EPS + WRITE( NOUT, FMT = * ) +* + 80 CONTINUE +* +* Read a test path and the number of matrix types to use. 
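The parsing after labels 90/100 below extracts NMATS from the free text following the path code by hand: skip blanks past column 3, then accumulate decimal digits by lookup against the INTSTR string. A simplified, runnable sketch of the same digit accumulation, using the INDEX intrinsic in place of the lookup loop and omitting the all-blank fallback to MATMAX (NDEMO and the sample line are illustrative):

      PROGRAM NDEMO
*     Hand-rolled integer parse: skip blanks after the path code,
*     then accumulate decimal digits into NMATS.
      CHARACTER*72 ALINE
      INTEGER I, IC, NMATS
      ALINE = 'DGE 11'
      I = 4
   10 IF( I.LE.72 ) THEN
         IF( ALINE( I: I ).EQ.' ' ) THEN
            I = I + 1
            GO TO 10
         END IF
      END IF
      NMATS = 0
   20 IF( I.LE.72 ) THEN
         IC = INDEX( '0123456789', ALINE( I: I ) )
         IF( IC.GT.0 ) THEN
            NMATS = NMATS*10 + IC - 1
            I = I + 1
            GO TO 20
         END IF
      END IF
      WRITE( *, * ) 'NMATS =', NMATS
      END

For the sample record 'DGE 11' this yields NMATS = 11, matching the "List types on next line if 0 < NTYPES < 11" convention in the data file above.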
+* + READ( NIN, FMT = '(A72)', END = 140 )ALINE + PATH = ALINE( 1: 3 ) + NMATS = MATMAX + I = 3 + 90 CONTINUE + I = I + 1 + IF( I.GT.72 ) THEN + NMATS = MATMAX + GO TO 130 + END IF + IF( ALINE( I: I ).EQ.' ' ) + $ GO TO 90 + NMATS = 0 + 100 CONTINUE + C1 = ALINE( I: I ) + DO 110 K = 1, 10 + IF( C1.EQ.INTSTR( K: K ) ) THEN + IC = K - 1 + GO TO 120 + END IF + 110 CONTINUE + GO TO 130 + 120 CONTINUE + NMATS = NMATS*10 + IC + I = I + 1 + IF( I.GT.72 ) + $ GO TO 130 + GO TO 100 + 130 CONTINUE + C1 = PATH( 1: 1 ) + C2 = PATH( 2: 3 ) + NRHS = NSVAL( 1 ) +* +* Check first character for correct precision. +* + IF( .NOT.LSAME( C1, 'Single precision' ) ) THEN + WRITE( NOUT, FMT = 9990 )PATH +* + ELSE IF( NMATS.LE.0 ) THEN +* +* Check for a positive number of tests requested. +* + WRITE( NOUT, FMT = 9989 )PATH +* + ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN +* +* GE: general matrices +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKGE( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVGE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN +* +* GB: general banded matrices +* + LA = ( 2*KDMAX+1 )*NMAX + LAFAC = ( 3*KDMAX+1 )*NMAX + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKGB( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, A( 1, 1 ), LA, + $ A( 1, 3 ), LAFAC, B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVGB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), LA, A( 1, 3 ), LAFAC, A( 1, 6 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'GT' ) ) THEN +* +* GT: general tridiagonal matrices +* + NTYPES = 12 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKGT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PO' ) ) THEN +* +* PO: positive definite matrices +* + NTYPES = 9 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKPO( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVPO( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PS' ) ) THEN +* +* PS: positive semi-definite matrices +* + NTYPES = 9 +* + CALL 
ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKPS( DOTYPE, NN, NVAL, NNB2, NBVAL2, NRANK, + $ RANKVAL, THRESH, TSTERR, LDA, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), PIV, WORK, RWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PP' ) ) THEN +* +* PP: positive definite packed matrices +* + NTYPES = 9 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKPP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVPP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PB' ) ) THEN +* +* PB: positive definite banded matrices +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKPB( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVPB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PT' ) ) THEN +* +* PT: positive definite tridiagonal matrices +* + NTYPES = 12 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SY' ) ) THEN +* +* SY: symmetric indefinite matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKSY( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVSY( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SR' ) ) THEN +* +* SR: symmetric indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKSY_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVSY_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, 
IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SK' ) ) THEN +* +* SK: symmetric indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm, +* different matrix storage format than SR path version. +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKSY_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVSY_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN +* +* SA: symmetric indefinite matrices, +* with partial (Aasen's) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKSY_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVSY_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN +* +* SA: symmetric indefinite matrices, +* with partial (Aasen's) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, + $ NNS, NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVSY_AA_2STAGE( + $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN +* +* SP: symmetric indefinite packed matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKSP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL SDRVSP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN +* +* TR: triangular matrices +* + NTYPES = 18 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKTR( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN +* +* TP: triangular packed matrices +* + NTYPES = 18 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKTP( DOTYPE, NN, 
NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN +* +* TB: triangular banded matrices +* + NTYPES = 17 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKTB( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QR' ) ) THEN +* +* QR: QR factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKQR( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'LQ' ) ) THEN +* +* LQ: LQ factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKLQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QL' ) ) THEN +* +* QL: QL factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKQL( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'RQ' ) ) THEN +* +* RQ: RQ factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKRQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QP' ) ) THEN +* +* QP: QR factorization with pivoting +* + NTYPES = 6 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ THRESH, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 3 ), WORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN +* +* TZ: Trapezoidal matrix +* + NTYPES = 3 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL SCHKTZ( DOTYPE, NM, MVAL, NN, NVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 3 ), WORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN +* +* LS: Least squares drivers +* + NTYPES = 6 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTDRV ) THEN + CALL SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, + $ NBVAL, NXVAL, THRESH, TSTERR, A( 1, 1 ), + $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ RWORK, RWORK( NMAX+1 ), NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'EQ' ) ) THEN +* +* EQ: Equilibration routines for general and positive definite 
+* matrices (THREQ should be between 2 and 10) +* + IF( TSTCHK ) THEN + CALL SCHKEQ( THREQ, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QT' ) ) THEN +* +* QT: QRT routines for general matrices +* + IF( TSTCHK ) THEN + CALL SCHKQRT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QX' ) ) THEN +* +* QX: QRT routines for triangular-pentagonal matrices +* + IF( TSTCHK ) THEN + CALL SCHKQRTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN +* +* TQ: LQT routines for general matrices +* + IF( TSTCHK ) THEN + CALL SCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN +* +* XQ: LQT routines for triangular-pentagonal matrices +* + IF( TSTCHK ) THEN + CALL SCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN +* +* TS: QR routines for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL SCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN +* +* HH: Householder reconstruction for tall-skinny matrices +* + IF( TSTCHK ) THEN + CALL SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 ) PATH + END IF +* + ELSE +* + WRITE( NOUT, FMT = 9990 )PATH + END IF +* +* Go back to get another input line. +* + GO TO 80 +* +* Branch to this line when the last record is read. +* + 140 CONTINUE + CLOSE ( NIN ) + S2 = SECOND( ) + WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) +* + 9999 FORMAT( / ' Execution not attempted due to input errors' ) + 9998 FORMAT( / ' End of tests' ) + 9997 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) + 9996 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be >=', + $ I6 ) + 9995 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be <=', + $ I6 ) + 9994 FORMAT( ' Tests of the REAL LAPACK routines ', + $ / ' LAPACK VERSION ', I1, '.', I1, '.', I1, + $ / / ' The following parameter values will be used:' ) + 9993 FORMAT( 4X, A4, ': ', 10I6, / 11X, 10I6 ) + 9992 FORMAT( / ' Routines pass computational tests if test ratio is ', + $ 'less than', F8.2, / ) + 9991 FORMAT( ' Relative machine ', A, ' is taken to be', E16.6 ) + 9990 FORMAT( / 1X, A3, ': Unrecognized path name' ) + 9989 FORMAT( / 1X, A3, ' routines were not tested' ) + 9988 FORMAT( / 1X, A3, ' driver routines were not tested' ) +* +* End of SCHKAA +* + END diff --git a/lapack-netlib/TESTING/LIN/zchkaa.F b/lapack-netlib/TESTING/LIN/zchkaa.F new file mode 100644 index 000000000..a118515a5 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/zchkaa.F @@ -0,0 +1,1271 @@ +*> \brief \b ZCHKAA +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* PROGRAM ZCHKAA +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZCHKAA is the main test program for the COMPLEX*16 linear equation +*> routines. 
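SCHKAA above brackets the whole sweep with S1 = SECOND( ) ... S2 = SECOND( ) and reports the difference through FORMAT 9997; ZCHKAA below does the same with DSECND. A minimal free-form sketch of that bracket, assuming the standard CPU_TIME intrinsic in place of LAPACK's timers and a hypothetical RUN_ALL_TESTS placeholder for the PATH dispatch:

    PROGRAM TIMED_RUN
       IMPLICIT NONE
       REAL :: S1, S2
       CALL CPU_TIME( S1 )           ! start of the test sweep
       CALL RUN_ALL_TESTS( )         ! dispatch on PATH, as above
       CALL CPU_TIME( S2 )
       WRITE( *, '(A,F12.2,A)' ) ' Total time used = ', S2 - S1,
     &                           ' seconds'
    CONTAINS
       SUBROUTINE RUN_ALL_TESTS( )
       END SUBROUTINE RUN_ALL_TESTS
    END PROGRAM TIMED_RUN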
+*> +*> The program must be driven by a short data file. The first 15 records +*> (not including the first comment line) specify problem dimensions +*> and program options using list-directed input. The remaining lines +*> specify the LAPACK test paths and the number of matrix types to use +*> in testing. An annotated example of a data file can be obtained by +*> deleting the first 3 characters from the following 42 lines: +*> Data file for testing COMPLEX*16 LAPACK linear equation routines +*> 7 Number of values of M +*> 0 1 2 3 5 10 16 Values of M (row dimension) +*> 7 Number of values of N +*> 0 1 2 3 5 10 16 Values of N (column dimension) +*> 1 Number of values of NRHS +*> 2 Values of NRHS (number of right hand sides) +*> 5 Number of values of NB +*> 1 3 3 3 20 Values of NB (the blocksize) +*> 1 0 5 9 1 Values of NX (crossover point) +*> 3 Number of values of RANK +*> 30 50 90 Values of rank (as a % of N) +*> 30.0 Threshold value of test ratio +*> T Put T to test the LAPACK routines +*> T Put T to test the driver routines +*> T Put T to test the error exits +*> ZGE 11 List types on next line if 0 < NTYPES < 11 +*> ZGB 8 List types on next line if 0 < NTYPES < 8 +*> ZGT 12 List types on next line if 0 < NTYPES < 12 +*> ZPO 9 List types on next line if 0 < NTYPES < 9 +*> ZPS 9 List types on next line if 0 < NTYPES < 9 +*> ZPP 9 List types on next line if 0 < NTYPES < 9 +*> ZPB 8 List types on next line if 0 < NTYPES < 8 +*> ZPT 12 List types on next line if 0 < NTYPES < 12 +*> ZHE 10 List types on next line if 0 < NTYPES < 10 +*> ZHR 10 List types on next line if 0 < NTYPES < 10 +*> ZHK 10 List types on next line if 0 < NTYPES < 10 +*> ZHA 10 List types on next line if 0 < NTYPES < 10 +*> ZH2 10 List types on next line if 0 < NTYPES < 10 +*> ZSA 11 List types on next line if 0 < NTYPES < 10 +*> ZS2 11 List types on next line if 0 < NTYPES < 10 +*> ZHP 10 List types on next line if 0 < NTYPES < 10 +*> ZSY 11 List types on next line if 0 < NTYPES < 11 +*> ZSR 11 List types on next line if 0 < NTYPES < 11 +*> ZSK 11 List types on next line if 0 < NTYPES < 11 +*> ZSP 11 List types on next line if 0 < NTYPES < 11 +*> ZTR 18 List types on next line if 0 < NTYPES < 18 +*> ZTP 18 List types on next line if 0 < NTYPES < 18 +*> ZTB 17 List types on next line if 0 < NTYPES < 17 +*> ZQR 8 List types on next line if 0 < NTYPES < 8 +*> ZRQ 8 List types on next line if 0 < NTYPES < 8 +*> ZLQ 8 List types on next line if 0 < NTYPES < 8 +*> ZQL 8 List types on next line if 0 < NTYPES < 8 +*> ZQP 6 List types on next line if 0 < NTYPES < 6 +*> ZTZ 3 List types on next line if 0 < NTYPES < 3 +*> ZLS 6 List types on next line if 0 < NTYPES < 6 +*> ZEQ +*> ZQT +*> ZQX +*> ZTS +*> ZHH +*> \endverbatim +* +* Parameters: +* ========== +* +*> \verbatim +*> NMAX INTEGER +*> The maximum allowable value for M and N. +*> +*> MAXIN INTEGER +*> The number of different values that can be used for each of +*> M, N, NRHS, NB, NX and RANK +*> +*> MAXRHS INTEGER +*> The maximum number of right hand sides +*> +*> MATMAX INTEGER +*> The maximum number of matrix types to use for testing +*> +*> NIN INTEGER +*> The unit number for input +*> +*> NOUT INTEGER +*> The unit number for output +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
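List-directed input (FMT = *) is what lets each record above pair a value with a trailing comment: the READ consumes the leading numbers and skips the rest of the record once its input list is satisfied. A minimal free-form sketch reading the first records of such a file on the default unit (standing in for NIN = 5); READ_DIMS is a hypothetical name:

    PROGRAM READ_DIMS
       IMPLICIT NONE
       INTEGER, PARAMETER :: MAXIN = 12
       INTEGER :: NM, I
       INTEGER :: MVAL( MAXIN )
       READ( *, FMT = * )                  ! skip the leading comment record
       READ( *, FMT = * ) NM               ! e.g. "7  Number of values of M"
       IF( NM.LT.1 .OR. NM.GT.MAXIN ) STOP 'NM out of range'
       READ( *, FMT = * ) ( MVAL( I ), I = 1, NM )
       WRITE( *, '(A,12I6)' ) ' M values:', ( MVAL( I ), I = 1, NM )
    END PROGRAM READ_DIMS

Fed the sample records above ("7 Number of values of M" / "0 1 2 3 5 10 16 Values of M"), it prints the seven M values; the annotation text after each number is discarded by the list-directed READ.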
+* +*> \ingroup complex16_lin +* +* ===================================================================== + PROGRAM ZCHKAA +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* ===================================================================== +* +* .. Parameters .. + INTEGER NMAX + PARAMETER ( NMAX = 132 ) + INTEGER MAXIN + PARAMETER ( MAXIN = 12 ) + INTEGER MAXRHS + PARAMETER ( MAXRHS = 16 ) + INTEGER MATMAX + PARAMETER ( MATMAX = 30 ) + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER KDMAX + PARAMETER ( KDMAX = NMAX+( NMAX+1 ) / 4 ) +* .. +* .. Local Scalars .. + LOGICAL FATAL, TSTCHK, TSTDRV, TSTERR + CHARACTER C1 + CHARACTER*2 C2 + CHARACTER*3 PATH + CHARACTER*10 INTSTR + CHARACTER*72 ALINE + INTEGER I, IC, J, K, LA, LAFAC, LDA, NB, NM, NMATS, NN, + $ NNB, NNB2, NNS, NRHS, NTYPES, NRANK, + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + DOUBLE PRECISION EPS, S1, S2, THREQ, THRESH +* .. +* .. Local Arrays .. + LOGICAL DOTYPE( MATMAX ) + INTEGER IWORK( 25*NMAX ), MVAL( MAXIN ), + $ NBVAL( MAXIN ), NBVAL2( MAXIN ), + $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), + $ RANKVAL( MAXIN ), PIV( NMAX ) + DOUBLE PRECISION S( 2*NMAX ) + COMPLEX*16 E( NMAX ) +* +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE:: RWORK + COMPLEX*16, DIMENSION(:,:), ALLOCATABLE:: A, B, WORK +* .. +* .. External Functions .. + LOGICAL LSAME, LSAMEN + DOUBLE PRECISION DLAMCH, DSECND + EXTERNAL LSAME, LSAMEN, DLAMCH, DSECND +* .. +* .. External Subroutines .. + EXTERNAL ALAREQ, ZCHKEQ, ZCHKGB, ZCHKGE, ZCHKGT, ZCHKHE, + $ ZCHKHE_ROOK, ZCHKHE_RK, ZCHKHE_AA, ZCHKHP, + $ ZCHKLQ, ZCHKUNHR_COL, ZCHKPB, ZCHKPO, ZCHKPS, + $ ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQL, ZCHKQR, ZCHKRQ, + $ ZCHKSP, ZCHKSY, ZCHKSY_ROOK, ZCHKSY_RK, + $ ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR, ZCHKTZ, + $ ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE, ZDRVHE_ROOK, + $ ZDRVHE_RK, ZDRVHE_AA, ZDRVHE_AA_2STAGE, ZDRVHP, + $ ZDRVLS, ZDRVPB, ZDRVPO, ZDRVPP, ZDRVPT, + $ ZDRVSP, ZDRVSY, ZDRVSY_ROOK, ZDRVSY_RK, + $ ZDRVSY_AA, ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, + $ ZCHKQRTP, ZCHKLQT, ZCHKLQTP, ZCHKTSQR +* .. +* .. Scalars in Common .. + LOGICAL LERR, OK + CHARACTER*32 SRNAMT + INTEGER INFOT, NUNIT +* .. +* .. Arrays in Common .. + INTEGER IPARMS( 100 ) +* .. +* .. Common blocks .. + COMMON / INFOC / INFOT, NUNIT, OK, LERR + COMMON / SRNAMC / SRNAMT + COMMON / CLAENV / IPARMS +* .. +* .. Data statements .. + DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / +* +* .. Allocate memory dynamically .. + ALLOCATE (RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (A ((KDMAX+1) * NMAX, 7), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (B (NMAX * MAXRHS, 4), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (WORK (NMAX, NMAX+MAXRHS+10), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. +* .. Executable Statements .. +* + S1 = DSECND( ) + LDA = NMAX + FATAL = .FALSE. +* +* Read a dummy line. +* + READ( NIN, FMT = * ) +* +* Report values of parameters. +* + CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) + WRITE( NOUT, FMT = 9994 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH +* +* Read the values of M +* + READ( NIN, FMT = * )NM + IF( NM.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NM ', NM, 1 + NM = 0 + FATAL = .TRUE. 
+ ELSE IF( NM.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NM ', NM, MAXIN + NM = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( MVAL( I ), I = 1, NM ) + DO 10 I = 1, NM + IF( MVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' M ', MVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( MVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9995 )' M ', MVAL( I ), NMAX + FATAL = .TRUE. + END IF + 10 CONTINUE + IF( NM.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'M ', ( MVAL( I ), I = 1, NM ) +* +* Read the values of N +* + READ( NIN, FMT = * )NN + IF( NN.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NN ', NN, 1 + NN = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NN ', NN, MAXIN + NN = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) + DO 20 I = 1, NN + IF( NVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' N ', NVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NVAL( I ).GT.NMAX ) THEN + WRITE( NOUT, FMT = 9995 )' N ', NVAL( I ), NMAX + FATAL = .TRUE. + END IF + 20 CONTINUE + IF( NN.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'N ', ( NVAL( I ), I = 1, NN ) +* +* Read the values of NRHS +* + READ( NIN, FMT = * )NNS + IF( NNS.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NNS', NNS, 1 + NNS = 0 + FATAL = .TRUE. + ELSE IF( NNS.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NNS', NNS, MAXIN + NNS = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NSVAL( I ), I = 1, NNS ) + DO 30 I = 1, NNS + IF( NSVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )'NRHS', NSVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( NSVAL( I ).GT.MAXRHS ) THEN + WRITE( NOUT, FMT = 9995 )'NRHS', NSVAL( I ), MAXRHS + FATAL = .TRUE. + END IF + 30 CONTINUE + IF( NNS.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NRHS', ( NSVAL( I ), I = 1, NNS ) +* +* Read the values of NB +* + READ( NIN, FMT = * )NNB + IF( NNB.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )'NNB ', NNB, 1 + NNB = 0 + FATAL = .TRUE. + ELSE IF( NNB.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )'NNB ', NNB, MAXIN + NNB = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) + DO 40 I = 1, NNB + IF( NBVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' NB ', NBVAL( I ), 0 + FATAL = .TRUE. + END IF + 40 CONTINUE + IF( NNB.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NB ', ( NBVAL( I ), I = 1, NNB ) +* +* Set NBVAL2 to be the set of unique values of NB +* + NNB2 = 0 + DO 60 I = 1, NNB + NB = NBVAL( I ) + DO 50 J = 1, NNB2 + IF( NB.EQ.NBVAL2( J ) ) + $ GO TO 60 + 50 CONTINUE + NNB2 = NNB2 + 1 + NBVAL2( NNB2 ) = NB + 60 CONTINUE +* +* Read the values of NX +* + READ( NIN, FMT = * )( NXVAL( I ), I = 1, NNB ) + DO 70 I = 1, NNB + IF( NXVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' NX ', NXVAL( I ), 0 + FATAL = .TRUE. + END IF + 70 CONTINUE + IF( NNB.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'NX ', ( NXVAL( I ), I = 1, NNB ) +* +* Read the values of RANKVAL +* + READ( NIN, FMT = * )NRANK + IF( NN.LT.1 ) THEN + WRITE( NOUT, FMT = 9996 )' NRANK ', NRANK, 1 + NRANK = 0 + FATAL = .TRUE. + ELSE IF( NN.GT.MAXIN ) THEN + WRITE( NOUT, FMT = 9995 )' NRANK ', NRANK, MAXIN + NRANK = 0 + FATAL = .TRUE. + END IF + READ( NIN, FMT = * )( RANKVAL( I ), I = 1, NRANK ) + DO I = 1, NRANK + IF( RANKVAL( I ).LT.0 ) THEN + WRITE( NOUT, FMT = 9996 )' RANK ', RANKVAL( I ), 0 + FATAL = .TRUE. + ELSE IF( RANKVAL( I ).GT.100 ) THEN + WRITE( NOUT, FMT = 9995 )' RANK ', RANKVAL( I ), 100 + FATAL = .TRUE. + END IF + END DO + IF( NRANK.GT.0 ) + $ WRITE( NOUT, FMT = 9993 )'RANK % OF N', + $ ( RANKVAL( I ), I = 1, NRANK ) +* +* Read the threshold value for the test ratios. 
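The DO 60 loop above deduplicates NBVAL into NBVAL2 with a quadratic scan, which is fine at MAXIN = 12. The same idea as a self-contained free-form sketch, seeded with the NB record from the sample data file; UNIQUE_NB is a hypothetical name:

    PROGRAM UNIQUE_NB
       IMPLICIT NONE
       INTEGER, PARAMETER :: NNB = 5
       INTEGER :: NBVAL( NNB ), NBVAL2( NNB )
       INTEGER :: I, J, NNB2
       NBVAL = (/ 1, 3, 3, 3, 20 /)   ! the NB record from the sample file
       NNB2 = 0
       OUTER: DO I = 1, NNB
          DO J = 1, NNB2
             IF( NBVAL( I ).EQ.NBVAL2( J ) ) CYCLE OUTER   ! already seen
          END DO
          NNB2 = NNB2 + 1
          NBVAL2( NNB2 ) = NBVAL( I )
       END DO OUTER
       WRITE( *, '(A,5I4)' ) ' unique NB:', ( NBVAL2( I ), I = 1, NNB2 )
    END PROGRAM UNIQUE_NB

With the sample blocksizes 1 3 3 3 20 this yields the unique set 1 3 20, so each blocked routine is exercised once per distinct NB rather than once per record.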
+* + READ( NIN, FMT = * )THRESH + WRITE( NOUT, FMT = 9992 )THRESH +* +* Read the flag that indicates whether to test the LAPACK routines. +* + READ( NIN, FMT = * )TSTCHK +* +* Read the flag that indicates whether to test the driver routines. +* + READ( NIN, FMT = * )TSTDRV +* +* Read the flag that indicates whether to test the error exits. +* + READ( NIN, FMT = * )TSTERR +* + IF( FATAL ) THEN + WRITE( NOUT, FMT = 9999 ) + STOP + END IF +* +* Calculate and print the machine dependent constants. +* + EPS = DLAMCH( 'Underflow threshold' ) + WRITE( NOUT, FMT = 9991 )'underflow', EPS + EPS = DLAMCH( 'Overflow threshold' ) + WRITE( NOUT, FMT = 9991 )'overflow ', EPS + EPS = DLAMCH( 'Epsilon' ) + WRITE( NOUT, FMT = 9991 )'precision', EPS + WRITE( NOUT, FMT = * ) + NRHS = NSVAL( 1 ) +* + 80 CONTINUE +* +* Read a test path and the number of matrix types to use. +* + READ( NIN, FMT = '(A72)', END = 140 )ALINE + PATH = ALINE( 1: 3 ) + NMATS = MATMAX + I = 3 + 90 CONTINUE + I = I + 1 + IF( I.GT.72 ) + $ GO TO 130 + IF( ALINE( I: I ).EQ.' ' ) + $ GO TO 90 + NMATS = 0 + 100 CONTINUE + C1 = ALINE( I: I ) + DO 110 K = 1, 10 + IF( C1.EQ.INTSTR( K: K ) ) THEN + IC = K - 1 + GO TO 120 + END IF + 110 CONTINUE + GO TO 130 + 120 CONTINUE + NMATS = NMATS*10 + IC + I = I + 1 + IF( I.GT.72 ) + $ GO TO 130 + GO TO 100 + 130 CONTINUE + C1 = PATH( 1: 1 ) + C2 = PATH( 2: 3 ) +* +* Check first character for correct precision. +* + IF( .NOT.LSAME( C1, 'Zomplex precision' ) ) THEN + WRITE( NOUT, FMT = 9990 )PATH +* + ELSE IF( NMATS.LE.0 ) THEN +* +* Check for a positive number of tests requested. +* + WRITE( NOUT, FMT = 9989 )PATH +* + ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN +* +* GE: general matrices +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKGE( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVGE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN +* +* GB: general banded matrices +* + LA = ( 2*KDMAX+1 )*NMAX + LAFAC = ( 3*KDMAX+1 )*NMAX + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKGB( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, A( 1, 1 ), LA, + $ A( 1, 3 ), LAFAC, B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVGB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), LA, A( 1, 3 ), LAFAC, A( 1, 6 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'GT' ) ) THEN +* +* GT: general tridiagonal matrices +* + NTYPES = 12 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKGT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + 
ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PO' ) ) THEN +* +* PO: positive definite matrices +* + NTYPES = 9 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKPO( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVPO( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PS' ) ) THEN +* +* PS: positive semi-definite matrices +* + NTYPES = 9 +* + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKPS( DOTYPE, NN, NVAL, NNB2, NBVAL2, NRANK, + $ RANKVAL, THRESH, TSTERR, LDA, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), PIV, WORK, RWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PP' ) ) THEN +* +* PP: positive definite packed matrices +* + NTYPES = 9 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKPP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVPP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PB' ) ) THEN +* +* PB: positive definite banded matrices +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKPB( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVPB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, + $ RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'PT' ) ) THEN +* +* PT: positive definite tridiagonal matrices +* + NTYPES = 12 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ A( 1, 1 ), S, A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ A( 1, 1 ), S, A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HE' ) ) THEN +* +* HE: Hermitian indefinite matrices +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKHE( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVHE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ 
NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF + + ELSE IF( LSAMEN( 2, C2, 'HR' ) ) THEN +* +* HR: Hermitian indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm, +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKHE_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVHE_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HK' ) ) THEN +* +* HK: Hermitian indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm, +* different matrix storage format than HR path version. +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKHE_RK ( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVHE_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'HA' ) ) THEN +* +* HA: Hermitian matrices, +* Aasen Algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKHE_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVHE_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN +* +* H2: Hermitian matrices, +* with partial (Aasen's) pivoting algorithm +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, + $ NNS, NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVHE_AA_2STAGE( + $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* +* + ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN +* +* HP: Hermitian indefinite packed matrices +* + NTYPES = 10 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKHP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVHP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 
)PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SY' ) ) THEN +* +* SY: symmetric indefinite matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKSY( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVSY( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SR' ) ) THEN +* +* SR: symmetric indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKSY_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVSY_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SK' ) ) THEN +* +* SK: symmetric indefinite matrices, +* with bounded Bunch-Kaufman (rook) pivoting algorithm, +* different matrix storage format than SR path version. +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKSY_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVSY_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN +* +* SA: symmetric indefinite matrices with Aasen's algorithm, +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKSY_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), + $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVSY_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN +* +* S2: symmetric indefinite matrices with Aasen's algorithm +* 2 stage +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, + $ NSVAL, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVSY_AA_2STAGE( + $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, + $ RWORK, IWORK, NOUT ) + ELSE + 
WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN +* +* SP: symmetric indefinite packed matrices, +* with partial (Bunch-Kaufman) pivoting algorithm +* + NTYPES = 11 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKSP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + IF( TSTDRV ) THEN + CALL ZDRVSP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, + $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9988 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN +* +* TR: triangular matrices +* + NTYPES = 18 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKTR( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, + $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, + $ NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN +* +* TP: triangular packed matrices +* + NTYPES = 18 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKTP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN +* +* TB: triangular banded matrices +* + NTYPES = 17 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKTB( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, + $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), + $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QR' ) ) THEN +* +* QR: QR factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKQR( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, IWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'LQ' ) ) THEN +* +* LQ: LQ factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKLQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'QL' ) ) THEN +* +* QL: QL factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKQL( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ WORK, RWORK, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 )PATH + END IF +* + ELSE IF( LSAMEN( 2, C2, 'RQ' ) ) THEN +* +* RQ: RQ factorization +* + NTYPES = 8 + CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) +* + IF( TSTCHK ) THEN + CALL ZCHKRQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, + $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), + $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), + $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), + $ 
WORK, RWORK, IWORK, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'EQ' ) ) THEN
+*
+*        EQ:  Equilibration routines for general and positive definite
+*             matrices (THREQ should be between 2 and 10)
+*
+         IF( TSTCHK ) THEN
+            CALL ZCHKEQ( THREQ, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN
+*
+*        TZ:  Trapezoidal matrix
+*
+         NTYPES = 3
+         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
+*
+         IF( TSTCHK ) THEN
+            CALL ZCHKTZ( DOTYPE, NM, MVAL, NN, NVAL, THRESH, TSTERR,
+     $                   A( 1, 1 ), A( 1, 2 ), S( 1 ),
+     $                   B( 1, 1 ), WORK, RWORK, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QP' ) ) THEN
+*
+*        QP:  QR factorization with pivoting
+*
+         NTYPES = 6
+         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
+*
+         IF( TSTCHK ) THEN
+            CALL ZCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL,
+     $                   THRESH, A( 1, 1 ), A( 1, 2 ), S( 1 ),
+     $                   B( 1, 1 ), WORK, RWORK, IWORK,
+     $                   NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN
+*
+*        LS:  Least squares drivers
+*
+         NTYPES = 6
+         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
+*
+         IF( TSTDRV ) THEN
+            CALL ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB,
+     $                   NBVAL, NXVAL, THRESH, TSTERR, A( 1, 1 ),
+     $                   A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ),
+     $                   S( 1 ), S( NMAX+1 ), NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9988 )PATH
+         END IF
+*
+*
+      ELSE IF( LSAMEN( 2, C2, 'QT' ) ) THEN
+*
+*        QT:  QRT routines for general matrices
+*
+         IF( TSTCHK ) THEN
+            CALL ZCHKQRT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+     $                    NBVAL, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QX' ) ) THEN
+*
+*        QX:  QRT routines for triangular-pentagonal matrices
+*
+         IF( TSTCHK ) THEN
+            CALL ZCHKQRTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+     $                     NBVAL, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN
+*
+*        TQ:  LQT routines for general matrices
+*
+         IF( TSTCHK ) THEN
+            CALL ZCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+     $                    NBVAL, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN
+*
+*        XQ:  LQT routines for triangular-pentagonal matrices
+*
+         IF( TSTCHK ) THEN
+            CALL ZCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+     $                     NBVAL, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN
+*
+*        TS:  QR routines for tall-skinny matrices
+*
+         IF( TSTCHK ) THEN
+            CALL ZCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+     $                     NBVAL, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN
+*
+*        HH:  Householder reconstruction for tall-skinny matrices
+*
+         IF( TSTCHK ) THEN
+            CALL ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN,
NVAL, NNB, + $ NBVAL, NOUT ) + ELSE + WRITE( NOUT, FMT = 9989 ) PATH + END IF +* + ELSE +* + WRITE( NOUT, FMT = 9990 )PATH + END IF +* +* Go back to get another input line. +* + GO TO 80 +* +* Branch to this line when the last record is read. +* + 140 CONTINUE + CLOSE ( NIN ) + S2 = DSECND( ) + WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) +* + 9999 FORMAT( / ' Execution not attempted due to input errors' ) + 9998 FORMAT( / ' End of tests' ) + 9997 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) + 9996 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be >=', + $ I6 ) + 9995 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be <=', + $ I6 ) + 9994 FORMAT( ' Tests of the COMPLEX*16 LAPACK routines ', + $ / ' LAPACK VERSION ', I1, '.', I1, '.', I1, + $ / / ' The following parameter values will be used:' ) + 9993 FORMAT( 4X, A4, ': ', 10I6, / 11X, 10I6 ) + 9992 FORMAT( / ' Routines pass computational tests if test ratio is ', + $ 'less than', F8.2, / ) + 9991 FORMAT( ' Relative machine ', A, ' is taken to be', D16.6 ) + 9990 FORMAT( / 1X, A3, ': Unrecognized path name' ) + 9989 FORMAT( / 1X, A3, ' routines were not tested' ) + 9988 FORMAT( / 1X, A3, ' driver routines were not tested' ) +* +* End of ZCHKAA +* + END From 2c7d4a77664ca2657d0ff496fa100557a2813b06 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 19:53:38 +0200 Subject: [PATCH 229/681] Delete cchkaa.f --- lapack-netlib/TESTING/LIN/cchkaa.f | 1220 ---------------------------- 1 file changed, 1220 deletions(-) delete mode 100644 lapack-netlib/TESTING/LIN/cchkaa.f diff --git a/lapack-netlib/TESTING/LIN/cchkaa.f b/lapack-netlib/TESTING/LIN/cchkaa.f deleted file mode 100644 index d36770be7..000000000 --- a/lapack-netlib/TESTING/LIN/cchkaa.f +++ /dev/null @@ -1,1220 +0,0 @@ -*> \brief \b CCHKAA -* -* =========== DOCUMENTATION =========== -* -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -* Definition: -* =========== -* -* PROGRAM CCHKAA -* -* -*> \par Purpose: -* ============= -*> -*> \verbatim -*> -*> CCHKAA is the main test program for the COMPLEX linear equation -*> routines. -*> -*> The program must be driven by a short data file. The first 15 records -*> (not including the first comment line) specify problem dimensions -*> and program options using list-directed input. The remaining lines -*> specify the LAPACK test paths and the number of matrix types to use -*> in testing. 
An annotated example of a data file can be obtained by -*> deleting the first 3 characters from the following 42 lines: -*> Data file for testing COMPLEX LAPACK linear equation routines -*> 7 Number of values of M -*> 0 1 2 3 5 10 16 Values of M (row dimension) -*> 7 Number of values of N -*> 0 1 2 3 5 10 16 Values of N (column dimension) -*> 1 Number of values of NRHS -*> 2 Values of NRHS (number of right hand sides) -*> 5 Number of values of NB -*> 1 3 3 3 20 Values of NB (the blocksize) -*> 1 0 5 9 1 Values of NX (crossover point) -*> 3 Number of values of RANK -*> 30 50 90 Values of rank (as a % of N) -*> 30.0 Threshold value of test ratio -*> T Put T to test the LAPACK routines -*> T Put T to test the driver routines -*> T Put T to test the error exits -*> CGE 11 List types on next line if 0 < NTYPES < 11 -*> CGB 8 List types on next line if 0 < NTYPES < 8 -*> CGT 12 List types on next line if 0 < NTYPES < 12 -*> CPO 9 List types on next line if 0 < NTYPES < 9 -*> CPO 9 List types on next line if 0 < NTYPES < 9 -*> CPP 9 List types on next line if 0 < NTYPES < 9 -*> CPB 8 List types on next line if 0 < NTYPES < 8 -*> CPT 12 List types on next line if 0 < NTYPES < 12 -*> CHE 10 List types on next line if 0 < NTYPES < 10 -*> CHR 10 List types on next line if 0 < NTYPES < 10 -*> CHK 10 List types on next line if 0 < NTYPES < 10 -*> CHA 10 List types on next line if 0 < NTYPES < 10 -*> CH2 10 List types on next line if 0 < NTYPES < 10 -*> CSA 11 List types on next line if 0 < NTYPES < 10 -*> CS2 11 List types on next line if 0 < NTYPES < 10 -*> CHP 10 List types on next line if 0 < NTYPES < 10 -*> CSY 11 List types on next line if 0 < NTYPES < 11 -*> CSK 11 List types on next line if 0 < NTYPES < 11 -*> CSR 11 List types on next line if 0 < NTYPES < 11 -*> CSP 11 List types on next line if 0 < NTYPES < 11 -*> CTR 18 List types on next line if 0 < NTYPES < 18 -*> CTP 18 List types on next line if 0 < NTYPES < 18 -*> CTB 17 List types on next line if 0 < NTYPES < 17 -*> CQR 8 List types on next line if 0 < NTYPES < 8 -*> CRQ 8 List types on next line if 0 < NTYPES < 8 -*> CLQ 8 List types on next line if 0 < NTYPES < 8 -*> CQL 8 List types on next line if 0 < NTYPES < 8 -*> CQP 6 List types on next line if 0 < NTYPES < 6 -*> CTZ 3 List types on next line if 0 < NTYPES < 3 -*> CLS 6 List types on next line if 0 < NTYPES < 6 -*> CEQ -*> CQT -*> CQX -*> CTS -*> CHH -*> \endverbatim -* -* Parameters: -* ========== -* -*> \verbatim -*> NMAX INTEGER -*> The maximum allowable value for M and N. -*> -*> MAXIN INTEGER -*> The number of different values that can be used for each of -*> M, N, NRHS, NB, NX and RANK -*> -*> MAXRHS INTEGER -*> The maximum number of right hand sides -*> -*> MATMAX INTEGER -*> The maximum number of matrix types to use for testing -*> -*> NIN INTEGER -*> The unit number for input -*> -*> NOUT INTEGER -*> The unit number for output -*> \endverbatim -* -* Authors: -* ======== -* -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. -* -*> \date November 2019 -* -*> \ingroup complex_lin -* -* ===================================================================== - PROGRAM CCHKAA -* -* -- LAPACK test routine (version 3.9.0) -- -* -- LAPACK is a software package provided by Univ. of Tennessee, -- -* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2017 -* -* ===================================================================== -* -* .. Parameters .. 
- INTEGER NMAX - PARAMETER ( NMAX = 132 ) - INTEGER MAXIN - PARAMETER ( MAXIN = 12 ) - INTEGER MAXRHS - PARAMETER ( MAXRHS = 16 ) - INTEGER MATMAX - PARAMETER ( MATMAX = 30 ) - INTEGER NIN, NOUT - PARAMETER ( NIN = 5, NOUT = 6 ) - INTEGER KDMAX - PARAMETER ( KDMAX = NMAX+( NMAX+1 ) / 4 ) -* .. -* .. Local Scalars .. - LOGICAL FATAL, TSTCHK, TSTDRV, TSTERR - CHARACTER C1 - CHARACTER*2 C2 - CHARACTER*3 PATH - CHARACTER*10 INTSTR - CHARACTER*72 ALINE - INTEGER I, IC, J, K, LA, LAFAC, LDA, NB, NM, NMATS, NN, - $ NNB, NNB2, NNS, NRHS, NTYPES, NRANK, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - REAL EPS, S1, S2, THREQ, THRESH -* .. -* .. Local Arrays .. - LOGICAL DOTYPE( MATMAX ) - INTEGER IWORK( 25*NMAX ), MVAL( MAXIN ), - $ NBVAL( MAXIN ), NBVAL2( MAXIN ), - $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), - $ RANKVAL( MAXIN ), PIV( NMAX ) - REAL RWORK( 150*NMAX+2*MAXRHS ), S( 2*NMAX ) - COMPLEX A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), WORK( NMAX, NMAX+MAXRHS+10 ) -* .. -* .. External Functions .. - LOGICAL LSAME, LSAMEN - REAL SECOND, SLAMCH - EXTERNAL LSAME, LSAMEN, SECOND, SLAMCH -* .. -* .. External Subroutines .. - EXTERNAL ALAREQ, CCHKEQ, CCHKGB, CCHKGE, CCHKGT, CCHKHE, - $ CCHKHE_ROOK, CCHKHE_RK, CCHKHE_AA, CCHKHP, - $ CCHKLQ, CCHKUNHR_COL, CCHKPB, CCHKPO, CCHKPS, - $ CCHKPP, CCHKPT, CCHKQ3, CCHKQL, CCHKQR, CCHKRQ, - $ CCHKSP, CCHKSY, CCHKSY_ROOK, CCHKSY_RK, - $ CCHKSY_AA, CCHKTB, CCHKTP, CCHKTR, CCHKTZ, - $ CDRVGB, CDRVGE, CDRVGT, CDRVHE, CDRVHE_ROOK, - $ CDRVHE_RK, CDRVHE_AA, CDRVHP, CDRVLS, CDRVPB, - $ CDRVPO, CDRVPP, CDRVPT, CDRVSP, CDRVSY, - $ CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA, ILAVER, - $ CCHKQRT, CCHKQRTP -* .. -* .. Scalars in Common .. - LOGICAL LERR, OK - CHARACTER*32 SRNAMT - INTEGER INFOT, NUNIT -* .. -* .. Arrays in Common .. - INTEGER IPARMS( 100 ) -* .. -* .. Common blocks .. - COMMON / CLAENV / IPARMS - COMMON / INFOC / INFOT, NUNIT, OK, LERR - COMMON / SRNAMC / SRNAMT -* .. -* .. Data statements .. - DATA THREQ / 2.0 / , INTSTR / '0123456789' / -* .. -* .. Executable Statements .. -* - S1 = SECOND( ) - LDA = NMAX - FATAL = .FALSE. -* -* Read a dummy line. -* - READ( NIN, FMT = * ) -* -* Report values of parameters. -* - CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) - WRITE( NOUT, FMT = 9994 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH -* -* Read the values of M -* - READ( NIN, FMT = * )NM - IF( NM.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NM ', NM, 1 - NM = 0 - FATAL = .TRUE. - ELSE IF( NM.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NM ', NM, MAXIN - NM = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( MVAL( I ), I = 1, NM ) - DO 10 I = 1, NM - IF( MVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' M ', MVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( MVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9995 )' M ', MVAL( I ), NMAX - FATAL = .TRUE. - END IF - 10 CONTINUE - IF( NM.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'M ', ( MVAL( I ), I = 1, NM ) -* -* Read the values of N -* - READ( NIN, FMT = * )NN - IF( NN.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NN ', NN, 1 - NN = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NN ', NN, MAXIN - NN = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) - DO 20 I = 1, NN - IF( NVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' N ', NVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9995 )' N ', NVAL( I ), NMAX - FATAL = .TRUE. 
- END IF - 20 CONTINUE - IF( NN.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'N ', ( NVAL( I ), I = 1, NN ) -* -* Read the values of NRHS -* - READ( NIN, FMT = * )NNS - IF( NNS.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NNS', NNS, 1 - NNS = 0 - FATAL = .TRUE. - ELSE IF( NNS.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NNS', NNS, MAXIN - NNS = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NSVAL( I ), I = 1, NNS ) - DO 30 I = 1, NNS - IF( NSVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )'NRHS', NSVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NSVAL( I ).GT.MAXRHS ) THEN - WRITE( NOUT, FMT = 9995 )'NRHS', NSVAL( I ), MAXRHS - FATAL = .TRUE. - END IF - 30 CONTINUE - IF( NNS.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NRHS', ( NSVAL( I ), I = 1, NNS ) -* -* Read the values of NB -* - READ( NIN, FMT = * )NNB - IF( NNB.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )'NNB ', NNB, 1 - NNB = 0 - FATAL = .TRUE. - ELSE IF( NNB.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )'NNB ', NNB, MAXIN - NNB = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) - DO 40 I = 1, NNB - IF( NBVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' NB ', NBVAL( I ), 0 - FATAL = .TRUE. - END IF - 40 CONTINUE - IF( NNB.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NB ', ( NBVAL( I ), I = 1, NNB ) -* -* Set NBVAL2 to be the set of unique values of NB -* - NNB2 = 0 - DO 60 I = 1, NNB - NB = NBVAL( I ) - DO 50 J = 1, NNB2 - IF( NB.EQ.NBVAL2( J ) ) - $ GO TO 60 - 50 CONTINUE - NNB2 = NNB2 + 1 - NBVAL2( NNB2 ) = NB - 60 CONTINUE -* -* Read the values of NX -* - READ( NIN, FMT = * )( NXVAL( I ), I = 1, NNB ) - DO 70 I = 1, NNB - IF( NXVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' NX ', NXVAL( I ), 0 - FATAL = .TRUE. - END IF - 70 CONTINUE - IF( NNB.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NX ', ( NXVAL( I ), I = 1, NNB ) -* -* Read the values of RANKVAL -* - READ( NIN, FMT = * )NRANK - IF( NN.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NRANK ', NRANK, 1 - NRANK = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NRANK ', NRANK, MAXIN - NRANK = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( RANKVAL( I ), I = 1, NRANK ) - DO I = 1, NRANK - IF( RANKVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' RANK ', RANKVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( RANKVAL( I ).GT.100 ) THEN - WRITE( NOUT, FMT = 9995 )' RANK ', RANKVAL( I ), 100 - FATAL = .TRUE. - END IF - END DO - IF( NRANK.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'RANK % OF N', - $ ( RANKVAL( I ), I = 1, NRANK ) -* -* Read the threshold value for the test ratios. -* - READ( NIN, FMT = * )THRESH - WRITE( NOUT, FMT = 9992 )THRESH -* -* Read the flag that indicates whether to test the LAPACK routines. -* - READ( NIN, FMT = * )TSTCHK -* -* Read the flag that indicates whether to test the driver routines. -* - READ( NIN, FMT = * )TSTDRV -* -* Read the flag that indicates whether to test the error exits. -* - READ( NIN, FMT = * )TSTERR -* - IF( FATAL ) THEN - WRITE( NOUT, FMT = 9999 ) - STOP - END IF -* -* Calculate and print the machine dependent constants. -* - EPS = SLAMCH( 'Underflow threshold' ) - WRITE( NOUT, FMT = 9991 )'underflow', EPS - EPS = SLAMCH( 'Overflow threshold' ) - WRITE( NOUT, FMT = 9991 )'overflow ', EPS - EPS = SLAMCH( 'Epsilon' ) - WRITE( NOUT, FMT = 9991 )'precision', EPS - WRITE( NOUT, FMT = * ) - NRHS = NSVAL( 1 ) -* - 80 CONTINUE -* -* Read a test path and the number of matrix types to use. 
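For reference, the statements that follow this comment hand-roll an integer parser: scan ALINE past column 3, skip blanks, then accumulate decimal digits by looking each character up in INTSTR. A minimal standalone sketch of the same idiom, using the hypothetical input record 'CGE   11'; illustrative only, not part of the deleted file:

      PROGRAM PARSEX
*     Split a record such as 'CGE   11' into the 3-character path
*     name and the matrix-type count, the way the *CHKAA drivers do
*     (simplified: NMATS is cleared up front rather than on the first
*     nonblank character).
      CHARACTER*72       ALINE
      CHARACTER*3        PATH
      CHARACTER*10       INTSTR
      INTEGER            I, IC, K, NMATS
      INTSTR = '0123456789'
      ALINE = 'CGE   11'
      PATH = ALINE( 1: 3 )
      NMATS = 0
      I = 3
*     Skip blanks between the path name and the count.
   90 CONTINUE
      I = I + 1
      IF( I.GT.72 )
     $   GO TO 130
      IF( ALINE( I: I ).EQ.' ' )
     $   GO TO 90
*     Accumulate digits until a non-digit or column 72 is reached.
  100 CONTINUE
      DO 110 K = 1, 10
         IF( ALINE( I: I ).EQ.INTSTR( K: K ) ) THEN
            IC = K - 1
            GO TO 120
         END IF
  110 CONTINUE
      GO TO 130
  120 CONTINUE
      NMATS = NMATS*10 + IC
      I = I + 1
      IF( I.GT.72 )
     $   GO TO 130
      GO TO 100
  130 CONTINUE
*     Prints: PATH = CGE  NMATS = 11
      WRITE( *, * ) 'PATH = ', PATH, '  NMATS = ', NMATS
      END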
-* - READ( NIN, FMT = '(A72)', END = 140 )ALINE - PATH = ALINE( 1: 3 ) - NMATS = MATMAX - I = 3 - 90 CONTINUE - I = I + 1 - IF( I.GT.72 ) - $ GO TO 130 - IF( ALINE( I: I ).EQ.' ' ) - $ GO TO 90 - NMATS = 0 - 100 CONTINUE - C1 = ALINE( I: I ) - DO 110 K = 1, 10 - IF( C1.EQ.INTSTR( K: K ) ) THEN - IC = K - 1 - GO TO 120 - END IF - 110 CONTINUE - GO TO 130 - 120 CONTINUE - NMATS = NMATS*10 + IC - I = I + 1 - IF( I.GT.72 ) - $ GO TO 130 - GO TO 100 - 130 CONTINUE - C1 = PATH( 1: 1 ) - C2 = PATH( 2: 3 ) -* -* Check first character for correct precision. -* - IF( .NOT.LSAME( C1, 'Complex precision' ) ) THEN - WRITE( NOUT, FMT = 9990 )PATH -* - ELSE IF( NMATS.LE.0 ) THEN -* -* Check for a positive number of tests requested. -* - WRITE( NOUT, FMT = 9989 )PATH -* - ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN -* -* GE: general matrices -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKGE( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVGE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN -* -* GB: general banded matrices -* - LA = ( 2*KDMAX+1 )*NMAX - LAFAC = ( 3*KDMAX+1 )*NMAX - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKGB( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, A( 1, 1 ), LA, - $ A( 1, 3 ), LAFAC, B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVGB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), LA, A( 1, 3 ), LAFAC, A( 1, 6 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'GT' ) ) THEN -* -* GT: general tridiagonal matrices -* - NTYPES = 12 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKGT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PO' ) ) THEN -* -* PO: positive definite matrices -* - NTYPES = 9 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKPO( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVPO( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PS' ) ) THEN -* -* PS: positive semi-definite matrices -* - NTYPES = 9 -* - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK 
) THEN - CALL CCHKPS( DOTYPE, NN, NVAL, NNB2, NBVAL2, NRANK, - $ RANKVAL, THRESH, TSTERR, LDA, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), PIV, WORK, RWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PP' ) ) THEN -* -* PP: positive definite packed matrices -* - NTYPES = 9 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKPP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVPP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PB' ) ) THEN -* -* PB: positive definite banded matrices -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKPB( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVPB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PT' ) ) THEN -* -* PT: positive definite tridiagonal matrices -* - NTYPES = 12 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ A( 1, 1 ), S, A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), S, A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HE' ) ) THEN -* -* HE: Hermitian indefinite matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKHE( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVHE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HR' ) ) THEN -* -* HR: Hermitian indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKHE_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVHE_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HK' ) ) THEN -* -* 
HK: Hermitian indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm, -* different matrix storage format than HR path version. -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKHE_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVHE_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HA' ) ) THEN -* -* HA: Hermitian matrices, -* Aasen Algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKHE_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVHE_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN -* -* H2: Hermitian matrices, -* with partial (Aasen's) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, - $ NNS, NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVHE_AA_2STAGE( - $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN -* -* HP: Hermitian indefinite packed matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKHP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVHP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SY' ) ) THEN -* -* SY: symmetric indefinite matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKSY( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVSY( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SR' ) ) THEN -* -* SR: 
symmetric indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKSY_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVSY_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SK' ) ) THEN -* -* SK: symmetric indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm, -* different matrix storage format than SR path version. -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKSY_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVSY_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN -* -* SA: symmetric indefinite matrices with Aasen's algorithm, -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKSY_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVSY_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN -* -* S2: symmetric indefinite matrices with Aasen's algorithm -* 2 stage -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVSY_AA_2STAGE( - $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN -* -* SP: symmetric indefinite packed matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKSP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL CDRVSP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TR' ) ) 
THEN -* -* TR: triangular matrices -* - NTYPES = 18 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKTR( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN -* -* TP: triangular packed matrices -* - NTYPES = 18 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKTP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN -* -* TB: triangular banded matrices -* - NTYPES = 17 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKTB( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QR' ) ) THEN -* -* QR: QR factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKQR( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'LQ' ) ) THEN -* -* LQ: LQ factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKLQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QL' ) ) THEN -* -* QL: QL factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKQL( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'RQ' ) ) THEN -* -* RQ: RQ factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKRQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'EQ' ) ) THEN -* -* EQ: Equilibration routines for general and positive definite -* matrices (THREQ should be between 2 and 10) -* - IF( TSTCHK ) THEN - CALL CCHKEQ( THREQ, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN -* -* TZ: Trapezoidal matrix -* - NTYPES = 3 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKTZ( DOTYPE, NM, MVAL, NN, NVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), S( 1 ), - $ B( 1, 1 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QP' ) ) THEN -* -* QP: QR factorization with pivoting -* - NTYPES 
= 6 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL CCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ THRESH, A( 1, 1 ), A( 1, 2 ), S( 1 ), - $ B( 1, 1 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN -* -* LS: Least squares drivers -* - NTYPES = 6 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTDRV ) THEN - CALL CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, - $ NBVAL, NXVAL, THRESH, TSTERR, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ S( 1 ), S( NMAX+1 ), NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QT' ) ) THEN -* -* QT: QRT routines for general matrices -* - IF( TSTCHK ) THEN - CALL CCHKQRT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QX' ) ) THEN -* -* QX: QRT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL CCHKQRTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN -* -* TQ: LQT routines for general matrices -* - IF( TSTCHK ) THEN - CALL CCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN -* -* XQ: LQT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL CCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN -* -* TS: QR routines for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL CCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN -* -* HH: Householder reconstruction for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 ) PATH - END IF -* - ELSE -* - WRITE( NOUT, FMT = 9990 )PATH - END IF -* -* Go back to get another input line. -* - GO TO 80 -* -* Branch to this line when the last record is read. 
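For reference, the driver's outer control flow reduces to a read-until-EOF loop between labels 80 and 140: each pass reads one 72-column record, takes columns 1-3 as the test path, and dispatches on it. A minimal runnable sketch of that skeleton; illustrative only, not part of the deleted file:

      PROGRAM DRVLP
*     Skeleton of the CCHKAA main loop: read records until end of
*     file (END = 140), extract the path name, dispatch, repeat.
      CHARACTER*72       ALINE
      CHARACTER*3        PATH
   80 CONTINUE
      READ( *, FMT = '(A72)', END = 140 )ALINE
      PATH = ALINE( 1: 3 )
      WRITE( *, * ) 'would dispatch path ', PATH
      GO TO 80
  140 CONTINUE
      WRITE( *, * ) 'End of tests'
      END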
-* - 140 CONTINUE - CLOSE ( NIN ) - S2 = SECOND( ) - WRITE( NOUT, FMT = 9998 ) - WRITE( NOUT, FMT = 9997 )S2 - S1 -* - 9999 FORMAT( / ' Execution not attempted due to input errors' ) - 9998 FORMAT( / ' End of tests' ) - 9997 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) - 9996 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be >=', - $ I6 ) - 9995 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be <=', - $ I6 ) - 9994 FORMAT( ' Tests of the COMPLEX LAPACK routines ', - $ / ' LAPACK VERSION ', I1, '.', I1, '.', I1, - $ / / ' The following parameter values will be used:' ) - 9993 FORMAT( 4X, A4, ': ', 10I6, / 11X, 10I6 ) - 9992 FORMAT( / ' Routines pass computational tests if test ratio is ', - $ 'less than', F8.2, / ) - 9991 FORMAT( ' Relative machine ', A, ' is taken to be', E16.6 ) - 9990 FORMAT( / 1X, A3, ': Unrecognized path name' ) - 9989 FORMAT( / 1X, A3, ' routines were not tested' ) - 9988 FORMAT( / 1X, A3, ' driver routines were not tested' ) -* -* End of CCHKAA -* - END From 93cc066921f97b9d593b2c8fa258b54f34fb5510 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 19:54:13 +0200 Subject: [PATCH 230/681] Delete dchkaa.f --- lapack-netlib/TESTING/LIN/dchkaa.f | 1063 ---------------------------- 1 file changed, 1063 deletions(-) delete mode 100644 lapack-netlib/TESTING/LIN/dchkaa.f diff --git a/lapack-netlib/TESTING/LIN/dchkaa.f b/lapack-netlib/TESTING/LIN/dchkaa.f deleted file mode 100644 index 03575c4d1..000000000 --- a/lapack-netlib/TESTING/LIN/dchkaa.f +++ /dev/null @@ -1,1063 +0,0 @@ -*> \brief \b DCHKAA -* -* =========== DOCUMENTATION =========== -* -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -* Definition: -* =========== -* -* PROGRAM DCHKAA -* -* -*> \par Purpose: -* ============= -*> -*> \verbatim -*> -*> DCHKAA is the main test program for the DOUBLE PRECISION LAPACK -*> linear equation routines -*> -*> The program must be driven by a short data file. The first 15 records -*> (not including the first comment line) specify problem dimensions -*> and program options using list-directed input. The remaining lines -*> specify the LAPACK test paths and the number of matrix types to use -*> in testing. An annotated example of a data file can be obtained by -*> deleting the first 3 characters from the following 40 lines: -*> Data file for testing DOUBLE PRECISION LAPACK linear eqn. 
routines
-*> 7                      Number of values of M
-*> 0 1 2 3 5 10 16        Values of M (row dimension)
-*> 7                      Number of values of N
-*> 0 1 2 3 5 10 16        Values of N (column dimension)
-*> 1                      Number of values of NRHS
-*> 2                      Values of NRHS (number of right hand sides)
-*> 5                      Number of values of NB
-*> 1 3 3 3 20             Values of NB (the blocksize)
-*> 1 0 5 9 1              Values of NX (crossover point)
-*> 3                      Number of values of RANK
-*> 30 50 90               Values of rank (as a % of N)
-*> 20.0                   Threshold value of test ratio
-*> T                      Put T to test the LAPACK routines
-*> T                      Put T to test the driver routines
-*> T                      Put T to test the error exits
-*> DGE   11               List types on next line if 0 < NTYPES < 11
-*> DGB    8               List types on next line if 0 < NTYPES <  8
-*> DGT   12               List types on next line if 0 < NTYPES < 12
-*> DPO    9               List types on next line if 0 < NTYPES <  9
-*> DPS    9               List types on next line if 0 < NTYPES <  9
-*> DPP    9               List types on next line if 0 < NTYPES <  9
-*> DPB    8               List types on next line if 0 < NTYPES <  8
-*> DPT   12               List types on next line if 0 < NTYPES < 12
-*> DSY   10               List types on next line if 0 < NTYPES < 10
-*> DSR   10               List types on next line if 0 < NTYPES < 10
-*> DSK   10               List types on next line if 0 < NTYPES < 10
-*> DSA   10               List types on next line if 0 < NTYPES < 10
-*> DS2   10               List types on next line if 0 < NTYPES < 10
-*> DSP   10               List types on next line if 0 < NTYPES < 10
-*> DTR   18               List types on next line if 0 < NTYPES < 18
-*> DTP   18               List types on next line if 0 < NTYPES < 18
-*> DTB   17               List types on next line if 0 < NTYPES < 17
-*> DQR    8               List types on next line if 0 < NTYPES <  8
-*> DRQ    8               List types on next line if 0 < NTYPES <  8
-*> DLQ    8               List types on next line if 0 < NTYPES <  8
-*> DQL    8               List types on next line if 0 < NTYPES <  8
-*> DQP    6               List types on next line if 0 < NTYPES <  6
-*> DTZ    3               List types on next line if 0 < NTYPES <  3
-*> DLS    6               List types on next line if 0 < NTYPES <  6
-*> DEQ
-*> DQT
-*> DQX
-*> DTQ
-*> DXQ
-*> DTS
-*> DHH
-*> \endverbatim
-*
-*  Parameters:
-*  ==========
-*
-*> \verbatim
-*>  NMAX    INTEGER
-*>          The maximum allowable value for M and N.
-*>
-*>  MAXIN   INTEGER
-*>          The number of different values that can be used for each of
-*>          M, N, NRHS, NB, NX and RANK
-*>
-*>  MAXRHS  INTEGER
-*>          The maximum number of right hand sides
-*>
-*>  MATMAX  INTEGER
-*>          The maximum number of matrix types to use for testing
-*>
-*>  NIN     INTEGER
-*>          The unit number for input
-*>
-*>  NOUT    INTEGER
-*>          The unit number for output
-*> \endverbatim
-*
-*  Authors:
-*  ========
-*
-*> \author Univ. of Tennessee
-*> \author Univ. of California Berkeley
-*> \author Univ. of Colorado Denver
-*> \author NAG Ltd.
-*
-*> \date November 2019
-*
-*> \ingroup double_lin
-*
-*  =====================================================================
-      PROGRAM DCHKAA
-*
-*  -- LAPACK test routine (version 3.9.0) --
-*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
-*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     November 2019
-*
-*  =====================================================================
-*
-*     .. Parameters ..
-      INTEGER            NMAX
-      PARAMETER          ( NMAX = 132 )
-      INTEGER            MAXIN
-      PARAMETER          ( MAXIN = 12 )
-      INTEGER            MAXRHS
-      PARAMETER          ( MAXRHS = 16 )
-      INTEGER            MATMAX
-      PARAMETER          ( MATMAX = 30 )
-      INTEGER            NIN, NOUT
-      PARAMETER          ( NIN = 5, NOUT = 6 )
-      INTEGER            KDMAX
-      PARAMETER          ( KDMAX = NMAX+( NMAX+1 ) / 4 )
-*     ..
-*     .. Local Scalars ..
- LOGICAL FATAL, TSTCHK, TSTDRV, TSTERR - CHARACTER C1 - CHARACTER*2 C2 - CHARACTER*3 PATH - CHARACTER*10 INTSTR - CHARACTER*72 ALINE - INTEGER I, IC, J, K, LA, LAFAC, LDA, NB, NM, NMATS, NN, - $ NNB, NNB2, NNS, NRHS, NTYPES, NRANK, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - DOUBLE PRECISION EPS, S1, S2, THREQ, THRESH -* .. -* .. Local Arrays .. - LOGICAL DOTYPE( MATMAX ) - INTEGER IWORK( 25*NMAX ), MVAL( MAXIN ), - $ NBVAL( MAXIN ), NBVAL2( MAXIN ), - $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), - $ RANKVAL( MAXIN ), PIV( NMAX ) - DOUBLE PRECISION A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), RWORK( 5*NMAX+2*MAXRHS ), - $ S( 2*NMAX ), WORK( NMAX, 3*NMAX+MAXRHS+30 ) -* .. -* .. External Functions .. - LOGICAL LSAME, LSAMEN - DOUBLE PRECISION DLAMCH, DSECND - EXTERNAL LSAME, LSAMEN, DLAMCH, DSECND -* .. -* .. External Subroutines .. - EXTERNAL ALAREQ, DCHKEQ, DCHKGB, DCHKGE, DCHKGT, DCHKLQ, - $ DCHKORHR_COL, DCHKPB, DCHKPO, DCHKPS, DCHKPP, - $ DCHKPT, DCHKQ3, DCHKQL, DCHKQR, DCHKRQ, DCHKSP, - $ DCHKSY, DCHKSY_ROOK, DCHKSY_RK, DCHKSY_AA, - $ DCHKTB, DCHKTP, DCHKTR, DCHKTZ, DDRVGB, DDRVGE, - $ DDRVGT, DDRVLS, DDRVPB, DDRVPO, DDRVPP, DDRVPT, - $ DDRVSP, DDRVSY, DDRVSY_ROOK, DDRVSY_RK, - $ DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT, DCHKQRTP, - $ DCHKLQT,DCHKTSQR -* .. -* .. Scalars in Common .. - LOGICAL LERR, OK - CHARACTER*32 SRNAMT - INTEGER INFOT, NUNIT -* .. -* .. Arrays in Common .. - INTEGER IPARMS( 100 ) -* .. -* .. Common blocks .. - COMMON / INFOC / INFOT, NUNIT, OK, LERR - COMMON / SRNAMC / SRNAMT - COMMON / CLAENV / IPARMS -* .. -* .. Data statements .. - DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / -* .. -* .. Executable Statements .. -* - S1 = DSECND( ) - LDA = NMAX - FATAL = .FALSE. -* -* Read a dummy line. -* - READ( NIN, FMT = * ) -* -* Report values of parameters. -* - CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) - WRITE( NOUT, FMT = 9994 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH -* -* Read the values of M -* - READ( NIN, FMT = * )NM - IF( NM.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NM ', NM, 1 - NM = 0 - FATAL = .TRUE. - ELSE IF( NM.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NM ', NM, MAXIN - NM = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( MVAL( I ), I = 1, NM ) - DO 10 I = 1, NM - IF( MVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' M ', MVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( MVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9995 )' M ', MVAL( I ), NMAX - FATAL = .TRUE. - END IF - 10 CONTINUE - IF( NM.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'M ', ( MVAL( I ), I = 1, NM ) -* -* Read the values of N -* - READ( NIN, FMT = * )NN - IF( NN.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NN ', NN, 1 - NN = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NN ', NN, MAXIN - NN = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) - DO 20 I = 1, NN - IF( NVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' N ', NVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9995 )' N ', NVAL( I ), NMAX - FATAL = .TRUE. - END IF - 20 CONTINUE - IF( NN.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'N ', ( NVAL( I ), I = 1, NN ) -* -* Read the values of NRHS -* - READ( NIN, FMT = * )NNS - IF( NNS.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NNS', NNS, 1 - NNS = 0 - FATAL = .TRUE. - ELSE IF( NNS.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NNS', NNS, MAXIN - NNS = 0 - FATAL = .TRUE. 
- END IF - READ( NIN, FMT = * )( NSVAL( I ), I = 1, NNS ) - DO 30 I = 1, NNS - IF( NSVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )'NRHS', NSVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NSVAL( I ).GT.MAXRHS ) THEN - WRITE( NOUT, FMT = 9995 )'NRHS', NSVAL( I ), MAXRHS - FATAL = .TRUE. - END IF - 30 CONTINUE - IF( NNS.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NRHS', ( NSVAL( I ), I = 1, NNS ) -* -* Read the values of NB -* - READ( NIN, FMT = * )NNB - IF( NNB.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )'NNB ', NNB, 1 - NNB = 0 - FATAL = .TRUE. - ELSE IF( NNB.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )'NNB ', NNB, MAXIN - NNB = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) - DO 40 I = 1, NNB - IF( NBVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' NB ', NBVAL( I ), 0 - FATAL = .TRUE. - END IF - 40 CONTINUE - IF( NNB.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NB ', ( NBVAL( I ), I = 1, NNB ) -* -* Set NBVAL2 to be the set of unique values of NB -* - NNB2 = 0 - DO 60 I = 1, NNB - NB = NBVAL( I ) - DO 50 J = 1, NNB2 - IF( NB.EQ.NBVAL2( J ) ) - $ GO TO 60 - 50 CONTINUE - NNB2 = NNB2 + 1 - NBVAL2( NNB2 ) = NB - 60 CONTINUE -* -* Read the values of NX -* - READ( NIN, FMT = * )( NXVAL( I ), I = 1, NNB ) - DO 70 I = 1, NNB - IF( NXVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' NX ', NXVAL( I ), 0 - FATAL = .TRUE. - END IF - 70 CONTINUE - IF( NNB.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NX ', ( NXVAL( I ), I = 1, NNB ) -* -* Read the values of RANKVAL -* - READ( NIN, FMT = * )NRANK - IF( NN.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NRANK ', NRANK, 1 - NRANK = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NRANK ', NRANK, MAXIN - NRANK = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( RANKVAL( I ), I = 1, NRANK ) - DO I = 1, NRANK - IF( RANKVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' RANK ', RANKVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( RANKVAL( I ).GT.100 ) THEN - WRITE( NOUT, FMT = 9995 )' RANK ', RANKVAL( I ), 100 - FATAL = .TRUE. - END IF - END DO - IF( NRANK.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'RANK % OF N', - $ ( RANKVAL( I ), I = 1, NRANK ) -* -* Read the threshold value for the test ratios. -* - READ( NIN, FMT = * )THRESH - WRITE( NOUT, FMT = 9992 )THRESH -* -* Read the flag that indicates whether to test the LAPACK routines. -* - READ( NIN, FMT = * )TSTCHK -* -* Read the flag that indicates whether to test the driver routines. -* - READ( NIN, FMT = * )TSTDRV -* -* Read the flag that indicates whether to test the error exits. -* - READ( NIN, FMT = * )TSTERR -* - IF( FATAL ) THEN - WRITE( NOUT, FMT = 9999 ) - STOP - END IF -* -* Calculate and print the machine dependent constants. -* - EPS = DLAMCH( 'Underflow threshold' ) - WRITE( NOUT, FMT = 9991 )'underflow', EPS - EPS = DLAMCH( 'Overflow threshold' ) - WRITE( NOUT, FMT = 9991 )'overflow ', EPS - EPS = DLAMCH( 'Epsilon' ) - WRITE( NOUT, FMT = 9991 )'precision', EPS - WRITE( NOUT, FMT = * ) -* - 80 CONTINUE -* -* Read a test path and the number of matrix types to use. -* - READ( NIN, FMT = '(A72)', END = 140 )ALINE - PATH = ALINE( 1: 3 ) - NMATS = MATMAX - I = 3 - 90 CONTINUE - I = I + 1 - IF( I.GT.72 ) THEN - NMATS = MATMAX - GO TO 130 - END IF - IF( ALINE( I: I ).EQ.' 
' ) - $ GO TO 90 - NMATS = 0 - 100 CONTINUE - C1 = ALINE( I: I ) - DO 110 K = 1, 10 - IF( C1.EQ.INTSTR( K: K ) ) THEN - IC = K - 1 - GO TO 120 - END IF - 110 CONTINUE - GO TO 130 - 120 CONTINUE - NMATS = NMATS*10 + IC - I = I + 1 - IF( I.GT.72 ) - $ GO TO 130 - GO TO 100 - 130 CONTINUE - C1 = PATH( 1: 1 ) - C2 = PATH( 2: 3 ) - NRHS = NSVAL( 1 ) -* -* Check first character for correct precision. -* - IF( .NOT.LSAME( C1, 'Double precision' ) ) THEN - WRITE( NOUT, FMT = 9990 )PATH -* - ELSE IF( NMATS.LE.0 ) THEN -* -* Check for a positive number of tests requested. -* - WRITE( NOUT, FMT = 9989 )PATH -* - ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN -* -* GE: general matrices -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKGE( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVGE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN -* -* GB: general banded matrices -* - LA = ( 2*KDMAX+1 )*NMAX - LAFAC = ( 3*KDMAX+1 )*NMAX - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKGB( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, A( 1, 1 ), LA, - $ A( 1, 3 ), LAFAC, B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVGB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), LA, A( 1, 3 ), LAFAC, A( 1, 6 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'GT' ) ) THEN -* -* GT: general tridiagonal matrices -* - NTYPES = 12 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKGT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PO' ) ) THEN -* -* PO: positive definite matrices -* - NTYPES = 9 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKPO( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVPO( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PS' ) ) THEN -* -* PS: positive semi-definite matrices -* - NTYPES = 9 -* - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKPS( DOTYPE, NN, NVAL, NNB2, NBVAL2, NRANK, - $ RANKVAL, THRESH, TSTERR, LDA, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), PIV, WORK, 
RWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PP' ) ) THEN -* -* PP: positive definite packed matrices -* - NTYPES = 9 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKPP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVPP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PB' ) ) THEN -* -* PB: positive definite banded matrices -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKPB( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVPB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PT' ) ) THEN -* -* PT: positive definite tridiagonal matrices -* - NTYPES = 12 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SY' ) ) THEN -* -* SY: symmetric indefinite matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKSY( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVSY( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SR' ) ) THEN -* -* SR: symmetric indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKSY_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL DDRVSY_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SK' ) ) THEN -* -* SK: symmetric indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm, -* 
different matrix storage format than SR path version.
-*
-         NTYPES = 10
-         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
-*
-         IF( TSTCHK ) THEN
-            CALL DCHKSY_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL,
-     $                      THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ),
-     $                      E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ),
-     $                      B( 1, 3 ), WORK, RWORK, IWORK, NOUT )
-         ELSE
-            WRITE( NOUT, FMT = 9989 )PATH
-         END IF
-*
-         IF( TSTDRV ) THEN
-            CALL DDRVSY_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR,
-     $                      LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ),
-     $                      B( 1, 1 ), B( 1, 2 ), B( 1, 3 ),
-     $                      WORK, RWORK, IWORK, NOUT )
-         ELSE
-            WRITE( NOUT, FMT = 9988 )PATH
-         END IF
-*
-      ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN
-*
-*        SA:  symmetric indefinite matrices,
-*             with partial (Aasen's) pivoting algorithm
-*
-         NTYPES = 10
-         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
-*
-         IF( TSTCHK ) THEN
-            CALL DCHKSY_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS,
-     $                      NSVAL, THRESH, TSTERR, LDA,
-     $                      A( 1, 1 ), A( 1, 2 ), A( 1, 3 ),
-     $                      B( 1, 1 ), B( 1, 2 ), B( 1, 3 ),
-     $                      WORK, RWORK, IWORK, NOUT )
-         ELSE
-            WRITE( NOUT, FMT = 9989 )PATH
-         END IF
-*
-         IF( TSTDRV ) THEN
-            CALL DDRVSY_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR,
-     $                      LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ),
-     $                      B( 1, 1 ), B( 1, 2 ), B( 1, 3 ),
-     $                      WORK, RWORK, IWORK, NOUT )
-         ELSE
-            WRITE( NOUT, FMT = 9988 )PATH
-         END IF
-*
-*
-      ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN
-*
-*        S2:  symmetric indefinite matrices with Aasen's algorithm,
-*             2 stage
-*
-         NTYPES = 10
-         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
-*
-         IF( TSTCHK ) THEN
-            CALL DCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2,
-     $                      NNS, NSVAL, THRESH, TSTERR, LDA,
-     $                      A( 1, 1 ), A( 1, 2 ), A( 1, 3 ),
-     $                      B( 1, 1 ), B( 1, 2 ), B( 1, 3 ),
-     $                      WORK, RWORK, IWORK, NOUT )
-         ELSE
-            WRITE( NOUT, FMT = 9989 )PATH
-         END IF
-*
-         IF( TSTDRV ) THEN
-            CALL DDRVSY_AA_2STAGE(
-     $                      DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR,
-     $                      LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ),
-     $                      B( 1, 1 ), B( 1, 2 ), B( 1, 3 ),
-     $                      WORK, RWORK, IWORK, NOUT )
-         ELSE
-            WRITE( NOUT, FMT = 9988 )PATH
-         END IF
-*
-*
-      ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
-*
-*        SP:  symmetric indefinite packed matrices,
-*             with partial (Bunch-Kaufman) pivoting algorithm
-*
-         NTYPES = 10
-         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
-*
-         IF( TSTCHK ) THEN
-            CALL DCHKSP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR,
-     $                   LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ),
-     $                   B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK,
-     $                   IWORK, NOUT )
-         ELSE
-            WRITE( NOUT, FMT = 9989 )PATH
-         END IF
-*
-         IF( TSTDRV ) THEN
-            CALL DDRVSP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA,
-     $                   A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ),
-     $                   B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK,
-     $                   NOUT )
-         ELSE
-            WRITE( NOUT, FMT = 9988 )PATH
-         END IF
-*
-      ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN
-*
-*        TR:  triangular matrices
-*
-         NTYPES = 18
-         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
-*
-         IF( TSTCHK ) THEN
-            CALL DCHKTR( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL,
-     $                   THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ),
-     $                   B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK,
-     $                   IWORK, NOUT )
-         ELSE
-            WRITE( NOUT, FMT = 9989 )PATH
-         END IF
-*
-      ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN
-*
-*        TP:  triangular packed matrices
-*
-         NTYPES = 18
-         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
-*
-         IF( TSTCHK ) THEN
-            CALL DCHKTP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR,
-     $                   LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ),
-     $                   B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK,
-     $                   NOUT )
-         ELSE
-            WRITE( NOUT, FMT = 9989 )PATH
-         END IF
-*
-      ELSE IF( LSAMEN(
2, C2, 'TB' ) ) THEN -* -* TB: triangular banded matrices -* - NTYPES = 17 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKTB( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QR' ) ) THEN -* -* QR: QR factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKQR( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'LQ' ) ) THEN -* -* LQ: LQ factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKLQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QL' ) ) THEN -* -* QL: QL factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKQL( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'RQ' ) ) THEN -* -* RQ: RQ factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKRQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QP' ) ) THEN -* -* QP: QR factorization with pivoting -* - NTYPES = 6 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ THRESH, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 3 ), WORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN -* -* TZ: Trapezoidal matrix -* - NTYPES = 3 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL DCHKTZ( DOTYPE, NM, MVAL, NN, NVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 3 ), WORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN -* -* LS: Least squares drivers -* - NTYPES = 6 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTDRV ) THEN - CALL DDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, - $ NBVAL, NXVAL, THRESH, TSTERR, A( 1, 1 ), - $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ RWORK, RWORK( NMAX+1 ), NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'EQ' ) ) THEN -* -* EQ: Equilibration routines for general and positive definite -* matrices (THREQ should be between 2 and 10) -* - IF( TSTCHK ) THEN - CALL DCHKEQ( THREQ, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QT' ) ) THEN -* -* QT: 
QRT routines for general matrices -* - IF( TSTCHK ) THEN - CALL DCHKQRT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QX' ) ) THEN -* -* QX: QRT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL DCHKQRTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN -* -* TQ: LQT routines for general matrices -* - IF( TSTCHK ) THEN - CALL DCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN -* -* XQ: LQT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL DCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN -* -* TS: QR routines for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL DCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN -* -* HH: Householder reconstruction for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 ) PATH - END IF -* - ELSE - -* - WRITE( NOUT, FMT = 9990 )PATH - END IF -* -* Go back to get another input line. -* - GO TO 80 -* -* Branch to this line when the last record is read. -* - 140 CONTINUE - CLOSE ( NIN ) - S2 = DSECND( ) - WRITE( NOUT, FMT = 9998 ) - WRITE( NOUT, FMT = 9997 )S2 - S1 -* - 9999 FORMAT( / ' Execution not attempted due to input errors' ) - 9998 FORMAT( / ' End of tests' ) - 9997 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) - 9996 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be >=', - $ I6 ) - 9995 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be <=', - $ I6 ) - 9994 FORMAT( ' Tests of the DOUBLE PRECISION LAPACK routines ', - $ / ' LAPACK VERSION ', I1, '.', I1, '.', I1, - $ / / ' The following parameter values will be used:' ) - 9993 FORMAT( 4X, A4, ': ', 10I6, / 11X, 10I6 ) - 9992 FORMAT( / ' Routines pass computational tests if test ratio is ', - $ 'less than', F8.2, / ) - 9991 FORMAT( ' Relative machine ', A, ' is taken to be', D16.6 ) - 9990 FORMAT( / 1X, A3, ': Unrecognized path name' ) - 9989 FORMAT( / 1X, A3, ' routines were not tested' ) - 9988 FORMAT( / 1X, A3, ' driver routines were not tested' ) -* -* End of DCHKAA -* - END From f7bcd962c19ec997514ec65f0222713405ac6dea Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 19:54:54 +0200 Subject: [PATCH 231/681] Delete schkaa.f --- lapack-netlib/TESTING/LIN/schkaa.f | 1058 ---------------------------- 1 file changed, 1058 deletions(-) delete mode 100644 lapack-netlib/TESTING/LIN/schkaa.f diff --git a/lapack-netlib/TESTING/LIN/schkaa.f b/lapack-netlib/TESTING/LIN/schkaa.f deleted file mode 100644 index a9c13e442..000000000 --- a/lapack-netlib/TESTING/LIN/schkaa.f +++ /dev/null @@ -1,1058 +0,0 @@ -*> \brief \b SCHKAA -* -* =========== DOCUMENTATION =========== -* -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -* Definition: -* =========== -* -* PROGRAM SCHKAA -* -* -*> \par Purpose: -* ============= -*> -*> \verbatim -*> -*> SCHKAA is the main test program for the REAL LAPACK -*> linear equation routines -*> -*> The program must be driven by a short data file. 
The first 15 records -*> (not including the first comment line) specify problem dimensions -*> and program options using list-directed input. The remaining lines -*> specify the LAPACK test paths and the number of matrix types to use -*> in testing. An annotated example of a data file can be obtained by -*> deleting the first 3 characters from the following 40 lines: -*> Data file for testing REAL LAPACK linear eqn. routines -*> 7 Number of values of M -*> 0 1 2 3 5 10 16 Values of M (row dimension) -*> 7 Number of values of N -*> 0 1 2 3 5 10 16 Values of N (column dimension) -*> 1 Number of values of NRHS -*> 2 Values of NRHS (number of right hand sides) -*> 5 Number of values of NB -*> 1 3 3 3 20 Values of NB (the blocksize) -*> 1 0 5 9 1 Values of NX (crossover point) -*> 3 Number of values of RANK -*> 30 50 90 Values of rank (as a % of N) -*> 20.0 Threshold value of test ratio -*> T Put T to test the LAPACK routines -*> T Put T to test the driver routines -*> T Put T to test the error exits -*> SGE 11 List types on next line if 0 < NTYPES < 11 -*> SGB 8 List types on next line if 0 < NTYPES < 8 -*> SGT 12 List types on next line if 0 < NTYPES < 12 -*> SPO 9 List types on next line if 0 < NTYPES < 9 -*> SPS 9 List types on next line if 0 < NTYPES < 9 -*> SPP 9 List types on next line if 0 < NTYPES < 9 -*> SPB 8 List types on next line if 0 < NTYPES < 8 -*> SPT 12 List types on next line if 0 < NTYPES < 12 -*> SSY 10 List types on next line if 0 < NTYPES < 10 -*> SSR 10 List types on next line if 0 < NTYPES < 10 -*> SSK 10 List types on next line if 0 < NTYPES < 10 -*> SSA 10 List types on next line if 0 < NTYPES < 10 -*> SS2 10 List types on next line if 0 < NTYPES < 10 -*> SSP 10 List types on next line if 0 < NTYPES < 10 -*> STR 18 List types on next line if 0 < NTYPES < 18 -*> STP 18 List types on next line if 0 < NTYPES < 18 -*> STB 17 List types on next line if 0 < NTYPES < 17 -*> SQR 8 List types on next line if 0 < NTYPES < 8 -*> SRQ 8 List types on next line if 0 < NTYPES < 8 -*> SLQ 8 List types on next line if 0 < NTYPES < 8 -*> SQL 8 List types on next line if 0 < NTYPES < 8 -*> SQP 6 List types on next line if 0 < NTYPES < 6 -*> STZ 3 List types on next line if 0 < NTYPES < 3 -*> SLS 6 List types on next line if 0 < NTYPES < 6 -*> SEQ -*> SQT -*> SQX -*> STS -*> SHH -*> \endverbatim -* -* Parameters: -* ========== -* -*> \verbatim -*> NMAX INTEGER -*> The maximum allowable value for M and N. -*> -*> MAXIN INTEGER -*> The number of different values that can be used for each of -*> M, N, NRHS, NB, NX and RANK -*> -*> MAXRHS INTEGER -*> The maximum number of right hand sides -*> -*> MATMAX INTEGER -*> The maximum number of matrix types to use for testing -*> -*> NIN INTEGER -*> The unit number for input -*> -*> NOUT INTEGER -*> The unit number for output -*> \endverbatim -* -* Authors: -* ======== -* -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. -* -*> \date November 2019 -* -*> \ingroup single_lin -* -* ===================================================================== - PROGRAM SCHKAA -* -* -- LAPACK test routine (version 3.9.0) -- -* -- LAPACK is a software package provided by Univ. of Tennessee, -- -* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 -* -* ===================================================================== -* -* .. Parameters .. 
- INTEGER NMAX - PARAMETER ( NMAX = 132 ) - INTEGER MAXIN - PARAMETER ( MAXIN = 12 ) - INTEGER MAXRHS - PARAMETER ( MAXRHS = 16 ) - INTEGER MATMAX - PARAMETER ( MATMAX = 30 ) - INTEGER NIN, NOUT - PARAMETER ( NIN = 5, NOUT = 6 ) - INTEGER KDMAX - PARAMETER ( KDMAX = NMAX+( NMAX+1 ) / 4 ) -* .. -* .. Local Scalars .. - LOGICAL FATAL, TSTCHK, TSTDRV, TSTERR - CHARACTER C1 - CHARACTER*2 C2 - CHARACTER*3 PATH - CHARACTER*10 INTSTR - CHARACTER*72 ALINE - INTEGER I, IC, J, K, LA, LAFAC, LDA, NB, NM, NMATS, NN, - $ NNB, NNB2, NNS, NRHS, NTYPES, NRANK, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - REAL EPS, S1, S2, THREQ, THRESH -* .. -* .. Local Arrays .. - LOGICAL DOTYPE( MATMAX ) - INTEGER IWORK( 25*NMAX ), MVAL( MAXIN ), - $ NBVAL( MAXIN ), NBVAL2( MAXIN ), - $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), - $ RANKVAL( MAXIN ), PIV( NMAX ) - REAL A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), RWORK( 5*NMAX+2*MAXRHS ), - $ S( 2*NMAX ), WORK( NMAX, NMAX+MAXRHS+30 ) -* .. -* .. External Functions .. - LOGICAL LSAME, LSAMEN - REAL SECOND, SLAMCH - EXTERNAL LSAME, LSAMEN, SECOND, SLAMCH -* .. -* .. External Subroutines .. - EXTERNAL ALAREQ, SCHKEQ, SCHKGB, SCHKGE, SCHKGT, SCHKLQ, - $ SCHKORHR_COL, SCHKPB, SCHKPO, SCHKPS, SCHKPP, - $ SCHKPT, SCHKQ3, SCHKQL, SCHKQR, SCHKRQ, SCHKSP, - $ SCHKSY, SCHKSY_ROOK, SCHKSY_RK, SCHKSY_AA, - $ SCHKTB, SCHKTP, SCHKTR, SCHKTZ, SDRVGB, SDRVGE, - $ SDRVGT, SDRVLS, SDRVPB, SDRVPO, SDRVPP, SDRVPT, - $ SDRVSP, SDRVSY, SDRVSY_ROOK, SDRVSY_RK, - $ SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT, SCHKQRTP, - $ SCHKLQT, SCHKTSQR -* .. -* .. Scalars in Common .. - LOGICAL LERR, OK - CHARACTER*32 SRNAMT - INTEGER INFOT, NUNIT -* .. -* .. Arrays in Common .. - INTEGER IPARMS( 100 ) -* .. -* .. Common blocks .. - COMMON / CLAENV / IPARMS - COMMON / INFOC / INFOT, NUNIT, OK, LERR - COMMON / SRNAMC / SRNAMT -* .. -* .. Data statements .. - DATA THREQ / 2.0E0 / , INTSTR / '0123456789' / -* .. -* .. Executable Statements .. -* - S1 = SECOND( ) - LDA = NMAX - FATAL = .FALSE. -* -* Read a dummy line. -* - READ( NIN, FMT = * ) -* -* Report values of parameters. -* - CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) - WRITE( NOUT, FMT = 9994 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH -* -* Read the values of M -* - READ( NIN, FMT = * )NM - IF( NM.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NM ', NM, 1 - NM = 0 - FATAL = .TRUE. - ELSE IF( NM.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NM ', NM, MAXIN - NM = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( MVAL( I ), I = 1, NM ) - DO 10 I = 1, NM - IF( MVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' M ', MVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( MVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9995 )' M ', MVAL( I ), NMAX - FATAL = .TRUE. - END IF - 10 CONTINUE - IF( NM.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'M ', ( MVAL( I ), I = 1, NM ) -* -* Read the values of N -* - READ( NIN, FMT = * )NN - IF( NN.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NN ', NN, 1 - NN = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NN ', NN, MAXIN - NN = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) - DO 20 I = 1, NN - IF( NVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' N ', NVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9995 )' N ', NVAL( I ), NMAX - FATAL = .TRUE. 
- END IF - 20 CONTINUE - IF( NN.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'N ', ( NVAL( I ), I = 1, NN ) -* -* Read the values of NRHS -* - READ( NIN, FMT = * )NNS - IF( NNS.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NNS', NNS, 1 - NNS = 0 - FATAL = .TRUE. - ELSE IF( NNS.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NNS', NNS, MAXIN - NNS = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NSVAL( I ), I = 1, NNS ) - DO 30 I = 1, NNS - IF( NSVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )'NRHS', NSVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NSVAL( I ).GT.MAXRHS ) THEN - WRITE( NOUT, FMT = 9995 )'NRHS', NSVAL( I ), MAXRHS - FATAL = .TRUE. - END IF - 30 CONTINUE - IF( NNS.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NRHS', ( NSVAL( I ), I = 1, NNS ) -* -* Read the values of NB -* - READ( NIN, FMT = * )NNB - IF( NNB.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )'NNB ', NNB, 1 - NNB = 0 - FATAL = .TRUE. - ELSE IF( NNB.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )'NNB ', NNB, MAXIN - NNB = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) - DO 40 I = 1, NNB - IF( NBVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' NB ', NBVAL( I ), 0 - FATAL = .TRUE. - END IF - 40 CONTINUE - IF( NNB.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NB ', ( NBVAL( I ), I = 1, NNB ) -* -* Set NBVAL2 to be the set of unique values of NB -* - NNB2 = 0 - DO 60 I = 1, NNB - NB = NBVAL( I ) - DO 50 J = 1, NNB2 - IF( NB.EQ.NBVAL2( J ) ) - $ GO TO 60 - 50 CONTINUE - NNB2 = NNB2 + 1 - NBVAL2( NNB2 ) = NB - 60 CONTINUE -* -* Read the values of NX -* - READ( NIN, FMT = * )( NXVAL( I ), I = 1, NNB ) - DO 70 I = 1, NNB - IF( NXVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' NX ', NXVAL( I ), 0 - FATAL = .TRUE. - END IF - 70 CONTINUE - IF( NNB.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NX ', ( NXVAL( I ), I = 1, NNB ) -* -* Read the values of RANKVAL -* - READ( NIN, FMT = * )NRANK - IF( NN.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NRANK ', NRANK, 1 - NRANK = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NRANK ', NRANK, MAXIN - NRANK = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( RANKVAL( I ), I = 1, NRANK ) - DO I = 1, NRANK - IF( RANKVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' RANK ', RANKVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( RANKVAL( I ).GT.100 ) THEN - WRITE( NOUT, FMT = 9995 )' RANK ', RANKVAL( I ), 100 - FATAL = .TRUE. - END IF - END DO - IF( NRANK.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'RANK % OF N', - $ ( RANKVAL( I ), I = 1, NRANK ) -* -* Read the threshold value for the test ratios. -* - READ( NIN, FMT = * )THRESH - WRITE( NOUT, FMT = 9992 )THRESH -* -* Read the flag that indicates whether to test the LAPACK routines. -* - READ( NIN, FMT = * )TSTCHK -* -* Read the flag that indicates whether to test the driver routines. -* - READ( NIN, FMT = * )TSTDRV -* -* Read the flag that indicates whether to test the error exits. -* - READ( NIN, FMT = * )TSTERR -* - IF( FATAL ) THEN - WRITE( NOUT, FMT = 9999 ) - STOP - END IF -* -* Calculate and print the machine dependent constants. -* - EPS = SLAMCH( 'Underflow threshold' ) - WRITE( NOUT, FMT = 9991 )'underflow', EPS - EPS = SLAMCH( 'Overflow threshold' ) - WRITE( NOUT, FMT = 9991 )'overflow ', EPS - EPS = SLAMCH( 'Epsilon' ) - WRITE( NOUT, FMT = 9991 )'precision', EPS - WRITE( NOUT, FMT = * ) -* - 80 CONTINUE -* -* Read a test path and the number of matrix types to use. 
-* - READ( NIN, FMT = '(A72)', END = 140 )ALINE - PATH = ALINE( 1: 3 ) - NMATS = MATMAX - I = 3 - 90 CONTINUE - I = I + 1 - IF( I.GT.72 ) THEN - NMATS = MATMAX - GO TO 130 - END IF - IF( ALINE( I: I ).EQ.' ' ) - $ GO TO 90 - NMATS = 0 - 100 CONTINUE - C1 = ALINE( I: I ) - DO 110 K = 1, 10 - IF( C1.EQ.INTSTR( K: K ) ) THEN - IC = K - 1 - GO TO 120 - END IF - 110 CONTINUE - GO TO 130 - 120 CONTINUE - NMATS = NMATS*10 + IC - I = I + 1 - IF( I.GT.72 ) - $ GO TO 130 - GO TO 100 - 130 CONTINUE - C1 = PATH( 1: 1 ) - C2 = PATH( 2: 3 ) - NRHS = NSVAL( 1 ) -* -* Check first character for correct precision. -* - IF( .NOT.LSAME( C1, 'Single precision' ) ) THEN - WRITE( NOUT, FMT = 9990 )PATH -* - ELSE IF( NMATS.LE.0 ) THEN -* -* Check for a positive number of tests requested. -* - WRITE( NOUT, FMT = 9989 )PATH -* - ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN -* -* GE: general matrices -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKGE( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVGE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN -* -* GB: general banded matrices -* - LA = ( 2*KDMAX+1 )*NMAX - LAFAC = ( 3*KDMAX+1 )*NMAX - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKGB( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, A( 1, 1 ), LA, - $ A( 1, 3 ), LAFAC, B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVGB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), LA, A( 1, 3 ), LAFAC, A( 1, 6 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'GT' ) ) THEN -* -* GT: general tridiagonal matrices -* - NTYPES = 12 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKGT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PO' ) ) THEN -* -* PO: positive definite matrices -* - NTYPES = 9 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKPO( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVPO( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PS' ) ) THEN -* -* PS: positive semi-definite matrices -* - NTYPES = 9 -* - CALL 
ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKPS( DOTYPE, NN, NVAL, NNB2, NBVAL2, NRANK, - $ RANKVAL, THRESH, TSTERR, LDA, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), PIV, WORK, RWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PP' ) ) THEN -* -* PP: positive definite packed matrices -* - NTYPES = 9 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKPP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVPP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PB' ) ) THEN -* -* PB: positive definite banded matrices -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKPB( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVPB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PT' ) ) THEN -* -* PT: positive definite tridiagonal matrices -* - NTYPES = 12 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SY' ) ) THEN -* -* SY: symmetric indefinite matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKSY( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVSY( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SR' ) ) THEN -* -* SR: symmetric indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKSY_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVSY_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, 
IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SK' ) ) THEN -* -* SK: symmetric indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm, -* different matrix storage format than SR path version. -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKSY_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVSY_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN -* -* SA: symmetric indefinite matrices, -* with partial (Aasen's) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKSY_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVSY_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN -* -* SA: symmetric indefinite matrices, -* with partial (Aasen's) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, - $ NNS, NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVSY_AA_2STAGE( - $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN -* -* SP: symmetric indefinite packed matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKSP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL SDRVSP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN -* -* TR: triangular matrices -* - NTYPES = 18 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKTR( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN -* -* TP: triangular packed matrices -* - NTYPES = 18 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKTP( DOTYPE, NN, 
NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN -* -* TB: triangular banded matrices -* - NTYPES = 17 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKTB( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QR' ) ) THEN -* -* QR: QR factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKQR( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'LQ' ) ) THEN -* -* LQ: LQ factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKLQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QL' ) ) THEN -* -* QL: QL factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKQL( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'RQ' ) ) THEN -* -* RQ: RQ factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKRQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QP' ) ) THEN -* -* QP: QR factorization with pivoting -* - NTYPES = 6 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ THRESH, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 3 ), WORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN -* -* TZ: Trapezoidal matrix -* - NTYPES = 3 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL SCHKTZ( DOTYPE, NM, MVAL, NN, NVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 3 ), WORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN -* -* LS: Least squares drivers -* - NTYPES = 6 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTDRV ) THEN - CALL SDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, - $ NBVAL, NXVAL, THRESH, TSTERR, A( 1, 1 ), - $ A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ RWORK, RWORK( NMAX+1 ), NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'EQ' ) ) THEN -* -* EQ: Equilibration routines for general and positive definite 
-* matrices (THREQ should be between 2 and 10) -* - IF( TSTCHK ) THEN - CALL SCHKEQ( THREQ, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QT' ) ) THEN -* -* QT: QRT routines for general matrices -* - IF( TSTCHK ) THEN - CALL SCHKQRT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QX' ) ) THEN -* -* QX: QRT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL SCHKQRTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN -* -* TQ: LQT routines for general matrices -* - IF( TSTCHK ) THEN - CALL SCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN -* -* XQ: LQT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL SCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN -* -* TS: QR routines for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL SCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN -* -* HH: Householder reconstruction for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 ) PATH - END IF -* - ELSE -* - WRITE( NOUT, FMT = 9990 )PATH - END IF -* -* Go back to get another input line. -* - GO TO 80 -* -* Branch to this line when the last record is read. -* - 140 CONTINUE - CLOSE ( NIN ) - S2 = SECOND( ) - WRITE( NOUT, FMT = 9998 ) - WRITE( NOUT, FMT = 9997 )S2 - S1 -* - 9999 FORMAT( / ' Execution not attempted due to input errors' ) - 9998 FORMAT( / ' End of tests' ) - 9997 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) - 9996 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be >=', - $ I6 ) - 9995 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be <=', - $ I6 ) - 9994 FORMAT( ' Tests of the REAL LAPACK routines ', - $ / ' LAPACK VERSION ', I1, '.', I1, '.', I1, - $ / / ' The following parameter values will be used:' ) - 9993 FORMAT( 4X, A4, ': ', 10I6, / 11X, 10I6 ) - 9992 FORMAT( / ' Routines pass computational tests if test ratio is ', - $ 'less than', F8.2, / ) - 9991 FORMAT( ' Relative machine ', A, ' is taken to be', E16.6 ) - 9990 FORMAT( / 1X, A3, ': Unrecognized path name' ) - 9989 FORMAT( / 1X, A3, ' routines were not tested' ) - 9988 FORMAT( / 1X, A3, ' driver routines were not tested' ) -* -* End of SCHKAA -* - END From 15b9d6b4a70aed9e7010aea7009f14f1098e11c1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 19:55:31 +0200 Subject: [PATCH 232/681] Delete zchkaa.f --- lapack-netlib/TESTING/LIN/zchkaa.f | 1255 ---------------------------- 1 file changed, 1255 deletions(-) delete mode 100644 lapack-netlib/TESTING/LIN/zchkaa.f diff --git a/lapack-netlib/TESTING/LIN/zchkaa.f b/lapack-netlib/TESTING/LIN/zchkaa.f deleted file mode 100644 index 30d2a084a..000000000 --- a/lapack-netlib/TESTING/LIN/zchkaa.f +++ /dev/null @@ -1,1255 +0,0 @@ -*> \brief \b ZCHKAA -* -* =========== DOCUMENTATION =========== -* -* Online html documentation available at -* http://www.netlib.org/lapack/explore-html/ -* -* Definition: -* =========== -* -* PROGRAM ZCHKAA -* -* -*> \par 
Purpose: -* ============= -*> -*> \verbatim -*> -*> ZCHKAA is the main test program for the COMPLEX*16 linear equation -*> routines. -*> -*> The program must be driven by a short data file. The first 15 records -*> (not including the first comment line) specify problem dimensions -*> and program options using list-directed input. The remaining lines -*> specify the LAPACK test paths and the number of matrix types to use -*> in testing. An annotated example of a data file can be obtained by -*> deleting the first 3 characters from the following 42 lines: -*> Data file for testing COMPLEX*16 LAPACK linear equation routines -*> 7 Number of values of M -*> 0 1 2 3 5 10 16 Values of M (row dimension) -*> 7 Number of values of N -*> 0 1 2 3 5 10 16 Values of N (column dimension) -*> 1 Number of values of NRHS -*> 2 Values of NRHS (number of right hand sides) -*> 5 Number of values of NB -*> 1 3 3 3 20 Values of NB (the blocksize) -*> 1 0 5 9 1 Values of NX (crossover point) -*> 3 Number of values of RANK -*> 30 50 90 Values of rank (as a % of N) -*> 30.0 Threshold value of test ratio -*> T Put T to test the LAPACK routines -*> T Put T to test the driver routines -*> T Put T to test the error exits -*> ZGE 11 List types on next line if 0 < NTYPES < 11 -*> ZGB 8 List types on next line if 0 < NTYPES < 8 -*> ZGT 12 List types on next line if 0 < NTYPES < 12 -*> ZPO 9 List types on next line if 0 < NTYPES < 9 -*> ZPS 9 List types on next line if 0 < NTYPES < 9 -*> ZPP 9 List types on next line if 0 < NTYPES < 9 -*> ZPB 8 List types on next line if 0 < NTYPES < 8 -*> ZPT 12 List types on next line if 0 < NTYPES < 12 -*> ZHE 10 List types on next line if 0 < NTYPES < 10 -*> ZHR 10 List types on next line if 0 < NTYPES < 10 -*> ZHK 10 List types on next line if 0 < NTYPES < 10 -*> ZHA 10 List types on next line if 0 < NTYPES < 10 -*> ZH2 10 List types on next line if 0 < NTYPES < 10 -*> ZSA 11 List types on next line if 0 < NTYPES < 10 -*> ZS2 11 List types on next line if 0 < NTYPES < 10 -*> ZHP 10 List types on next line if 0 < NTYPES < 10 -*> ZSY 11 List types on next line if 0 < NTYPES < 11 -*> ZSR 11 List types on next line if 0 < NTYPES < 11 -*> ZSK 11 List types on next line if 0 < NTYPES < 11 -*> ZSP 11 List types on next line if 0 < NTYPES < 11 -*> ZTR 18 List types on next line if 0 < NTYPES < 18 -*> ZTP 18 List types on next line if 0 < NTYPES < 18 -*> ZTB 17 List types on next line if 0 < NTYPES < 17 -*> ZQR 8 List types on next line if 0 < NTYPES < 8 -*> ZRQ 8 List types on next line if 0 < NTYPES < 8 -*> ZLQ 8 List types on next line if 0 < NTYPES < 8 -*> ZQL 8 List types on next line if 0 < NTYPES < 8 -*> ZQP 6 List types on next line if 0 < NTYPES < 6 -*> ZTZ 3 List types on next line if 0 < NTYPES < 3 -*> ZLS 6 List types on next line if 0 < NTYPES < 6 -*> ZEQ -*> ZQT -*> ZQX -*> ZTS -*> ZHH -*> \endverbatim -* -* Parameters: -* ========== -* -*> \verbatim -*> NMAX INTEGER -*> The maximum allowable value for M and N. -*> -*> MAXIN INTEGER -*> The number of different values that can be used for each of -*> M, N, NRHS, NB, NX and RANK -*> -*> MAXRHS INTEGER -*> The maximum number of right hand sides -*> -*> MATMAX INTEGER -*> The maximum number of matrix types to use for testing -*> -*> NIN INTEGER -*> The unit number for input -*> -*> NOUT INTEGER -*> The unit number for output -*> \endverbatim -* -* Authors: -* ======== -* -*> \author Univ. of Tennessee -*> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver -*> \author NAG Ltd. 
-* -*> \date November 2019 -* -*> \ingroup complex16_lin -* -* ===================================================================== - PROGRAM ZCHKAA -* -* -- LAPACK test routine (version 3.9.0) -- -* -- LAPACK is a software package provided by Univ. of Tennessee, -- -* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 -* -* ===================================================================== -* -* .. Parameters .. - INTEGER NMAX - PARAMETER ( NMAX = 132 ) - INTEGER MAXIN - PARAMETER ( MAXIN = 12 ) - INTEGER MAXRHS - PARAMETER ( MAXRHS = 16 ) - INTEGER MATMAX - PARAMETER ( MATMAX = 30 ) - INTEGER NIN, NOUT - PARAMETER ( NIN = 5, NOUT = 6 ) - INTEGER KDMAX - PARAMETER ( KDMAX = NMAX+( NMAX+1 ) / 4 ) -* .. -* .. Local Scalars .. - LOGICAL FATAL, TSTCHK, TSTDRV, TSTERR - CHARACTER C1 - CHARACTER*2 C2 - CHARACTER*3 PATH - CHARACTER*10 INTSTR - CHARACTER*72 ALINE - INTEGER I, IC, J, K, LA, LAFAC, LDA, NB, NM, NMATS, NN, - $ NNB, NNB2, NNS, NRHS, NTYPES, NRANK, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - DOUBLE PRECISION EPS, S1, S2, THREQ, THRESH -* .. -* .. Local Arrays .. - LOGICAL DOTYPE( MATMAX ) - INTEGER IWORK( 25*NMAX ), MVAL( MAXIN ), - $ NBVAL( MAXIN ), NBVAL2( MAXIN ), - $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), - $ RANKVAL( MAXIN ), PIV( NMAX ) - DOUBLE PRECISION RWORK( 150*NMAX+2*MAXRHS ), S( 2*NMAX ) - COMPLEX*16 A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), WORK( NMAX, NMAX+MAXRHS+10 ) -* .. -* .. External Functions .. - LOGICAL LSAME, LSAMEN - DOUBLE PRECISION DLAMCH, DSECND - EXTERNAL LSAME, LSAMEN, DLAMCH, DSECND -* .. -* .. External Subroutines .. - EXTERNAL ALAREQ, ZCHKEQ, ZCHKGB, ZCHKGE, ZCHKGT, ZCHKHE, - $ ZCHKHE_ROOK, ZCHKHE_RK, ZCHKHE_AA, ZCHKHP, - $ ZCHKLQ, ZCHKUNHR_COL, ZCHKPB, ZCHKPO, ZCHKPS, - $ ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQL, ZCHKQR, ZCHKRQ, - $ ZCHKSP, ZCHKSY, ZCHKSY_ROOK, ZCHKSY_RK, - $ ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR, ZCHKTZ, - $ ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE, ZDRVHE_ROOK, - $ ZDRVHE_RK, ZDRVHE_AA, ZDRVHE_AA_2STAGE, ZDRVHP, - $ ZDRVLS, ZDRVPB, ZDRVPO, ZDRVPP, ZDRVPT, - $ ZDRVSP, ZDRVSY, ZDRVSY_ROOK, ZDRVSY_RK, - $ ZDRVSY_AA, ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, - $ ZCHKQRTP, ZCHKLQT, ZCHKLQTP, ZCHKTSQR -* .. -* .. Scalars in Common .. - LOGICAL LERR, OK - CHARACTER*32 SRNAMT - INTEGER INFOT, NUNIT -* .. -* .. Arrays in Common .. - INTEGER IPARMS( 100 ) -* .. -* .. Common blocks .. - COMMON / INFOC / INFOT, NUNIT, OK, LERR - COMMON / SRNAMC / SRNAMT - COMMON / CLAENV / IPARMS -* .. -* .. Data statements .. - DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / -* .. -* .. Executable Statements .. -* - S1 = DSECND( ) - LDA = NMAX - FATAL = .FALSE. -* -* Read a dummy line. -* - READ( NIN, FMT = * ) -* -* Report values of parameters. -* - CALL ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH ) - WRITE( NOUT, FMT = 9994 ) VERS_MAJOR, VERS_MINOR, VERS_PATCH -* -* Read the values of M -* - READ( NIN, FMT = * )NM - IF( NM.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NM ', NM, 1 - NM = 0 - FATAL = .TRUE. - ELSE IF( NM.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NM ', NM, MAXIN - NM = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( MVAL( I ), I = 1, NM ) - DO 10 I = 1, NM - IF( MVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' M ', MVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( MVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9995 )' M ', MVAL( I ), NMAX - FATAL = .TRUE. 
- END IF - 10 CONTINUE - IF( NM.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'M ', ( MVAL( I ), I = 1, NM ) -* -* Read the values of N -* - READ( NIN, FMT = * )NN - IF( NN.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NN ', NN, 1 - NN = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NN ', NN, MAXIN - NN = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NVAL( I ), I = 1, NN ) - DO 20 I = 1, NN - IF( NVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' N ', NVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NVAL( I ).GT.NMAX ) THEN - WRITE( NOUT, FMT = 9995 )' N ', NVAL( I ), NMAX - FATAL = .TRUE. - END IF - 20 CONTINUE - IF( NN.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'N ', ( NVAL( I ), I = 1, NN ) -* -* Read the values of NRHS -* - READ( NIN, FMT = * )NNS - IF( NNS.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NNS', NNS, 1 - NNS = 0 - FATAL = .TRUE. - ELSE IF( NNS.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NNS', NNS, MAXIN - NNS = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NSVAL( I ), I = 1, NNS ) - DO 30 I = 1, NNS - IF( NSVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )'NRHS', NSVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( NSVAL( I ).GT.MAXRHS ) THEN - WRITE( NOUT, FMT = 9995 )'NRHS', NSVAL( I ), MAXRHS - FATAL = .TRUE. - END IF - 30 CONTINUE - IF( NNS.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NRHS', ( NSVAL( I ), I = 1, NNS ) -* -* Read the values of NB -* - READ( NIN, FMT = * )NNB - IF( NNB.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )'NNB ', NNB, 1 - NNB = 0 - FATAL = .TRUE. - ELSE IF( NNB.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )'NNB ', NNB, MAXIN - NNB = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) - DO 40 I = 1, NNB - IF( NBVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' NB ', NBVAL( I ), 0 - FATAL = .TRUE. - END IF - 40 CONTINUE - IF( NNB.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NB ', ( NBVAL( I ), I = 1, NNB ) -* -* Set NBVAL2 to be the set of unique values of NB -* - NNB2 = 0 - DO 60 I = 1, NNB - NB = NBVAL( I ) - DO 50 J = 1, NNB2 - IF( NB.EQ.NBVAL2( J ) ) - $ GO TO 60 - 50 CONTINUE - NNB2 = NNB2 + 1 - NBVAL2( NNB2 ) = NB - 60 CONTINUE -* -* Read the values of NX -* - READ( NIN, FMT = * )( NXVAL( I ), I = 1, NNB ) - DO 70 I = 1, NNB - IF( NXVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' NX ', NXVAL( I ), 0 - FATAL = .TRUE. - END IF - 70 CONTINUE - IF( NNB.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'NX ', ( NXVAL( I ), I = 1, NNB ) -* -* Read the values of RANKVAL -* - READ( NIN, FMT = * )NRANK - IF( NN.LT.1 ) THEN - WRITE( NOUT, FMT = 9996 )' NRANK ', NRANK, 1 - NRANK = 0 - FATAL = .TRUE. - ELSE IF( NN.GT.MAXIN ) THEN - WRITE( NOUT, FMT = 9995 )' NRANK ', NRANK, MAXIN - NRANK = 0 - FATAL = .TRUE. - END IF - READ( NIN, FMT = * )( RANKVAL( I ), I = 1, NRANK ) - DO I = 1, NRANK - IF( RANKVAL( I ).LT.0 ) THEN - WRITE( NOUT, FMT = 9996 )' RANK ', RANKVAL( I ), 0 - FATAL = .TRUE. - ELSE IF( RANKVAL( I ).GT.100 ) THEN - WRITE( NOUT, FMT = 9995 )' RANK ', RANKVAL( I ), 100 - FATAL = .TRUE. - END IF - END DO - IF( NRANK.GT.0 ) - $ WRITE( NOUT, FMT = 9993 )'RANK % OF N', - $ ( RANKVAL( I ), I = 1, NRANK ) -* -* Read the threshold value for the test ratios. -* - READ( NIN, FMT = * )THRESH - WRITE( NOUT, FMT = 9992 )THRESH -* -* Read the flag that indicates whether to test the LAPACK routines. -* - READ( NIN, FMT = * )TSTCHK -* -* Read the flag that indicates whether to test the driver routines. -* - READ( NIN, FMT = * )TSTDRV -* -* Read the flag that indicates whether to test the error exits. 
-* - READ( NIN, FMT = * )TSTERR -* - IF( FATAL ) THEN - WRITE( NOUT, FMT = 9999 ) - STOP - END IF -* -* Calculate and print the machine dependent constants. -* - EPS = DLAMCH( 'Underflow threshold' ) - WRITE( NOUT, FMT = 9991 )'underflow', EPS - EPS = DLAMCH( 'Overflow threshold' ) - WRITE( NOUT, FMT = 9991 )'overflow ', EPS - EPS = DLAMCH( 'Epsilon' ) - WRITE( NOUT, FMT = 9991 )'precision', EPS - WRITE( NOUT, FMT = * ) - NRHS = NSVAL( 1 ) -* - 80 CONTINUE -* -* Read a test path and the number of matrix types to use. -* - READ( NIN, FMT = '(A72)', END = 140 )ALINE - PATH = ALINE( 1: 3 ) - NMATS = MATMAX - I = 3 - 90 CONTINUE - I = I + 1 - IF( I.GT.72 ) - $ GO TO 130 - IF( ALINE( I: I ).EQ.' ' ) - $ GO TO 90 - NMATS = 0 - 100 CONTINUE - C1 = ALINE( I: I ) - DO 110 K = 1, 10 - IF( C1.EQ.INTSTR( K: K ) ) THEN - IC = K - 1 - GO TO 120 - END IF - 110 CONTINUE - GO TO 130 - 120 CONTINUE - NMATS = NMATS*10 + IC - I = I + 1 - IF( I.GT.72 ) - $ GO TO 130 - GO TO 100 - 130 CONTINUE - C1 = PATH( 1: 1 ) - C2 = PATH( 2: 3 ) -* -* Check first character for correct precision. -* - IF( .NOT.LSAME( C1, 'Zomplex precision' ) ) THEN - WRITE( NOUT, FMT = 9990 )PATH -* - ELSE IF( NMATS.LE.0 ) THEN -* -* Check for a positive number of tests requested. -* - WRITE( NOUT, FMT = 9989 )PATH -* - ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN -* -* GE: general matrices -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKGE( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVGE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'GB' ) ) THEN -* -* GB: general banded matrices -* - LA = ( 2*KDMAX+1 )*NMAX - LAFAC = ( 3*KDMAX+1 )*NMAX - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKGB( DOTYPE, NM, MVAL, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, A( 1, 1 ), LA, - $ A( 1, 3 ), LAFAC, B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVGB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), LA, A( 1, 3 ), LAFAC, A( 1, 6 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'GT' ) ) THEN -* -* GT: general tridiagonal matrices -* - NTYPES = 12 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKGT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVGT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PO' ) ) THEN -* -* PO: positive definite matrices -* - NTYPES = 9 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKPO( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), 
B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVPO( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PS' ) ) THEN -* -* PS: positive semi-definite matrices -* - NTYPES = 9 -* - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKPS( DOTYPE, NN, NVAL, NNB2, NBVAL2, NRANK, - $ RANKVAL, THRESH, TSTERR, LDA, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), PIV, WORK, RWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PP' ) ) THEN -* -* PP: positive definite packed matrices -* - NTYPES = 9 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKPP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVPP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PB' ) ) THEN -* -* PB: positive definite banded matrices -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKPB( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVPB( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), S, WORK, - $ RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'PT' ) ) THEN -* -* PT: positive definite tridiagonal matrices -* - NTYPES = 12 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKPT( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ A( 1, 1 ), S, A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVPT( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ A( 1, 1 ), S, A( 1, 2 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HE' ) ) THEN -* -* HE: Hermitian indefinite matrices -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKHE( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVHE( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF - - ELSE IF( LSAMEN( 2, C2, 'HR' ) ) THEN -* -* HR: Hermitian indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm, -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKHE_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, 
NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVHE_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HK' ) ) THEN -* -* HK: Hermitian indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm, -* different matrix storage format than HR path version. -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKHE_RK ( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVHE_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HA' ) ) THEN -* -* HA: Hermitian matrices, -* Aasen Algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKHE_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVHE_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN -* -* H2: Hermitian matrices, -* with partial (Aasen's) pivoting algorithm -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, - $ NNS, NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVHE_AA_2STAGE( - $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* -* - ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN -* -* HP: Hermitian indefinite packed matrices -* - NTYPES = 10 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKHP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVHP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SY' ) ) THEN -* -* SY: symmetric indefinite matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKSY( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 
1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVSY( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SR' ) ) THEN -* -* SR: symmetric indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKSY_ROOK(DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVSY_ROOK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SK' ) ) THEN -* -* SK: symmetric indefinite matrices, -* with bounded Bunch-Kaufman (rook) pivoting algorithm, -* different matrix storage format than SR path version. -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKSY_RK( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ E, A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVSY_RK( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), E, A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN -* -* SA: symmetric indefinite matrices with Aasen's algorithm, -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKSY_AA( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ A( 1, 3 ), B( 1, 1 ), B( 1, 2 ), - $ B( 1, 3 ), WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVSY_AA( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN -* -* S2: symmetric indefinite matrices with Aasen's algorithm -* 2 stage -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKSY_AA_2STAGE( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, - $ NSVAL, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVSY_AA_2STAGE( - $ DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, - $ RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN -* -* SP: symmetric indefinite packed matrices, -* with partial (Bunch-Kaufman) pivoting algorithm -* - NTYPES = 11 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKSP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ 
LDA, A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - IF( TSTDRV ) THEN - CALL ZDRVSP( DOTYPE, NN, NVAL, NRHS, THRESH, TSTERR, LDA, - $ A( 1, 1 ), A( 1, 2 ), A( 1, 3 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9988 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TR' ) ) THEN -* -* TR: triangular matrices -* - NTYPES = 18 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKTR( DOTYPE, NN, NVAL, NNB2, NBVAL2, NNS, NSVAL, - $ THRESH, TSTERR, LDA, A( 1, 1 ), A( 1, 2 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), WORK, RWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN -* -* TP: triangular packed matrices -* - NTYPES = 18 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKTP( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TB' ) ) THEN -* -* TB: triangular banded matrices -* - NTYPES = 17 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKTB( DOTYPE, NN, NVAL, NNS, NSVAL, THRESH, TSTERR, - $ LDA, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ), - $ B( 1, 2 ), B( 1, 3 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QR' ) ) THEN -* -* QR: QR factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKQR( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'LQ' ) ) THEN -* -* LQ: LQ factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKLQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QL' ) ) THEN -* -* QL: QL factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKQL( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'RQ' ) ) THEN -* -* RQ: RQ factorization -* - NTYPES = 8 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKRQ( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ NRHS, THRESH, TSTERR, NMAX, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ B( 1, 1 ), B( 1, 2 ), B( 1, 3 ), B( 1, 4 ), - $ WORK, RWORK, IWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'EQ' ) ) THEN -* -* EQ: Equilibration routines for general and positive definite -* matrices (THREQ should be between 2 and 10) -* - IF( TSTCHK ) THEN - CALL ZCHKEQ( THREQ, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE 
IF( LSAMEN( 2, C2, 'TZ' ) ) THEN -* -* TZ: Trapezoidal matrix -* - NTYPES = 3 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKTZ( DOTYPE, NM, MVAL, NN, NVAL, THRESH, TSTERR, - $ A( 1, 1 ), A( 1, 2 ), S( 1 ), - $ B( 1, 1 ), WORK, RWORK, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QP' ) ) THEN -* -* QP: QR factorization with pivoting -* - NTYPES = 6 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTCHK ) THEN - CALL ZCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL, - $ THRESH, A( 1, 1 ), A( 1, 2 ), S( 1 ), - $ B( 1, 1 ), WORK, RWORK, IWORK, - $ NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN -* -* LS: Least squares drivers -* - NTYPES = 6 - CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) -* - IF( TSTDRV ) THEN - CALL ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, - $ NBVAL, NXVAL, THRESH, TSTERR, A( 1, 1 ), - $ A( 1, 2 ), A( 1, 3 ), A( 1, 4 ), A( 1, 5 ), - $ S( 1 ), S( NMAX+1 ), NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* -* - ELSE IF( LSAMEN( 2, C2, 'QT' ) ) THEN -* -* QT: QRT routines for general matrices -* - IF( TSTCHK ) THEN - CALL ZCHKQRT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'QX' ) ) THEN -* -* QX: QRT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL ZCHKQRTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN -* -* TQ: LQT routines for general matrices -* - IF( TSTCHK ) THEN - CALL ZCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN -* -* XQ: LQT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL ZCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN -* -* TS: QR routines for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL ZCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TQ' ) ) THEN -* -* TQ: LQT routines for general matrices -* - IF( TSTCHK ) THEN - CALL ZCHKLQT( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'XQ' ) ) THEN -* -* XQ: LQT routines for triangular-pentagonal matrices -* - IF( TSTCHK ) THEN - CALL ZCHKLQTP( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'TS' ) ) THEN -* -* TS: QR routines for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL ZCHKTSQR( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 )PATH - END IF -* - ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN -* -* HH: Householder reconstruction for tall-skinny matrices -* - IF( TSTCHK ) THEN - CALL ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) - ELSE - WRITE( NOUT, FMT = 9989 ) PATH - END IF -* - ELSE -* - WRITE( NOUT, FMT = 9990 )PATH - END IF -* -* Go back to get another input line. -* - GO TO 80 -* -* Branch to this line when the last record is read. 
-* - 140 CONTINUE - CLOSE ( NIN ) - S2 = DSECND( ) - WRITE( NOUT, FMT = 9998 ) - WRITE( NOUT, FMT = 9997 )S2 - S1 -* - 9999 FORMAT( / ' Execution not attempted due to input errors' ) - 9998 FORMAT( / ' End of tests' ) - 9997 FORMAT( ' Total time used = ', F12.2, ' seconds', / ) - 9996 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be >=', - $ I6 ) - 9995 FORMAT( ' Invalid input value: ', A4, '=', I6, '; must be <=', - $ I6 ) - 9994 FORMAT( ' Tests of the COMPLEX*16 LAPACK routines ', - $ / ' LAPACK VERSION ', I1, '.', I1, '.', I1, - $ / / ' The following parameter values will be used:' ) - 9993 FORMAT( 4X, A4, ': ', 10I6, / 11X, 10I6 ) - 9992 FORMAT( / ' Routines pass computational tests if test ratio is ', - $ 'less than', F8.2, / ) - 9991 FORMAT( ' Relative machine ', A, ' is taken to be', D16.6 ) - 9990 FORMAT( / 1X, A3, ': Unrecognized path name' ) - 9989 FORMAT( / 1X, A3, ' routines were not tested' ) - 9988 FORMAT( / 1X, A3, ' driver routines were not tested' ) -* -* End of ZCHKAA -* - END From 26e87ac517edd08ac8da373e6cba4584d65479a2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 20:39:55 +0200 Subject: [PATCH 233/681] Support Intel Ice Lake SP as Cooper Lake --- cpuid_x86.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 44704fcd9..18ff122e5 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1398,6 +1398,17 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; + case 10: // Ice Lake SP + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; } break; case 7: // family 6 exmodel 7 @@ -2112,7 +2123,22 @@ int get_coretype(void){ #endif else return CORE_NEHALEM; -#endif +#endif + if (model == 10) +#ifndef NO_AVX512 + if(support_avx512_bf16()) + return CORE_COOPERLAKE; + return CORE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; +#endif break; case 7: if (model == 10) From cbfd3c87e17f9a3123e25802d07613842f325ca2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 20:44:06 +0200 Subject: [PATCH 234/681] Recognize Intel Ice Lake SP as Cooper Lake --- driver/others/dynamic.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 158e1b3da..46ad06a7c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -621,6 +621,22 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; } } + if (model == 10) { + // Ice Lake SP + if(support_avx512_bf16()) + return &gotoblas_COOPERLAKE; + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } return NULL; case 7: if (model == 10) // Goldmont Plus From c4da892ba0798f8697e7b3219fd631651647e45f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 May 2021 23:19:10 +0200 Subject: [PATCH 235/681] Only filter out -mavx on Sandybridge ZGEMM/ZTRMM kernels --- kernel/Makefile.L3 | 86 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 18 deletions(-) 
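Note on the hunks that follow: until this change, every ZGEMM/ZTRMM kernel rule in
kernel/Makefile.L3 compiled with $(filter-out -mavx,$(CFLAGS)), so the -mavx
workaround was applied to all targets. The patch narrows it to the Sandy Bridge
core by turning each two-way ifeq/else into a three-way ifeq / else ifeq / else,
and every other core now keeps its full CFLAGS. A minimal sketch of the pattern,
illustrative only: the variable name KERNEL_CFLAGS and the sample flag values are
made up here, and "else ifeq" requires GNU Make 3.81 or later.

  # demo.mk -- try: make -f demo.mk CORE=SANDYBRIDGE
  CFLAGS := -O2 -mavx -DCOMPLEX
  ifeq ($(CORE), SANDYBRIDGE)
  # Sandy Bridge only: strip the exact word -mavx from the flag list
  KERNEL_CFLAGS := $(filter-out -mavx,$(CFLAGS))
  else
  # all other cores compile with CFLAGS unmodified
  KERNEL_CFLAGS := $(CFLAGS)
  endif
  $(info KERNEL_CFLAGS = $(KERNEL_CFLAGS))
  all: ;

Since filter-out matches whole whitespace-separated words, only the literal
-mavx token is removed; related flags such as -mavx2 would pass through
unchanged, which is why the workaround can be scoped this narrowly.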
diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index be10ee018..2d9e3ec36 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -818,8 +818,10 @@ ifeq ($(OS), AIX) m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s -else +else ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ endif $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) @@ -828,8 +830,10 @@ ifeq ($(OS), AIX) m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s -else +else ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) @@ -838,8 +842,10 @@ ifeq ($(OS), AIX) m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s -else +else ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) @@ -848,8 +854,10 @@ ifeq ($(OS), AIX) m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s -else +else ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ endif $(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) @@ -1044,8 +1052,10 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s -else +else ifeq ($(CORE), SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1054,8 +1064,10 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s -else +else ifeq ($(CORE), SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1064,8 +1076,10 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s -else +else ifeq ($(CORE), SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA 
-DCONJ -DCN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1074,8 +1088,10 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s -else +else ifeq ($(CORE), SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1084,8 +1100,10 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s -else +else ifeq ($(CORE), SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1094,8 +1112,10 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s -else +else ifeq ($(CORE), SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1104,8 +1124,10 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s -else +else ifeq ($(CORE), SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) @@ -1114,8 +1136,10 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s -else +else ifeq ($(CORE), SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ endif else @@ -1187,28 +1211,54 @@ $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +endif endif From 32264ba496e49f774a4fe3d63ff06cac6d37ef62 Mon Sep 17 00:00:00 2001 From: Noan <66834344+dnoan@users.noreply.github.com> Date: Sun, 16 May 2021 09:49:13 +0000 Subject: [PATCH 236/681] Update Makefile.arm64 Added -march and -mtune flags for EMAG processors when GCC 9 or later --- Makefile.arm64 | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Makefile.arm64 b/Makefile.arm64 index 23362b4e5..3858d7e3f 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -107,4 +107,13 @@ FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 endif endif endif + +ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq ($(CORE), EMAG8180) +CCOMMON_OPT += -march=armv8-a -mtune=emag +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=emag +endif +endif +endif endif From 26ccf643a38ef501981b3dc629a78f3ed4bdd39f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 May 2021 13:04:38 +0200 Subject: [PATCH 237/681] Add -lm for FreeBSD on ARM/ARM64 --- Makefile.system | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile.system b/Makefile.system index ae703e4d9..bffe684d7 100644 --- a/Makefile.system +++ b/Makefile.system @@ -380,6 +380,12 @@ ifeq ($(OSNAME), AIX) EXTRALIB += -lm endif +ifeq ($(OSNAME), FreeBSD) +ifeq ($(ARCH), $(filter ($ARCH),arm arm64)) +EXTRALIB += -lm +endif +endif + ifeq ($(OSNAME), WINNT) NEED_PIC = 0 NO_EXPRECISION = 1 From 5c729c6dce38bda7c870325bc0fcd035ae65f1bd 
Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 May 2021 14:47:14 +0200 Subject: [PATCH 238/681] Correct function name in error message from SLASQ2 (Reference-LAPACK PR 555) --- lapack-netlib/SRC/slasq2.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/slasq2.f b/lapack-netlib/SRC/slasq2.f index 219797c4a..c0c71b82e 100644 --- a/lapack-netlib/SRC/slasq2.f +++ b/lapack-netlib/SRC/slasq2.f @@ -185,7 +185,7 @@ * IF( Z( 1 ).LT.ZERO ) THEN INFO = -201 - CALL XERBLA( 'DLASQ2', 2 ) + CALL XERBLA( 'SLASQ2', 2 ) RETURN ELSE IF( Z( 2 ).LT.ZERO ) THEN INFO = -202 From 0e73d206297f5e419647f4d579da8a93e9b730dd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 May 2021 14:23:49 +0200 Subject: [PATCH 239/681] Handle inadvertent use of DYNAMIC_ARCH=0 --- Makefile.x86 | 2 +- Makefile.x86_64 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.x86 b/Makefile.x86 index 893379c33..25ca660bd 100644 --- a/Makefile.x86 +++ b/Makefile.x86 @@ -1,6 +1,6 @@ # COMPILER_PREFIX = mingw32- -ifndef DYNAMIC_ARCH +ifneq ($(DYNAMIC_ARCH),1) ADD_CPUFLAGS = 1 else ifdef TARGET_CORE diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f62ab9e5e..307cbe1d9 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -9,7 +9,7 @@ endif endif -ifndef DYNAMIC_ARCH +ifneq ($(DYNAMIC_ARCH),1) ADD_CPUFLAGS = 1 else ifdef TARGET_CORE From 3a53207cc9f5907c257359ef37dc3c0df3f62ac2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 May 2021 14:29:45 +0200 Subject: [PATCH 240/681] Fix spurious error exit test failures in the ?chktsqr tests (LAPACK564) --- lapack-netlib/TESTING/LIN/cchktsqr.f | 2 ++ lapack-netlib/TESTING/LIN/dchktsqr.f | 2 ++ lapack-netlib/TESTING/LIN/schktsqr.f | 2 ++ lapack-netlib/TESTING/LIN/zchktsqr.f | 2 ++ 4 files changed, 8 insertions(+) diff --git a/lapack-netlib/TESTING/LIN/cchktsqr.f b/lapack-netlib/TESTING/LIN/cchktsqr.f index 8288916db..62b6ce434 100644 --- a/lapack-netlib/TESTING/LIN/cchktsqr.f +++ b/lapack-netlib/TESTING/LIN/cchktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL CERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/dchktsqr.f b/lapack-netlib/TESTING/LIN/dchktsqr.f index c4b1f01bd..14119e6e5 100644 --- a/lapack-netlib/TESTING/LIN/dchktsqr.f +++ b/lapack-netlib/TESTING/LIN/dchktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL DERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/schktsqr.f b/lapack-netlib/TESTING/LIN/schktsqr.f index 2bed434a8..aa4d6f9c4 100644 --- a/lapack-netlib/TESTING/LIN/schktsqr.f +++ b/lapack-netlib/TESTING/LIN/schktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL SERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/zchktsqr.f b/lapack-netlib/TESTING/LIN/zchktsqr.f index e6e6ac556..678b1772f 100644 --- a/lapack-netlib/TESTING/LIN/zchktsqr.f +++ b/lapack-netlib/TESTING/LIN/zchktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL ZERRTSQR( PATH, NOUT ) INFOT = 0 * From 03297ff9f08d8fe42e4ef93f6f54bd82c6a9f6fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 May 2021 20:41:18 +0200 Subject: [PATCH 241/681] Add fast path for small xSYR with INCX==1 --- interface/syr.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git 
a/interface/syr.c b/interface/syr.c index 1374bcc69..ad75264b1 100644 --- a/interface/syr.c +++ b/interface/syr.c @@ -168,7 +168,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, IDEBUG_START; FUNCTION_PROFILE_START(); - +#if 1 + if (incx == 1 && n < 100) { + BLASLONG i; + + if (uplo == 0) { + for (i = 0; i < n; i++){ + if (x[i] != ZERO) { + AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0); + } + a += lda; + } + } else { + for (i = 0; i < n; i++){ + if (x[i] != ZERO) { + AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0); + } + a += 1 + lda; + } + } + return; + } +#endif if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); From 4fbc0777f434aa4e7b8e91279ca0e6bc54ea63a8 Mon Sep 17 00:00:00 2001 From: MikaelUrankar Date: Wed, 26 May 2021 12:14:57 +0200 Subject: [PATCH 242/681] Fix typo --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index bffe684d7..2264b143b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -381,7 +381,7 @@ EXTRALIB += -lm endif ifeq ($(OSNAME), FreeBSD) -ifeq ($(ARCH), $(filter ($ARCH),arm arm64)) +ifeq ($(ARCH), $(filter $(ARCH),arm arm64)) EXTRALIB += -lm endif endif From f0e7345fb8513afea09a7b848508f4800a225a9a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 26 May 2021 22:02:34 +0200 Subject: [PATCH 243/681] Add shortcut for small-size gemv_n with increments of one --- interface/gemv.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/interface/gemv.c b/interface/gemv.c index d5d739fb1..b6c2e6095 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -202,6 +202,11 @@ void CNAME(enum CBLAS_ORDER order, if (alpha == ZERO) return; + if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { + GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, buffer); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); From d6d7a6685dc5189b43519bb0d5a5fba52b4b0955 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 May 2021 22:39:18 +0200 Subject: [PATCH 244/681] Add shortcuts for (small) cases that do not need expensive buffer allocation --- interface/ger.c | 5 +++++ interface/spr.c | 20 ++++++++++++++++++++ interface/spr2.c | 18 ++++++++++++++++++ interface/symv.c | 4 ++++ interface/syr2.c | 19 +++++++++++++++++++ interface/zsyr.c | 26 ++++++++++++++++++++++++++ 6 files changed, 92 insertions(+) diff --git a/interface/ger.c b/interface/ger.c index 8cf1614e3..1c72d51ec 100644 --- a/interface/ger.c +++ b/interface/ger.c @@ -164,6 +164,11 @@ void CNAME(enum CBLAS_ORDER order, if (m == 0 || n == 0) return; if (alpha == 0.) 
return; + if (incx == 1 && incy == 1 && 1L*m*n <= 2048 *GEMM_MULTITHREAD_THRESHOLD) { + GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/spr.c b/interface/spr.c index 1956986e9..8aafc9f85 100644 --- a/interface/spr.c +++ b/interface/spr.c @@ -167,6 +167,26 @@ void CNAME(enum CBLAS_ORDER order, FUNCTION_PROFILE_START(); + if (incx == 1 && n <100) { + blasint i; + if (uplo==0) { + for (i = 0; i < n; i++){ + if (x[i] != ZERO) { + AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0); + } + a += i + 1; + } + } else { + for (i = 0; i < n; i++){ + if (x[i] != ZERO) { + AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0); + } + a += n - i; + } + } + return; + } + if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); diff --git a/interface/spr2.c b/interface/spr2.c index 73a811c3e..b5aab1767 100644 --- a/interface/spr2.c +++ b/interface/spr2.c @@ -168,6 +168,24 @@ void CNAME(enum CBLAS_ORDER order, if (alpha == ZERO) return; + if (incx == 1 && incy == 1 && n < 50) { + blasint i; + if (!uplo) { + for (i = 0; i < n; i++){ + AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0); + a += i + 1; + } + } else { + for (i = 0; i < n; i++){ + AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0); + AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0); + a += n - i; + } + } + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/symv.c b/interface/symv.c index 07bd20022..de2b91ee4 100644 --- a/interface/symv.c +++ b/interface/symv.c @@ -170,6 +170,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, if (alpha == ZERO) return; + if (incx == 1 && incy == 1 && n*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { + (symv[uplo])(n, n, alpha, a, lda, x, incx, y, incy, buffer); + return; + } IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/syr2.c b/interface/syr2.c index 08fd47e57..632906d28 100644 --- a/interface/syr2.c +++ b/interface/syr2.c @@ -170,6 +170,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, IDEBUG_START; + if (incx == 1 && incy == 1 && n < 100) { + blasint i; + if (!uplo) { + for (i = 0; i < n; i++){ + AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0); + a += lda; + } + } else { + for (i = 0; i < n; i++){ + AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0); + AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0); + a += 1 + lda; + } + } + return; + } + + FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; diff --git a/interface/zsyr.c b/interface/zsyr.c index 09b1de578..b68237c93 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -172,6 +172,32 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + if (incx == 1 && incy == 1 && n < 50) { + blasint i; + if (!uplo) { + for (i = 0; i < n; i++){ + if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) { + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1], + alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1], + x, 1, a, 1, NULL, 0); + } + a += lda; + } + } else { + for (i = 0; i < n; i++){ + if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) { + AXPYU_K(m - i, 0, 0, + alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1], + alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1], + x + i * 
2, 1, a, 1, NULL, 0); + } + a += 2 + lda; + } + } + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); From 1217eb910d2da2e8ce47ef62fd3543c6345a3923 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 28 May 2021 09:38:48 +0200 Subject: [PATCH 245/681] Fix copy-paste errors in variables used --- interface/zsyr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/zsyr.c b/interface/zsyr.c index b68237c93..71d4dbf29 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -172,7 +172,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; - if (incx == 1 && incy == 1 && n < 50) { + if (incx == 1 && n < 50) { blasint i; if (!uplo) { for (i = 0; i < n; i++){ @@ -187,7 +187,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO } else { for (i = 0; i < n; i++){ if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) { - AXPYU_K(m - i, 0, 0, + AXPYU_K(n - i, 0, 0, alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1], alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1], x + i * 2, 1, a, 1, NULL, 0); From 734bd265a8b1c80f8fc078ad93fad817bdc9c08e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 29 May 2021 15:40:03 +0200 Subject: [PATCH 246/681] revert symv changes for now --- interface/symv.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/interface/symv.c b/interface/symv.c index de2b91ee4..07bd20022 100644 --- a/interface/symv.c +++ b/interface/symv.c @@ -170,10 +170,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, if (alpha == ZERO) return; - if (incx == 1 && incy == 1 && n*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { - (symv[uplo])(n, n, alpha, a, lda, x, incx, y, incy, buffer); - return; - } IDEBUG_START; FUNCTION_PROFILE_START(); From f84197c1a731889495f282be1d7089deedc83081 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 29 May 2021 22:28:00 +0200 Subject: [PATCH 247/681] Add shortcuts for (small) cases that do not need expensive buffer allocation --- interface/trsv.c | 6 ++++++ interface/ztrsv.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/interface/trsv.c b/interface/trsv.c index a054d8eeb..6a6e8f8ba 100644 --- a/interface/trsv.c +++ b/interface/trsv.c @@ -188,6 +188,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (n == 0) return; + if (incx == 1 && trans == 0 && n < 50) { + buffer = NULL; + (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/ztrsv.c b/interface/ztrsv.c index cbb7bba13..cf750b0b0 100644 --- a/interface/ztrsv.c +++ b/interface/ztrsv.c @@ -199,6 +199,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (n == 0) return; + if (incx == 1 && trans == 0 && n < 50) { + buffer = NULL; + (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); From 8c25b440a05d549f40b6a8af68288cf8aa7869f3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Jun 2021 19:17:36 +0200 Subject: [PATCH 248/681] revert "try to work around gcc update problems" ...as homebrew has dropped at least gcc8 now --- .github/workflows/nightly-Homebrew-build.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index b025f8634..29ec96f73 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ 
b/.github/workflows/nightly-Homebrew-build.yml @@ -43,11 +43,6 @@ jobs: - name: Update Homebrew if: github.event_name != 'pull_request' run: brew update || true - - - name: unlink installed gcc to allow updating - run: | - brew unlink gcc@8 - brew unlink gcc@9 - name: Install prerequisites run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas From 1e0192a5ccac28fc0c749f49d36ec7eda9757428 Mon Sep 17 00:00:00 2001 From: Zhaofeng Li Date: Mon, 7 Jun 2021 22:49:39 +0000 Subject: [PATCH 249/681] riscv64/imin: Fix wrong comparison Same as #1990. --- kernel/riscv64/imin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/riscv64/imin.c b/kernel/riscv64/imin.c index 598cba387..ffc65226e 100644 --- a/kernel/riscv64/imin.c +++ b/kernel/riscv64/imin.c @@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > minf ) + if( x[ix] < minf ) { min = i; minf = x[ix]; From 3521cd48cbfb3d50f6ae9a10377382d37075c696 Mon Sep 17 00:00:00 2001 From: Zhaofeng Li Date: Mon, 7 Jun 2021 22:50:23 +0000 Subject: [PATCH 250/681] RISCV64_GENERIC: Use generic kernel for DSDOT for better precision The implementation in `riscv64/dot.c` fails the `test_dsdot` test, and the generic kernel seems to have better precision. Tested on SiFive FU740 (HiFive Unmatched) and QEMU. Also see #1469. --- kernel/riscv64/KERNEL.RISCV64_GENERIC | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC index ea6a8cf21..61a8a2b91 100644 --- a/kernel/riscv64/KERNEL.RISCV64_GENERIC +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -54,6 +54,7 @@ SDOTKERNEL = ../riscv64/dot.c DDOTKERNEL = ../riscv64/dot.c CDOTKERNEL = ../riscv64/zdot.c ZDOTKERNEL = ../riscv64/zdot.c +DSDOTKERNEL = ../generic/dot.c SNRM2KERNEL = ../riscv64/nrm2.c DNRM2KERNEL = ../riscv64/nrm2.c From 590be3fae35d134fae156c60dc3a21d7933f8914 Mon Sep 17 00:00:00 2001 From: Zhaofeng Li Date: Mon, 7 Jun 2021 22:55:56 +0000 Subject: [PATCH 251/681] riscv64: Add Makefile --- kernel/riscv64/Makefile | 1 + 1 file changed, 1 insertion(+) create mode 100644 kernel/riscv64/Makefile diff --git a/kernel/riscv64/Makefile b/kernel/riscv64/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/riscv64/Makefile @@ -0,0 +1 @@ +clean :: From 706a08d4a01b28bc6445193dbf385260047cd0b6 Mon Sep 17 00:00:00 2001 From: "Ma, Yu" Date: Tue, 8 Jun 2021 15:08:28 -0400 Subject: [PATCH 252/681] Optimized sgemv_t for small N based on AVX512 --- kernel/x86_64/sgemv_t_4.c | 36 +- kernel/x86_64/sgemv_t_microk_skylakex.c | 60 + .../x86_64/sgemv_t_microk_skylakex_template.c | 1120 +++++++++++++++++ 3 files changed, 1215 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sgemv_t_microk_skylakex.c create mode 100644 kernel/x86_64/sgemv_t_microk_skylakex_template.c diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index fe886f57f..a36c8ace9 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -34,8 +34,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) #include "sgemv_t_microk_haswell-4.c" +#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#include "sgemv_t_microk_haswell-4.c" +#include "sgemv_t_microk_skylakex.c" #endif #if defined(STEAMROLLER) || defined(EXCAVATOR) @@ -305,6 +308,37 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( m < 1 ) return(0); if ( n < 1 ) return(0); + #ifdef HAVE_SGEMV_T_SKYLAKE_KERNEL + if (lda == m && n <= 16384 && m <= 8) + { + FLOAT * xbuffer_align = x; + FLOAT * ybuffer_align = y; + + if (inc_x != 1) { + xbuffer_align = buffer; + for(BLASLONG i=0; i= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_SGEMV_T_SKYLAKE_KERNEL 1 +#include "common.h" +#include +#include "sgemv_t_microk_skylakex_template.c" + +//sgemv_t: +// ----- m ----- +// |<----------- +// |<----------- +// n +// |<----------- +// |<----------- + +static int sgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, float *a, float *x, float *y) +{ + switch(m) { + case 1: sgemv_kernel_t_1(n, alpha, a, x, y); break; + case 2: sgemv_kernel_t_2(n, alpha, a, x, y); break; + case 3: sgemv_kernel_t_3(n, alpha, a, x, y); break; + case 4: sgemv_kernel_t_4(n, alpha, a, x, y); break; + case 5: sgemv_kernel_t_5(n, alpha, a, x, y); break; + case 6: sgemv_kernel_t_6(n, alpha, a, x, y); break; + case 7: sgemv_kernel_t_7(n, alpha, a, x, y); break; + case 8: sgemv_kernel_t_8(n, alpha, a, x, y); break; + default: break; + } + return 0; +} + +#endif diff --git a/kernel/x86_64/sgemv_t_microk_skylakex_template.c b/kernel/x86_64/sgemv_t_microk_skylakex_template.c new file mode 100644 index 000000000..34415054c --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_skylakex_template.c @@ -0,0 +1,1120 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#include +#include "common.h" + +//Here the m means n in sgemv_t: +// ----- n ----- +// | +// | +// m +// | +// | +static int sgemv_kernel_t_1(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + //printf("enter into t_1 kernel\n"); + //printf("m = %ld\n", m); + __m512 matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + float alphaX = alpha * (*x); + __m512 ALPHAXVECTOR = _mm512_set1_ps(alphaX); + + BLASLONG tag_m_128x = m & (~127); + BLASLONG tag_m_64x = m & (~63); + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + + for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) { + matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_m + 48]); + matrixArray_4 = _mm512_loadu_ps(&a[idx_m + 64]); + matrixArray_5 = _mm512_loadu_ps(&a[idx_m + 80]); + matrixArray_6 = _mm512_loadu_ps(&a[idx_m + 96]); + matrixArray_7 = _mm512_loadu_ps(&a[idx_m + 112]); + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(matrixArray_1, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(matrixArray_2, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(matrixArray_3, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + _mm512_storeu_ps(&y[idx_m + 64], _mm512_fmadd_ps(matrixArray_4, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 64]))); + _mm512_storeu_ps(&y[idx_m + 80], _mm512_fmadd_ps(matrixArray_5, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 80]))); + _mm512_storeu_ps(&y[idx_m + 96], _mm512_fmadd_ps(matrixArray_6, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 96]))); + _mm512_storeu_ps(&y[idx_m + 112], _mm512_fmadd_ps(matrixArray_7, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 112]))); + + } + + if (tag_m_128x != m) { + for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_64x; idx_m+=64) { + matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_m + 48]); + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(matrixArray_1, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(matrixArray_2, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(matrixArray_3, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + + } + + if (tag_m_64x != m) { + for (BLASLONG idx_m = tag_m_64x; idx_m < tag_m_32x; idx_m+=32) { + matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_m + 16]); + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(matrixArray_1, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + + } + + if (tag_m_32x != m) { + for (BLASLONG idx_m = tag_m_64x; idx_m < tag_m_16x; idx_m+=32) { + matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + } + 
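+        /* Tail handling for the remaining m % 16 rows: build a lane
+           mask with the low (m & 15) bits set and use a masked load,
+           fused multiply-add and masked store, so the remainder is
+           processed without touching memory past the end of a or y. */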
+ if (tag_m_16x != m) { + unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_ps(tail_mask, &a[tag_m_16x]); + + _mm512_mask_storeu_ps(&y[tag_m_16x], tail_mask, _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_maskz_loadu_ps(tail_mask, &y[tag_m_16x]))); + + } + + + } + } + } + + return 0; +} + +static int sgemv_kernel_t_2(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + __m512 m0, m1, m2, m3, col0_1, col0_2, col1_1, col1_2, x1Array, x2Array; + float x1a = x[0] * alpha; + float x2a = x[1] * alpha; + x1Array = _mm512_set1_ps(x1a); + x2Array = _mm512_set1_ps(x2a); + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i idx_base_0 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_1); + + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + m0 = _mm512_loadu_ps(&a[idx_m*2]); + m1 = _mm512_loadu_ps(&a[idx_m*2 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*2 + 32]); + m3 = _mm512_loadu_ps(&a[idx_m*2 + 48]); + col0_1 = _mm512_permutex2var_ps(m0, idx_base_0, m1); + col0_2 = _mm512_permutex2var_ps(m0, idx_base_1, m1); + col1_1 = _mm512_permutex2var_ps(m2, idx_base_0, m3); + col1_2 = _mm512_permutex2var_ps(m2, idx_base_1, m3); + + _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col0_2, _mm512_mul_ps(col0_1, x1Array)), _mm512_loadu_ps(&y[idx_m]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col1_2, _mm512_mul_ps(col1_1, x1Array)), _mm512_loadu_ps(&y[idx_m + 16]))); + } + if (tag_m_32x != m) { + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m]); + m1 = _mm512_loadu_ps(&a[idx_m + 16]); + col1_1 = _mm512_permutex2var_ps(m0, idx_base_0, m1); + col1_2 = _mm512_permutex2var_ps(m0, idx_base_1, m1); + _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col1_2, _mm512_mul_ps(col1_1, x1Array)), _mm512_loadu_ps(&y[idx_m]))); + } + if (tag_m_16x != m) { + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); + unsigned char load_mask_value = (((unsigned char)0xff) >> 6); + __mmask8 load_mask = *((__mmask8*) &load_mask_value); + x1Array = _mm512_broadcast_f32x2(_mm_maskz_loadu_ps(load_mask, x)); + for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + m0 = _mm512_loadu_ps(&a[idx_m]); + m1 = _mm512_mul_ps(_mm512_mul_ps(m0, x1Array), ALPHAVECTOR); + m2 = _mm512_permutexvar_ps(_mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), m1); + __m256 ret = _mm256_add_ps(_mm512_extractf32x8_ps(m2, 1), _mm512_extractf32x8_ps(m2, 0)); + _mm256_storeu_ps(&y[idx_m], _mm256_add_ps(ret, _mm256_loadu_ps(&y[idx_m]))); + + } + + if (tag_m_8x != m) { + unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-((m-tag_m_8x)*2)&15)); + __mmask16 a_mask = *((__mmask16*) &tail_mask_value); + unsigned char y_mask_value = (((unsigned char)0xff) >> (8-(m-tag_m_8x))); + __mmask8 y_mask = *((__mmask8*) &y_mask_value); + + m0 = _mm512_maskz_loadu_ps(a_mask, &a[tag_m_8x]); + m1 = _mm512_mul_ps(_mm512_mul_ps(m0, x1Array), ALPHAVECTOR); + m2 = _mm512_permutexvar_ps(_mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), m1); + __m256 ret = _mm256_add_ps(_mm512_extractf32x8_ps(m2, 1), _mm512_extractf32x8_ps(m2, 0)); + _mm256_mask_storeu_ps(&y[tag_m_8x], y_mask, _mm256_add_ps(ret, 
_mm256_maskz_loadu_ps(y_mask, &y[tag_m_8x]))); + } + } + } + return 0; +} + +static int sgemv_kernel_t_3(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + __m512 m0, m1, m2, c1, c2, c3, tmp, x1Array, x2Array, x3Array; + float x1a = x[0] * alpha; + float x2a = x[1] * alpha; + float x3a = x[2] * alpha; + x1Array = _mm512_set1_ps(x1a); + x2Array = _mm512_set1_ps(x2a); + x3Array = _mm512_set1_ps(x3a); + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i M512_EPI32_s1 = _mm512_set1_epi32(-1); + __m512i idx_c1_1 = _mm512_set_epi32(0, 0, 0, 0, 0, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i idx_c2_1 = _mm512_add_epi32(idx_c1_1, M512_EPI32_1); + __m512i idx_c3_1 = _mm512_add_epi32(idx_c2_1, M512_EPI32_1); + + __m512i idx_c3_2 = _mm512_set_epi32(31, 28, 25, 22, 19, 16, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __m512i idx_c2_2 = _mm512_add_epi32(idx_c3_2, M512_EPI32_s1); + __m512i idx_c1_2 = _mm512_add_epi32(idx_c2_2, M512_EPI32_s1); + + __mmask16 step_1 = 0x07ff; + __mmask16 step_2 = 0xf800; + __mmask16 c31 = 0x03ff; + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*3]); + m1 = _mm512_loadu_ps(&a[idx_m*3 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*3 + 32]); + + tmp = _mm512_mask_permutex2var_ps(m0, step_1, idx_c1_1, m1); + c1 = _mm512_mask_permutex2var_ps(tmp, step_2, idx_c1_2, m2); + tmp = _mm512_mask_permutex2var_ps(m0, step_1, idx_c2_1, m1); + c2 = _mm512_mask_permutex2var_ps(tmp, step_2, idx_c2_2, m2); + tmp = _mm512_mask_permutex2var_ps(m0, c31, idx_c3_1, m1); + c3 = _mm512_permutex2var_ps(tmp, idx_c3_2, m2); + + tmp = _mm512_fmadd_ps(x2Array, c2, _mm512_mul_ps(c1, x1Array)); + _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x3Array, c3, tmp), _mm512_loadu_ps(&y[idx_m]))); + } + + if(tag_m_16x != m) { + __mmask8 a_mask = 0xff; + __m256i M256_EPI32_1 = _mm256_maskz_set1_epi32(a_mask, 1); + __m256i M256_EPI32_s1 = _mm256_maskz_set1_epi32(a_mask, -1); + __m256i idx_c1_1 = _mm256_set_epi32(0, 0, 15, 12, 9, 6, 3, 0); + __m256i idx_c2_1 = _mm256_add_epi32(idx_c1_1, M256_EPI32_1); + __m256i idx_c3_1 = _mm256_add_epi32(idx_c2_1, M256_EPI32_1); + + __m256i idx_c3_2 = _mm256_set_epi32(15, 12, 9, 0, 0, 0, 0, 0); + __m256i idx_c2_2 = _mm256_add_epi32(idx_c3_2, M256_EPI32_s1); + __m256i idx_c1_2 = _mm256_add_epi32(idx_c2_2, M256_EPI32_s1); + + __mmask8 step_1 = 0x1f; + __mmask8 step_2 = 0xe0; + __mmask8 c12 = 0xc0; + + __m256 m256_0, m256_1, m256_2, tmp256, c256_1, c256_2, c256_3, x256_1, x256_2, x256_3; + x256_1 = _mm256_set1_ps(x1a); + x256_2 = _mm256_set1_ps(x2a); + x256_3 = _mm256_set1_ps(x3a); + + for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + m256_0 = _mm256_loadu_ps(&a[idx_m*3]); + m256_1 = _mm256_loadu_ps(&a[idx_m*3 + 8]); + m256_2 = _mm256_loadu_ps(&a[idx_m*3 + 16]); + + tmp256 = _mm256_permutex2var_ps(m256_0, idx_c1_1, m256_1); + c256_1 = _mm256_mask_permutex2var_ps(tmp256, c12, idx_c1_2, m256_2); + tmp256 = _mm256_mask_permutex2var_ps(m256_0, step_1, idx_c2_1, m256_1); + c256_2 = _mm256_mask_permutex2var_ps(tmp256, step_2, idx_c2_2, m256_2); + tmp256 = _mm256_mask_permutex2var_ps(m256_0, step_1, idx_c3_1, m256_1); + c256_3 = _mm256_mask_permutex2var_ps(tmp256, step_2, idx_c3_2, m256_2); + + tmp256 = _mm256_fmadd_ps(x256_2, c256_2, _mm256_mul_ps(c256_1, x256_1)); + _mm256_storeu_ps(&y[idx_m], _mm256_maskz_add_ps(a_mask, _mm256_fmadd_ps(x256_3, c256_3, tmp256), _mm256_loadu_ps(&y[idx_m]))); 
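+            /* The permutex2var gathers above de-interleave this 8x3
+               row-major tile (m256_0/1/2) into per-column vectors
+               c256_1..c256_3, so the fmadd chain accumulates
+               y[i] += a[3*i]*x1a + a[3*i+1]*x2a + a[3*i+2]*x3a
+               for eight rows at once (alpha is pre-folded into
+               x1a, x2a and x3a). */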
+ } + + if(tag_m_8x != m){ + for (BLASLONG idx_m = tag_m_8x; idx_m < tag_m_4x; idx_m+=4){ + m0 = _mm512_maskz_loadu_ps(0x0fff, &a[tag_m_8x*3]); + m256_0 = _mm512_extractf32x8_ps(m0, 0); + m256_1 = _mm512_extractf32x8_ps(m0, 1); + __m256i idx1 = _mm256_set_epi32(10, 7, 4, 1, 9, 6, 3, 0); + __m256i M256_EPI32_2 = _mm256_maskz_set1_epi32(0x0f, 2); + __m256i idx2 = _mm256_add_epi32(idx1, M256_EPI32_2); + + c256_1 = _mm256_mask_permutex2var_ps(m256_0, 0xff, idx1, m256_1); + c256_2 = _mm256_mask_permutex2var_ps(m256_0, 0x0f, idx2, m256_1); + + __m128 c128_1 = _mm256_extractf32x4_ps(c256_1, 0); + __m128 c128_2 = _mm256_extractf32x4_ps(c256_1, 1); + __m128 c128_3 = _mm256_extractf32x4_ps(c256_2, 0); + + __m128 x128_1 = _mm_set1_ps(x1a); + __m128 x128_2 = _mm_set1_ps(x2a); + __m128 x128_3 = _mm_set1_ps(x3a); + + __m128 tmp128 = _mm_maskz_fmadd_ps(0x0f, c128_1, x128_1, _mm_maskz_mul_ps(0x0f, c128_2, x128_2)); + _mm_mask_storeu_ps(&y[idx_m], 0x0f, _mm_maskz_add_ps(0x0f, _mm_maskz_fmadd_ps(0x0f, c128_3, x128_3, tmp128), _mm_maskz_loadu_ps(0x0f, &y[idx_m]))); + } + + if(tag_m_4x != m) { + for (BLASLONG idx_m = tag_m_4x; idx_m < tag_m_2x; idx_m+=2) { + m256_0 = _mm256_maskz_loadu_ps(0x3f, &a[idx_m*3]); + __m128 a128_1 = _mm256_extractf32x4_ps(m256_0, 0); + __m128 a128_2 = _mm256_extractf32x4_ps(m256_0, 1); + __m128 x128 = _mm_maskz_loadu_ps(0x07, x); + + __m128i idx128_1= _mm_set_epi32(0, 2, 1, 0); + __m128i M128_EPI32_3 = _mm_maskz_set1_epi32(0x07, 3); + __m128i idx128_2 = _mm_add_epi32(idx128_1, M128_EPI32_3); + + __m128 c128_1 = _mm_maskz_permutex2var_ps(0x07, a128_1, idx128_1, a128_2); + __m128 c128_2 = _mm_maskz_permutex2var_ps(0x07, a128_1, idx128_2, a128_2); + + __m128 tmp128 = _mm_hadd_ps(_mm_maskz_mul_ps(0x07, c128_1, x128), _mm_maskz_mul_ps(0x07, c128_2, x128)); + float ret[4]; + _mm_mask_storeu_ps(ret, 0x0f, tmp128); + y[idx_m] += alpha *(ret[0] + ret[1]); + y[idx_m+1] += alpha * (ret[2] + ret[3]); + } + + if(tag_m_2x != m) { + y[tag_m_2x] += alpha*(a[tag_m_2x*3]*x[0] + a[tag_m_2x*3+1]*x[1] + a[tag_m_2x*3+2]*x[2]); + } + } + } + } + + return 0; +} + +static int sgemv_kernel_t_4(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + __m512 m0, m1, m2; + __m256 m256_0, m256_1, c256_1, c256_2; + __m128 c1, c2, c3, c4, ret; + __m128 xarray = _mm_maskz_loadu_ps(0x0f, x); + __m512 x512 = _mm512_broadcast_f32x4(xarray); + __m512 alphavector = _mm512_set1_ps(alpha); + __m512 xa512 = _mm512_mul_ps(x512, alphavector); + __m256i idx1 = _mm256_set_epi32(13, 9, 5, 1, 12, 8, 4, 0); + __m256i idx2 = _mm256_set_epi32(15, 11, 7, 3, 14, 10, 6, 2); + + + for (BLASLONG idx_m = 0; idx_m < tag_m_4x; idx_m+=4) { + m0 = _mm512_loadu_ps(&a[idx_m*4]); + m1 = _mm512_mul_ps(m0, xa512); + m256_0 = _mm512_extractf32x8_ps(m1, 0); + m256_1 = _mm512_extractf32x8_ps(m1, 1); + c256_1 = _mm256_mask_permutex2var_ps(m256_0, 0xff, idx1, m256_1); + c256_2 = _mm256_mask_permutex2var_ps(m256_0, 0xff, idx2, m256_1); + + c1 = _mm256_extractf32x4_ps(c256_1, 0); + c2 = _mm256_extractf32x4_ps(c256_1, 1); + c3 = _mm256_extractf32x4_ps(c256_2, 0); + c4 = _mm256_extractf32x4_ps(c256_2, 1); + + ret = _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, c1, c2), _mm_maskz_add_ps(0xff, c3, c4)), _mm_maskz_loadu_ps(0xff, y)); + _mm_mask_storeu_ps(&y[idx_m], 0xff, ret); + } + + if(tag_m_4x != m) { + float result[4]; + for(BLASLONG idx_m=tag_m_4x; idx_m < tag_m_2x; idx_m+=2) { + m256_0 = _mm256_maskz_loadu_ps(0xff, &a[idx_m*4]); + c1 = 
_mm256_maskz_extractf32x4_ps(0xff, m256_0, 0); + c2 = _mm256_maskz_extractf32x4_ps(0xff, m256_0, 1); + + c3 = _mm_maskz_mul_ps(0x0f, c1, xarray); + c4 = _mm_maskz_mul_ps(0x0f, c2, xarray); + + ret = _mm_hadd_ps(c3, c4); + _mm_mask_storeu_ps(result, 0x0f, ret); + y[idx_m] += alpha *(result[0] + result[1]); + y[idx_m+1] += alpha * (result[2] + result[3]); + } + + if(tag_m_2x != m ) { + c1 = _mm_maskz_loadu_ps(0x0f, &a[tag_m_2x * 4]); + c2 = _mm_maskz_mul_ps(0x0f, c1, xarray); + _mm_mask_storeu_ps(result, 0x0f, c2); + y[tag_m_2x] += alpha *(result[0] + result[1] + result[2] + result[3]); + } + } + + return 0; +} + +static int sgemv_kernel_t_5(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + __m512 m0, m1, m2, m3, m4, tmp0, tmp1, tmp2, accum, c0, c1, c2, c3, c4; + __m512 x0_512 = _mm512_set1_ps(x[0]); + __m512 x1_512 = _mm512_set1_ps(x[1]); + __m512 x2_512 = _mm512_set1_ps(x[2]); + __m512 x3_512 = _mm512_set1_ps(x[3]); + __m512 x4_512 = _mm512_set1_ps(x[4]); + __m512 alpha_512 = _mm512_set1_ps(alpha); + + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i M512_EPI32_16 = _mm512_set1_epi32(16); + __m512i M512_EPI32_0 = _mm512_setzero_epi32(); + + __m512i idx_c0 = _mm512_set_epi32(27, 22, 17, 28, 23, 18, 13, 8, 3, 30, 25, 20, 15, 10, 5, 0); + __m512i idx_c1 = _mm512_add_epi32(idx_c0, M512_EPI32_1); + __m512i idx_c2 = _mm512_add_epi32(idx_c1, M512_EPI32_1); + idx_c2 = _mm512_mask_blend_epi32(0x0040, idx_c2, M512_EPI32_0); + __m512i idx_c3 = _mm512_add_epi32(idx_c2, M512_EPI32_1); + __m512i idx_c4 = _mm512_add_epi32(idx_c3, M512_EPI32_1); + idx_c4 = _mm512_mask_blend_epi32(0x1000, idx_c4, M512_EPI32_16); + + for (BLASLONG idx_m=0; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*5]); + m1 = _mm512_loadu_ps(&a[idx_m*5 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*5 + 32]); + m3 = _mm512_loadu_ps(&a[idx_m*5 + 48]); + m4 = _mm512_loadu_ps(&a[idx_m*5 + 64]); + + tmp0 = _mm512_maskz_permutex2var_ps(0x007f, m0, idx_c0, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1f80, m2, idx_c0, m3); + c0 = _mm512_mask_blend_ps(0x1f80, tmp0, tmp1); + c0 = _mm512_mask_permutex2var_ps(c0, 0xe000, idx_c0, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x007f, m0, idx_c1, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1f80, m2, idx_c1, m3); + c1 = _mm512_mask_blend_ps(0x1f80, tmp0, tmp1); + c1 = _mm512_mask_permutex2var_ps(c1, 0xe000, idx_c1, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c2, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1fc0, m2, idx_c2, m3); + c2 = _mm512_mask_blend_ps(0x1fc0, tmp0, tmp1); + c2 = _mm512_mask_permutex2var_ps(c2, 0xe000, idx_c2, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c3, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1fc0, m2, idx_c3, m3); + c3 = _mm512_mask_blend_ps(0x1fc0, tmp0, tmp1); + c3 = _mm512_mask_permutex2var_ps(c3, 0xe000, idx_c3, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c4, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x0fc0, m2, idx_c4, m3); + c4 = _mm512_mask_blend_ps(0x0fc0, tmp0, tmp1); + c4 = _mm512_mask_permutex2var_ps(c4, 0xf000, idx_c4, m4); + + accum = _mm512_fmadd_ps(c1, x1_512, _mm512_mul_ps(c0, x0_512)); + accum = _mm512_fmadd_ps(c2, x2_512, accum); + accum = _mm512_fmadd_ps(c3, x3_512, accum); + accum = _mm512_fmadd_ps(c4, x4_512, accum); + accum = _mm512_fmadd_ps(accum, alpha_512, _mm512_loadu_ps(&y[idx_m])); + _mm512_storeu_ps(&y[idx_m], accum); + + } + 
if(tag_m_16x !=m) { + __m512i idx_c0c2 = _mm512_set_epi32(0, 0, 27, 22, 17, 12, 7, 2 , 0, 30, 25, 20, 15, 10, 5, 0); + __m512i idx_c1c3 = _mm512_add_epi32(idx_c0c2, M512_EPI32_1); + idx_c4 = _mm512_add_epi32(idx_c1c3, M512_EPI32_1); + __m256i idx_c0m4 = _mm256_set_epi32(11, 6, 0, 0, 0, 0, 0, 0); + __m256i M256_EPI32_1 = _mm256_set1_epi32(1); + __m256i idx_c1m4 = _mm256_add_epi32(idx_c0m4, M256_EPI32_1); + __m256i idx_c2m4 = _mm256_add_epi32(idx_c1m4, M256_EPI32_1); + __m256i idx_c3m4 = _mm256_add_epi32(idx_c2m4, M256_EPI32_1); + __m256i idx_c4m4 = _mm256_add_epi32(idx_c3m4, M256_EPI32_1); + //TODO: below can change to use extract to decrease the latency + __m256 x0_256 = _mm256_set1_ps(x[0]); + __m256 x1_256 = _mm256_set1_ps(x[1]); + __m256 x2_256 = _mm256_set1_ps(x[2]); + __m256 x3_256 = _mm256_set1_ps(x[3]); + __m256 x4_256 = _mm256_set1_ps(x[4]); + __m256 alpha256 = _mm256_set1_ps(alpha); + __m256 accum_256, m256_4; + + for(BLASLONG idx_m=tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + m0 = _mm512_loadu_ps(&a[idx_m*5]); + m1 = _mm512_loadu_ps(&a[idx_m*5 + 16]); + m256_4 = _mm256_loadu_ps(&a[idx_m*5 + 32]); + tmp0 = _mm512_permutex2var_ps(m0, idx_c0c2, m1); + tmp1 = _mm512_permutex2var_ps(m0, idx_c1c3, m1); + tmp2 = _mm512_permutex2var_ps(m0, idx_c4, m1); + + __m256 c256_0 = _mm512_extractf32x8_ps(tmp0, 0); + __m256 c256_2 = _mm512_extractf32x8_ps(tmp0, 1); + __m256 c256_1 = _mm512_extractf32x8_ps(tmp1, 0); + __m256 c256_3 = _mm512_extractf32x8_ps(tmp1, 1); + __m256 c256_4 = _mm512_extractf32x8_ps(tmp2, 1); + + c256_0 = _mm256_mask_permutex2var_ps(c256_0, 0x80, idx_c0m4, m256_4); + c256_1 = _mm256_mask_permutex2var_ps(c256_1, 0x80, idx_c1m4, m256_4); + c256_2 = _mm256_mask_permutex2var_ps(c256_2, 0xc0, idx_c2m4, m256_4); + c256_3 = _mm256_mask_permutex2var_ps(c256_3, 0xc0, idx_c3m4, m256_4); + c256_4 = _mm256_mask_permutex2var_ps(c256_4, 0xc0, idx_c4m4, m256_4); + + accum_256 = _mm256_fmadd_ps(c256_1, x1_256, _mm256_mul_ps(c256_0, x0_256)); + accum_256 = _mm256_fmadd_ps(c256_2, x2_256, accum_256); + accum_256 = _mm256_fmadd_ps(c256_3, x3_256, accum_256); + accum_256 = _mm256_fmadd_ps(c256_4, x4_256, accum_256); + accum_256 = _mm256_fmadd_ps(accum_256, alpha256, _mm256_loadu_ps(&y[idx_m])); + _mm256_storeu_ps(&y[idx_m], accum_256); + } + if(tag_m_8x != m) { + __m256i idx_c02 = _mm256_set_epi32(17, 12, 7, 2, 15, 10, 5, 0); + __m256i idx_c13 = _mm256_add_epi32(idx_c02, M256_EPI32_1); + __m256i idx_4 = _mm256_add_epi32(idx_c13, M256_EPI32_1); + __m128 accum_128; + __m256 m256_0, m256_1, tmp256_0, tmp256_1; + for (BLASLONG idx_m = tag_m_8x; idx_m < tag_m_4x; idx_m+=4){ + m256_0 = _mm256_loadu_ps(&a[idx_m*5]); + m256_1 = _mm256_loadu_ps(&a[idx_m*5 + 8]); + __m128 m128_4 = _mm_maskz_loadu_ps(0x0f, &a[idx_m*5 + 16]); + + tmp256_0 = _mm256_permutex2var_ps(m256_0, idx_c02, m256_1); + tmp256_1 = _mm256_permutex2var_ps(m256_0, idx_c13, m256_1); + __m256 tmp256_2 = _mm256_maskz_permutex2var_ps(0xf0, m256_0, idx_4, m256_1); + + __m128 c128_0 = _mm256_extractf32x4_ps(tmp256_0, 0); + __m128 c128_1 = _mm256_extractf32x4_ps(tmp256_1, 0); + __m128 c128_2 = _mm256_extractf32x4_ps(tmp256_0, 1); + __m128 c128_3 = _mm256_extractf32x4_ps(tmp256_1, 1); + __m128 c128_4 = _mm256_extractf32x4_ps(tmp256_2, 1); + + __m128i idx_c14 = _mm_set_epi32(4, 0, 0, 0); + __m128i M128_EPI32_1 = _mm_set1_epi32(1); + __m128i idx_c24 = _mm_add_epi32(idx_c14, M128_EPI32_1); + __m128i idx_c34 = _mm_add_epi32(idx_c24, M128_EPI32_1); + __m128i idx_c44 = _mm_add_epi32(idx_c34, M128_EPI32_1); + + c128_1 = _mm_mask_permutex2var_ps(c128_1, 
0x08, idx_c14, m128_4); + c128_2 = _mm_mask_permutex2var_ps(c128_2, 0x08, idx_c24, m128_4); + c128_3 = _mm_mask_permutex2var_ps(c128_3, 0x08, idx_c34, m128_4); + c128_4 = _mm_mask_permutex2var_ps(c128_4, 0x08, idx_c44, m128_4); + + __m128 x128_0 = _mm256_extractf32x4_ps(x0_256, 0); + __m128 x128_1 = _mm256_extractf32x4_ps(x1_256, 0); + __m128 x128_2 = _mm256_extractf32x4_ps(x2_256, 0); + __m128 x128_3 = _mm256_extractf32x4_ps(x3_256, 0); + __m128 x128_4 = _mm256_extractf32x4_ps(x4_256, 0); + + __m128 alpha_128 = _mm256_extractf32x4_ps(alpha256, 0); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_1, x128_1, _mm_maskz_mul_ps(0x0f, c128_0, x128_0)); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_2, x128_2, accum_128); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_3, x128_3, accum_128); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_4, x128_4, accum_128); + accum_128 = _mm_maskz_fmadd_ps(0x0f, accum_128, alpha_128, _mm_maskz_loadu_ps(0x0f, &y[idx_m])); + _mm_mask_storeu_ps(&y[idx_m], 0x0f, accum_128); + + } + + if(tag_m_4x !=m ){ + x0_256 = _mm256_maskz_loadu_ps(0x1f, x); + x0_256 = _mm256_mul_ps(x0_256, alpha256); + float ret8[8]; + + for(BLASLONG idx_m = tag_m_4x; idx_m < tag_m_2x; idx_m+=2){ + m256_0 = _mm256_maskz_loadu_ps(0x1f, &a[idx_m*5]); + m256_1 = _mm256_maskz_loadu_ps(0x1f, &a[idx_m*5 + 5]); + + m256_0 = _mm256_mul_ps(m256_0, x0_256); + m256_1 = _mm256_mul_ps(m256_1, x0_256); + + _mm256_mask_storeu_ps(ret8, 0x1f, m256_0); + y[idx_m] += ret8[0] + ret8[1] + ret8[2] + ret8[3] + ret8[4]; + _mm256_mask_storeu_ps(ret8, 0x1f, m256_1); + y[idx_m+1] += ret8[0] + ret8[1] + ret8[2] + ret8[3] + ret8[4]; + + } + + if(tag_m_2x != m){ + m256_0 = _mm256_maskz_loadu_ps(0x1f, &a[tag_m_2x*5]); + m256_0 = _mm256_mul_ps(m256_0, x0_256); + + + _mm256_mask_storeu_ps(ret8, 0x1f, m256_0); + y[tag_m_2x] += ret8[0] + ret8[1] + ret8[2] + ret8[3] + ret8[4]; + + } + } + } + + } + return 0; +} + +static int sgemv_kernel_t_6(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + + __m512 m0, m1, m2, m3, m4, m5, c0, c1, c2, c3, c4, c5, tmp0, tmp1, tmp2, accum; + __m512i idx_c0 = _mm512_set_epi32(26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0); + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i M512_EPI32_0 = _mm512_setzero_epi32(); + __m512i M512_EPI32_16 = _mm512_set1_epi32(16); + __m512i idx_c1 = _mm512_add_epi32(idx_c0, M512_EPI32_1); + __m512i idx_c2 = _mm512_add_epi32(idx_c1, M512_EPI32_1); + idx_c2 = _mm512_mask_blend_epi32(0x0020, idx_c2, M512_EPI32_0); + __m512i idx_c3 = _mm512_add_epi32(idx_c2, M512_EPI32_1); + __m512i idx_c4 = _mm512_add_epi32(idx_c3, M512_EPI32_1); + idx_c4 = _mm512_mask_blend_epi32(0x0400, idx_c4, M512_EPI32_0); + __m512i idx_c5 = _mm512_add_epi32(idx_c4, M512_EPI32_1); + + __m512 x0_512 = _mm512_set1_ps(x[0]); + __m512 x1_512 = _mm512_set1_ps(x[1]); + __m512 x2_512 = _mm512_set1_ps(x[2]); + __m512 x3_512 = _mm512_set1_ps(x[3]); + __m512 x4_512 = _mm512_set1_ps(x[4]); + __m512 x5_512 = _mm512_set1_ps(x[5]); + __m512 alpha_512 = _mm512_set1_ps(alpha); + + for (BLASLONG idx_m=0; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*6]); + m1 = _mm512_loadu_ps(&a[idx_m*6 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*6 + 32]); + m3 = _mm512_loadu_ps(&a[idx_m*6 + 48]); + m4 = _mm512_loadu_ps(&a[idx_m*6 + 64]); + m5 = _mm512_loadu_ps(&a[idx_m*6 + 80]); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c0, m1); + tmp1 = 
_mm512_maskz_permutex2var_ps(0x07c0, m2, idx_c0, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c0, m5); + c0 = _mm512_mask_blend_ps(0x07c0, tmp0, tmp1); + c0 = _mm512_mask_blend_ps(0xf800, c0, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c1, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07c0, m2, idx_c1, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c1, m5); + c1 = _mm512_mask_blend_ps(0x07c0, tmp0, tmp1); + c1 = _mm512_mask_blend_ps(0xf800, c1, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c2, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07e0, m2, idx_c2, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c2, m5); + c2 = _mm512_mask_blend_ps(0x07e0, tmp0, tmp1); + c2 = _mm512_mask_blend_ps(0xf800, c2, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c3, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07e0, m2, idx_c3, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c3, m5); + c3 = _mm512_mask_blend_ps(0x07e0, tmp0, tmp1); + c3 = _mm512_mask_blend_ps(0xf800, c3, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c4, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x03e0, m2, idx_c4, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xfc00, m4, idx_c4, m5); + c4 = _mm512_mask_blend_ps(0x03e0, tmp0, tmp1); + c4 = _mm512_mask_blend_ps(0xfc00, c4, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c5 , m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x03e0, m2, idx_c5 , m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xfc00, m4, idx_c5 , m5); + c5 = _mm512_mask_blend_ps(0x03e0, tmp0, tmp1); + c5 = _mm512_mask_blend_ps(0xfc00, c5, tmp2); + + accum = _mm512_fmadd_ps(c1, x1_512, _mm512_mul_ps(c0, x0_512)); + accum = _mm512_fmadd_ps(c2, x2_512, accum); + accum = _mm512_fmadd_ps(c3, x3_512, accum); + accum = _mm512_fmadd_ps(c4, x4_512, accum); + accum = _mm512_fmadd_ps(c5, x5_512, accum); + accum = _mm512_fmadd_ps(accum, alpha_512, _mm512_loadu_ps(&y[idx_m])); + _mm512_storeu_ps(&y[idx_m], accum); + } + + if(tag_m_16x != m) { + __m512i idx_c0c3 = _mm512_set_epi32(29, 23, 17, 27, 21, 15, 9, 3, 26, 20, 30, 24, 18, 12, 6, 0); + __m512i idx_c1c4 = _mm512_add_epi32(idx_c0c3, M512_EPI32_1); + __m512i idx_c2c5 = _mm512_add_epi32(idx_c1c4, M512_EPI32_1); + idx_c2c5 = _mm512_mask_blend_epi32(0x0020, idx_c2c5, M512_EPI32_16); + __m256 c256_0, c256_1, c256_2, c256_3, c256_4, c256_5; + + __m256 x0_256 = _mm256_set1_ps(x[0]); + __m256 x1_256 = _mm256_set1_ps(x[1]); + __m256 x2_256 = _mm256_set1_ps(x[2]); + __m256 x3_256 = _mm256_set1_ps(x[3]); + __m256 x4_256 = _mm256_set1_ps(x[4]); + __m256 x5_256 = _mm256_set1_ps(x[5]); + __m256 alpha256 = _mm256_set1_ps(alpha); + __m256 accum_256; + + for(BLASLONG idx_m = tag_m_16x; idx_m Date: Wed, 9 Jun 2021 12:20:09 -0500 Subject: [PATCH 253/681] POWER10: Fixes for sbgemm kernel While testing bfloat16 sbgemm kernel, there are some failures for odd value inputs due to array access beyond the boundary. 
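In short, the out-of-bounds accesses came from the odd-k remainder paths: the 4x2 bfloat16 operand pairs for xvbf16ger2pp were built by interleaving two 16-byte vectors, and the second vector (rowB[1] or rowA[1]) lies past the end of the packed buffer when only one vector's worth of data is left. The diff below merges against the zero vector, or assembles the pairs from scalar loads, instead. This is numerically equivalent: vec_mergeh over 16-bit lanes yields pairs of the form {b0, 0, b1, 0, ...}, and the zero halves contribute nothing to the pairwise sums of the MMA accumulate. A condensed sketch of the change (vec_t and vzero as defined in the file):

    vec_t *rowB = (vec_t *) &BO[l];
    /* before: rowB[1] may lie beyond the packed buffer
       vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]);           */
    /* after: pad the bf16 pairs with zeros, loads stay in bounds */
    vec_t rowB_h = MERGE_HIGH (rowB[0], vzero);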
--- kernel/power/sbgemm_kernel_power10.c | 136 ++++++++++++++------------- 1 file changed, 71 insertions(+), 65 deletions(-) diff --git a/kernel/power/sbgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c index d15586703..74f3eac4c 100644 --- a/kernel/power/sbgemm_kernel_power10.c +++ b/kernel/power/sbgemm_kernel_power10.c @@ -49,17 +49,11 @@ typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); -vector char mask = - { 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xa, 0xb, 0x4, 0x5, 0xc, 0xd, 0x6, 0x7, 0xe, - 0xf -}; - /* * BFLOAT16 xvbf16ger2pp instruction needs 4×2 matrix of * bfloat16 floating-point values as input. Hence this * merging is needed on A and B matrices. */ -#define MERGE_ROW(x) vec_perm(x, x, mask) #define MERGE_HIGH(x, y) (vec_t) vec_mergeh ((vector short)x, (vector short)y) #define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) @@ -179,8 +173,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 4; vec_t *rowA = (vec_t *) & (AO[l << 1]); vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); - vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowB_h = MERGE_HIGH (rowB[0], vzero); + vec_t rowB_l = MERGE_LOW (rowB[0], vzero); vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); vec_t rowA_l = MERGE_LOW (rowA[0], vzero); vec_t rowA2_h = MERGE_HIGH (rowA[1], vzero); @@ -231,8 +225,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 4; vec_t *rowA = (vec_t *) & (AO[l]); vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); - vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowB_h = MERGE_HIGH (rowB[0], vzero); + vec_t rowB_l = MERGE_LOW (rowB[0], vzero); vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); vec_t rowA_l = MERGE_LOW (rowA[0], vzero); MMA (&acc0, rowB_h, rowA_h); @@ -271,8 +265,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; vec_t *rowB = (vec_t *) & (BO[l << 1]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } SAVE_ACC (&acc0, 0); SAVE_ACC1 (&acc1, 0); @@ -306,8 +300,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 2; vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowB = (vec_t *) & (BO[(l << 2)]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } SAVE4x2_ACC (&acc0, 0); SAVE4x2_ACC1 (&acc1, 0); @@ -338,8 +332,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 1; vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 }; vec_t *rowB = (vec_t *) & (BO[(l << 3)]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } SAVE4x2_ACC (&acc0, 0); SAVE4x2_ACC1 (&acc1, 0); @@ -387,16 +381,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 3; vec_t *rowA = (vec_t *) & 
(AO[(l << 2)]); vec_t *rowA1 = (vec_t *) & (A1[(l << 2)]); - vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); - MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); - MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); - MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); - MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], vzero)); - MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); - MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], vzero)); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t)rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t)rowB_mrg, MERGE_LOW (rowA[1], vzero)); + MMA (&acc4, (vec_t)rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); + MMA (&acc5, (vec_t)rowB_mrg, MERGE_LOW (rowA1[0], vzero)); + MMA (&acc6, (vec_t)rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); + MMA (&acc7, (vec_t)rowB_mrg, MERGE_LOW (rowA1[1], vzero)); } SAVE_ACC (&acc0, 0); @@ -436,12 +430,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[(l << 2)]); - vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); - MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); - MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t)rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t)rowB_mrg, MERGE_LOW (rowA[1], vzero)); } SAVE_ACC (&acc0, 0); @@ -475,9 +469,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[l << 1]); vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); } SAVE_ACC (&acc0, 0); SAVE_ACC (&acc1, 4); @@ -505,8 +500,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 3; vector short rowA = { AO[l], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; - vec_t *rowB = (vec_t *) & (BO[l]); - MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } SAVE_ACC (&acc0, 0); CO += 4; @@ -536,8 +532,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 2; vector short rowA = { AO[l], 0, AO[l + 1], 0, 0, 0, 0, 0 }; - vec_t *rowB = (vec_t *) & (BO[l << 1]); - MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + vector short rowB_mrg = + { BO[(l<<1)], 0, BO[(l<<1) + 1], 0, BO[(l<<1) + 2], 0, + BO[(l<<1) + 3], 0 + }; + MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } SAVE4x2_ACC (&acc0, 0); CO += 2; @@ -566,8 +565,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 1; 
vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 }; - vec_t *rowB = (vec_t *) & (BO[l << 2]); - MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + vector short rowB_mrg = + { BO[(l<<2) + 0], 0, BO[(l<<2) + 1], 0, BO[(l <<2) + 2], 0, + BO[(l<<2) + 3], 0 + }; + MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } SAVE4x2_ACC (&acc0, 0); AO += k; @@ -620,14 +622,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[l << 3]); vec_t *rowA1 = (vec_t *) & (A1[l << 3]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); - MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); - MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); - MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); - MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); + MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], vzero)); + MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], vzero)); + MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], vzero)); + MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], vzero)); } SAVE2x4_ACC (&acc0, 0); SAVE2x4_ACC (&acc1, 4); @@ -669,10 +671,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 2; vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[l << 3]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero )); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); } SAVE2x4_ACC (&acc0, 0); SAVE2x4_ACC (&acc1, 4); @@ -708,8 +710,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 2; vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[(l << 2)]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); } SAVE2x4_ACC (&acc0, 0); SAVE2x4_ACC (&acc1, 4); @@ -740,8 +742,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 2; vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; - vec_t *rowA = (vec_t *) & (AO[l << 1]); - MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + vector short rowA = + { AO[(l << 1)], 0, AO[(l << 1) + 1] , 0 , AO[(l<<1) + 2], + 0, AO[(l << 1) + 3], 0 }; + MMA (&acc0, (vec_t) rowB, (vec_t)(rowA)); } SAVE2x4_ACC (&acc0, 0); CO += 4; @@ -829,10 +833,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 1; vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[(l << 4)]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, 
MERGE_LOW (rowA[0], rowA[2]));
-	  MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3]));
-	  MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3]));
+	  MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero));
+	  MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero));
+	  MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero));
+	  MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero));
 	}
       rowC = (v4sf_t *) &CO[0];
       __builtin_mma_disassemble_acc ((void *)result, &acc0);
@@ -871,8 +875,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A,
 	  l = (k / 2) << 1;
 	  vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 };
 	  vec_t *rowA = (vec_t *) & (AO[(l << 3)]);
-	  MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1]));
-	  MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1]));
+	  MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero));
+	  MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero));
 	}
       rowC = (v4sf_t *) &CO[0];
       __builtin_mma_disassemble_acc ((void *)result, &acc0);
@@ -904,8 +908,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A,
 	  if (k > 1)
 	    l = (k / 2) << 1;
 	  vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 };
-	  vec_t *rowA = (vec_t *) & (AO[(l << 2)]);
-	  MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0]));
+	  vector short rowA =
+	    { AO[(l << 2)], 0, AO[(l << 2) + 1] , 0 ,
+	      AO[(l << 2) + 2], 0, AO[(l << 2) + 3], 0 };
+	  MMA (&acc0, (vec_t) rowB, (vec_t)(rowA));
 	}
       rowC = (v4sf_t *) &CO[0];
       __builtin_mma_disassemble_acc ((void *)result, &acc0);

From 7fb6e576c254864f1b8990655dcc28b524f23c2f Mon Sep 17 00:00:00 2001
From: Arthur Williams
Date: Wed, 9 Jun 2021 20:50:11 -0500
Subject: [PATCH 254/681] Removed use of non-portable '-p' arg to install

Not all versions of install support the '-p' flag, and it isn't worth
failing the build if the installed files' timestamps get updated.
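A configure-time probe would be an alternative to dropping the flag; the patch below takes the simpler route. Purely as a hedged illustration (the conftest file names and the INSTALL_DATA variable are hypothetical, not part of this patch or of the OpenBLAS build), such a probe could look like:

    #!/bin/sh
    # Hypothetical check: does this install(1) accept -p (preserve
    # timestamps)? Fall back to plain mode bits when it does not.
    touch conftest.src
    if install -p -m 644 conftest.src conftest.dst 2>/dev/null; then
        INSTALL_DATA="install -p -m 644"   # timestamps preserved
    else
        INSTALL_DATA="install -m 644"      # portable fallback, as used below
    fi
    rm -f conftest.src conftest.dst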
--- Makefile.install | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile.install b/Makefile.install index e8b64465f..28727de37 100644 --- a/Makefile.install +++ b/Makefile.install @@ -74,17 +74,17 @@ endif ifneq ($(OSNAME), AIX) ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" endif #for install static library ifneq ($(NO_STATIC),1) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif @@ -92,7 +92,7 @@ endif ifneq ($(NO_SHARED),1) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) - @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) From 7dfc45e840ba8c10d5564a700f54deed0303e3b1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jun 2021 11:09:50 +0200 Subject: [PATCH 255/681] Remove casts for PPC/POWER and complete parameters for POWER3/4 --- param.h | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/param.h b/param.h index a35ce69bd..ddad2fb36 100644 --- a/param.h +++ b/param.h @@ -72,13 +72,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef PARAM_H #define PARAM_H -#define LONGCAST (BLASLONG) -#if defined(__BYTE_ORDER__) -#if __GNUC__ < 9 -#undef LONGCAST -#define LONGCAST -#endif -#endif #define SBGEMM_DEFAULT_UNROLL_N 4 #define SBGEMM_DEFAULT_UNROLL_M 8 @@ -2096,7 +2089,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef PPCG4 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 1024 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2127,7 +2120,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 2688 #define GEMM_DEFAULT_OFFSET_B 3072 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL +#define GEMM_DEFAULT_ALIGN 0x03fffUL #if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) #define SGEMM_DEFAULT_UNROLL_M 4 @@ -2176,7 +2169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A (32 * 0) #define GEMM_DEFAULT_OFFSET_B (32 * 0) -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2212,7 +2205,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A (32 * 0) #define GEMM_DEFAULT_OFFSET_B (32 * 0) -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2247,7 +2240,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER3) || defined(POWER4) || defined(POWER5) #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2271,6 +2264,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_Q 216 #define DGEMM_DEFAULT_R 1012 +#define CGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_Q 104 +#define CGEMM_DEFAULT_R 1012 + #define ZGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_Q 104 #define ZGEMM_DEFAULT_R 1012 @@ -2288,6 +2285,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_P 144 #define ZGEMM_DEFAULT_P 144 #endif + +#define SGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 256 #endif #if defined(POWER5) @@ -2320,7 +2322,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 384 #define GEMM_DEFAULT_OFFSET_B 1024 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL +#define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2353,7 +2355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #if defined(__32BIT__) #warning using BINARY32==POWER6 #define SGEMM_DEFAULT_UNROLL_M 4 @@ -2406,7 +2408,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SWITCH_RATIO 16 #define GEMM_PREFERED_SIZE 16 @@ -2445,7 +2447,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 -#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SWITCH_RATIO 16 #define GEMM_PREFERED_SIZE 16 From 7a48247761be4caf9030bfc0d5863558a28787b4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jun 2021 11:11:56 +0200 Subject: [PATCH 256/681] fix c/zrot and sgemv for POWER5 --- kernel/power/KERNEL.POWER5 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/power/KERNEL.POWER5 b/kernel/power/KERNEL.POWER5 index fbef79e59..bea7b17c8 100644 --- a/kernel/power/KERNEL.POWER5 +++ b/kernel/power/KERNEL.POWER5 @@ -54,3 +54,8 @@ ZTRSMKERNEL_LN = ztrsm_kernel_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S + +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c +SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVTKERNEL = ../arm/gemv_t.c From dc4fcb48df01a21be9d96c70c8ff66258fefd728 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jun 2021 11:14:03 +0200 Subject: [PATCH 257/681] Fix inverted conditional for caxpy/zaxpy --- kernel/power/KERNEL.PPC440 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/power/KERNEL.PPC440 b/kernel/power/KERNEL.PPC440 index 677af5f21..fd9a8c780 100644 --- a/kernel/power/KERNEL.PPC440 +++ b/kernel/power/KERNEL.PPC440 @@ -16,11 +16,11 @@ ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) -CAXPYKERNEL = ../arm/zaxpy.c -ZAXPYKERNEL = ../arm/zaxpy.c -else CAXPYKERNEL = zaxpy_ppc440.S ZAXPYKERNEL = zaxpy_ppc440.S +else +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c endif SDOTKERNEL = dot_ppc440.S From fb9e678235a2e7ee7ce3a48263726d03b9827187 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jun 2021 11:15:48 +0200 Subject: [PATCH 258/681] Fix caxpy/zaxpy for big-endian --- kernel/power/KERNEL.PPCG4 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index 54660b54d..1bdd3119e 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -15,8 +15,13 @@ ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) CAXPYKERNEL = zaxpy_ppc440.S ZAXPYKERNEL = zaxpy_ppc440.S +else +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c +endif SDOTKERNEL = dot_ppc440.S DDOTKERNEL = dot_ppc440.S From 08e2e60762b2b594a81479b766276224c4ae6bed Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jun 2021 11:17:33 +0200 Subject: [PATCH 259/681] Add prefetch values for power3 --- kernel/power/gemv_n.S | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index abc61b62e..9c6f87639 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S @@ -159,6 +159,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define PREFETCHSIZE_A 16 #define PREFETCHSIZE_C 16 From 8adf0971d801a43d799a57b5721aedc7dec3e68d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Jun 2021 11:18:22 +0200 Subject: [PATCH 260/681] Add prefetch values for power3 --- kernel/power/gemv_t.S | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 25a4dd01b..accdad702 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -124,6 +124,11 @@ #define PREFETCHSIZE_C 16 
 #endif
 
+#ifdef POWER3
+#define PREFETCHSIZE_A 16
+#define PREFETCHSIZE_C 16
+#endif
+
 #ifdef POWER4
 #define PREFETCHSIZE_A 48
 #define PREFETCHSIZE_C 16

From 3906ef3b0fb19e7436f2b4cf6394b11f3466b1f3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 10 Jun 2021 11:19:40 +0200
Subject: [PATCH 261/681] Add prefetch values for power3

---
 kernel/power/zgemv_t.S | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S
index d82fab16a..314cf5e6e 100644
--- a/kernel/power/zgemv_t.S
+++ b/kernel/power/zgemv_t.S
@@ -129,6 +129,11 @@
 #define PREFETCHSIZE_C 16
 #endif
 
+#ifdef POWER3
+#define PREFETCHSIZE_A 34
+#define PREFETCHSIZE_C 16
+#endif
+
 #ifdef POWER4
 #define PREFETCHSIZE_A 34
 #define PREFETCHSIZE_C 16

From efdbdd8f8254988a851e7759277fb8d38d319c84 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 10 Jun 2021 11:20:29 +0200
Subject: [PATCH 262/681] Add prefetch values for power3

---
 kernel/power/zgemv_n.S | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S
index 708f1318d..48f49f97b 100644
--- a/kernel/power/zgemv_n.S
+++ b/kernel/power/zgemv_n.S
@@ -155,6 +155,11 @@
 #define PREFETCHSIZE_C 16
 #endif
 
+#ifdef POWER3
+#define PREFETCHSIZE_A 34
+#define PREFETCHSIZE_C 16
+#endif
+
 #ifdef POWER4
 #define PREFETCHSIZE_A 34
 #define PREFETCHSIZE_C 16

From 9d292d37b2cea829763f8f6bf8e5f4053bbf2a00 Mon Sep 17 00:00:00 2001
From: Gilles Gouaillardet
Date: Mon, 14 Jun 2021 17:01:28 +0900
Subject: [PATCH 263/681] arm64: add the missing d9 register to the clobber
 list

Refs. numpy/numpy#18422

Signed-off-by: Gilles Gouaillardet
---
 kernel/arm64/dznrm2_thunderx2t99.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c
index b021a2832..fba2fe8ce 100644
--- a/kernel/arm64/dznrm2_thunderx2t99.c
+++ b/kernel/arm64/dznrm2_thunderx2t99.c
@@ -321,7 +321,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
 	: "cc", "memory",
 	  "x0", "x1", "x2", "x3", "x4", "x5", "x6",
-	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8"
+	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9"
 	);
 }

From e6dd44d98976a34baa447886753dd6b7ec31c380 Mon Sep 17 00:00:00 2001
From: Gordon Fossum
Date: Tue, 15 Jun 2021 13:07:47 -0500
Subject: [PATCH 264/681] Power10: Fix for SBGEMM

While testing the bfloat16 sbgemm kernel, there were some failures for
odd-value inputs due to the result being updated for additional bytes.
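The shape of the fix is easiest to see in scalar form. The hedged C sketch below is illustrative only (save_last_column and its signature are hypothetical): when a single column of C remains, each accumulator entry is written back as one float, so a two-float vector store can never spill into the neighboring column.

    /* Hedged sketch of the scalar save path introduced below as
     * SAVE4x2_ACC_SCALAR: one scalar store per row of C, so an odd
     * trailing dimension never triggers an out-of-bounds update. */
    void save_last_column(float *C, long ldc, const float acc[4], float alpha)
    {
        for (int i = 0; i < 4; i++)
            C[(long)i * ldc] += acc[i] * alpha;  /* exactly one float per row */
    }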
--- kernel/power/sbgemm_kernel_power10.c | 34 ++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/kernel/power/sbgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c index 74f3eac4c..134929ec1 100644 --- a/kernel/power/sbgemm_kernel_power10.c +++ b/kernel/power/sbgemm_kernel_power10.c @@ -98,6 +98,30 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); rowC = (v2sf_t *) &CO[7* ldc+J]; \ rowC[0] += result[6] * alpha; + #define SAVE4x2_ACC_SCALAR(ACC) { \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + res[0] = result[0] * alpha; \ + res[1] = result[1] * alpha; \ + res[2] = result[2] * alpha; \ + res[3] = result[3] * alpha; \ + CO[0 * ldc] += res[0][0]; \ + CO[1 * ldc] += res[1][0]; \ + CO[2 * ldc] += res[2][0]; \ + CO[3 * ldc] += res[3][0]; \ + } + + #define SAVE4x2_ACC1_SCALAR(ACC) { \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + res[0] = result[0] * alpha; \ + res[1] = result[1] * alpha; \ + res[2] = result[2] * alpha; \ + res[3] = result[3] * alpha; \ + CO[4 * ldc] += res[0][0]; \ + CO[5 * ldc] += res[1][0]; \ + CO[6 * ldc] += res[2][0]; \ + CO[7 * ldc] += res[3][0]; \ +} + #define MMA __builtin_mma_xvbf16ger2pp #define SAVE2x4_ACC(ACC, J) \ @@ -313,7 +337,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { IFLOAT *BO = B; v2sf_t *rowC; - v2sf_t result[8]; + v4sf_t result[4], res[4]; __vector_quad acc0, acc1; __builtin_mma_xxsetaccz (&acc0); __builtin_mma_xxsetaccz (&acc1); @@ -335,8 +359,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } - SAVE4x2_ACC (&acc0, 0); - SAVE4x2_ACC1 (&acc1, 0); + SAVE4x2_ACC_SCALAR (&acc0); + SAVE4x2_ACC1_SCALAR (&acc1); CO += 1; AO += k; BO += (k << 3); @@ -547,7 +571,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { IFLOAT *BO = B; v2sf_t *rowC; - v2sf_t result[8]; + v4sf_t result[4], res[4]; __vector_quad acc0; BLASLONG l = 0; __builtin_mma_xxsetaccz (&acc0); @@ -571,7 +595,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, }; MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } - SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC_SCALAR (&acc0); AO += k; BO += (k << 2); CO += 1; From 92e024bbb30d4445ce48be982d2625cac3c1df49 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 09:33:23 +0200 Subject: [PATCH 265/681] Declare SCASUM as EXTERNAL --- lapack-netlib/TESTING/EIG/cbdt05.f | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cbdt05.f b/lapack-netlib/TESTING/EIG/cbdt05.f index 5a08ccce3..4ed157431 100644 --- a/lapack-netlib/TESTING/EIG/cbdt05.f +++ b/lapack-netlib/TESTING/EIG/cbdt05.f @@ -158,9 +158,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX - REAL SASUM, SLAMCH, CLANGE - EXTERNAL LSAME, ISAMAX, SASUM, SLAMCH, CLANGE - REAL SCASUM + REAL SASUM, SCASUM, SLAMCH, CLANGE + EXTERNAL LSAME, ISAMAX, SASUM, SCASUM, SLAMCH, CLANGE * .. * .. External Subroutines .. 
EXTERNAL CGEMM From 52693481784bbafba40edb2671540cccdb4c387e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 09:35:12 +0200 Subject: [PATCH 266/681] Declare CSROT as EXTERNAL --- lapack-netlib/TESTING/EIG/cckcsd.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/cckcsd.f b/lapack-netlib/TESTING/EIG/cckcsd.f index 9783f0361..9524cb30b 100644 --- a/lapack-netlib/TESTING/EIG/cckcsd.f +++ b/lapack-netlib/TESTING/EIG/cckcsd.f @@ -228,7 +228,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAHDG, ALAREQ, ALASUM, CCSDTS, CLACSG, CLAROR, - $ CLASET + $ CLASET, CSROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN From 9e1b43ea9b12fba1768d2b095149523704af76bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 09:39:28 +0200 Subject: [PATCH 267/681] Declare DROT as EXTERNAL --- lapack-netlib/TESTING/EIG/dckcsd.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/dckcsd.f b/lapack-netlib/TESTING/EIG/dckcsd.f index 50db6baa0..063a5ef5c 100644 --- a/lapack-netlib/TESTING/EIG/dckcsd.f +++ b/lapack-netlib/TESTING/EIG/dckcsd.f @@ -226,7 +226,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAHDG, ALAREQ, ALASUM, DCSDTS, DLACSG, DLAROR, - $ DLASET + $ DLASET, DROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN From e2621ef93ae32b0fef33437c91ed774aa469277a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 09:40:15 +0200 Subject: [PATCH 268/681] Declare SROT as EXTERNAL --- lapack-netlib/TESTING/EIG/sckcsd.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/sckcsd.f b/lapack-netlib/TESTING/EIG/sckcsd.f index 5a6e4a099..be91eed51 100644 --- a/lapack-netlib/TESTING/EIG/sckcsd.f +++ b/lapack-netlib/TESTING/EIG/sckcsd.f @@ -226,7 +226,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAHDG, ALAREQ, ALASUM, SCSDTS, SLACSG, SLAROR, - $ SLASET + $ SLASET, SROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN From cd0e4aadb1ee504371ba6fd516dcd5a3d9b65e95 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 09:41:18 +0200 Subject: [PATCH 269/681] Declare ZDROT as EXTERNAL --- lapack-netlib/TESTING/EIG/zckcsd.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/zckcsd.f b/lapack-netlib/TESTING/EIG/zckcsd.f index f77b111a4..92760337c 100644 --- a/lapack-netlib/TESTING/EIG/zckcsd.f +++ b/lapack-netlib/TESTING/EIG/zckcsd.f @@ -228,7 +228,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAHDG, ALAREQ, ALASUM, ZCSDTS, ZLACSG, ZLAROR, - $ ZLASET + $ ZLASET, ZDROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN From 5958ffc9b6b01046f160c6afa085444cb3b0204a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 09:43:39 +0200 Subject: [PATCH 270/681] Declare DZASUM as EXTERNAL --- lapack-netlib/TESTING/EIG/zbdt05.f | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/zbdt05.f b/lapack-netlib/TESTING/EIG/zbdt05.f index bbf0208b7..f262351e4 100644 --- a/lapack-netlib/TESTING/EIG/zbdt05.f +++ b/lapack-netlib/TESTING/EIG/zbdt05.f @@ -158,9 +158,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER IDAMAX - DOUBLE PRECISION DASUM, DLAMCH, ZLANGE - EXTERNAL LSAME, IDAMAX, DASUM, DLAMCH, ZLANGE - DOUBLE PRECISION DZASUM + DOUBLE PRECISION DASUM, DZASUM, DLAMCH, ZLANGE + EXTERNAL LSAME, IDAMAX, DASUM, DZASUM, DLAMCH, ZLANGE * .. * .. External Subroutines .. 
EXTERNAL ZGEMM From 13fa9f737d11b5d59d7b941dadd51d8f9be25c52 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 12:17:25 +0200 Subject: [PATCH 271/681] Modify defines for CR and RC to work around name collision on Windows --- cmake/utils.cmake | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 29b5a067b..794d73d06 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -254,6 +254,19 @@ function(GenerateNamedObjects sources_in) # now add the object and set the defines set(obj_defines ${defines_in}) + list(FIND obj_defines "RC" def_idx) + if (${def_idx} GREATER -1) + # list(REMOVE_AT ${obj_defines} ${def_idx}) + list (REMOVE_ITEM obj_defines "RC") + list(APPEND obj_defines "RC=RC") + endif () + list(FIND obj_defines "CR" def_idx) + if (${def_idx} GREATER -1) + # list(REMOVE_AT ${obj_defines} ${def_idx}) + list (REMOVE_ITEM obj_defines "CR") + list(APPEND obj_defines "CR=CR") + endif () + if (use_cblas) set(obj_name "cblas_${obj_name}") list(APPEND obj_defines "CBLAS") From e83df9397581dc5413bcf36e9e29d5fdb3f68f70 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 12:32:34 +0200 Subject: [PATCH 272/681] Work around another recent macro name collision with winnt.h --- driver/level3/Makefile | 48 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/driver/level3/Makefile b/driver/level3/Makefile index 78f32b961..e893d915e 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -425,7 +425,7 @@ cgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) cgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -473,7 +473,7 @@ zgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) zgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -521,7 +521,7 @@ xgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) xgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -632,7 +632,7 @@ cgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) cgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -680,7 +680,7 @@ zgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c 
-DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) zgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -728,7 +728,7 @@ xgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) xgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -1895,7 +1895,7 @@ cgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) cgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -1943,7 +1943,7 @@ zgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) zgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -1991,7 +1991,7 @@ xgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC)RC $< -o $(@F) xgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -2048,7 +2048,7 @@ cgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) cgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -2096,7 +2096,7 @@ zgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) zgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -2144,7 +2144,7 @@ 
xgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) xgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -2817,7 +2817,7 @@ cgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) cgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -2865,7 +2865,7 @@ zgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) zgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -2913,7 +2913,7 @@ xgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) xgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -3025,7 +3025,7 @@ cgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) cgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -3073,7 +3073,7 @@ zgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) zgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -3121,7 +3121,7 @@ xgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) xgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 
-DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -4288,7 +4288,7 @@ cgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) cgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -4336,7 +4336,7 @@ zgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c - $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) zgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -4384,7 +4384,7 @@ xgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) xgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -4441,7 +4441,7 @@ cgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) cgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) cgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -4489,7 +4489,7 @@ zgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) zgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) zgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) @@ -4537,7 +4537,7 @@ xgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) xgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) From 307c4c078692f79ac4e064668aacfadc31496b41 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Jun 2021 13:41:16 +0200 Subject: [PATCH 273/681] Fix typo --- driver/level3/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/level3/Makefile b/driver/level3/Makefile index e893d915e..b8465d4ed 100644 --- a/driver/level3/Makefile +++ 
b/driver/level3/Makefile @@ -1991,7 +1991,7 @@ xgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) xgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC)RC $< -o $(@F) + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) xgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) From b7da75e4fdc05976600949d588912cc9a6b9d22e Mon Sep 17 00:00:00 2001 From: User User-User Date: Sat, 19 Jun 2021 21:37:51 +0200 Subject: [PATCH 274/681] WiP CORTEX A55 support --- KERNEL.CORTEXA55 | 196 +++++++++++++++++++++++++++++++++++++++++++++++ Makefile.arm64 | 22 ++++++ Makefile.system | 1 + TargetList.txt | 1 + cpuid_arm64.c | 11 ++- getarch.c | 15 ++++ param.h | 2 +- 7 files changed, 245 insertions(+), 3 deletions(-) create mode 100644 KERNEL.CORTEXA55 diff --git a/KERNEL.CORTEXA55 b/KERNEL.CORTEXA55 new file mode 100644 index 000000000..db322dd0d --- /dev/null +++ b/KERNEL.CORTEXA55 @@ -0,0 +1,196 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S 
+ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +else +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +endif +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/Makefile.arm64 b/Makefile.arm64 index 3858d7e3f..c23a0876e 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -57,6 +57,28 @@ endif endif endif +# Use a53 tunings because a55 is only available in GCC>=8.1 +ifeq ($(CORE), CORTEXA55) 
+ifeq ($(GCCVERSIONGTEQ7), 1) +ifeq ($(GCCVERSIONGTEQ8), 1) +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 +endif +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53 +endif +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 +endif +endif +endif + ifeq ($(CORE), THUNDERX) CCOMMON_OPT += -march=armv8-a -mtune=thunderx ifneq ($(F_COMPILER), NAG) diff --git a/Makefile.system b/Makefile.system index 2264b143b..0cd3e3a7c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -625,6 +625,7 @@ DYNAMIC_CORE += CORTEXA57 DYNAMIC_CORE += CORTEXA72 DYNAMIC_CORE += CORTEXA73 DYNAMIC_CORE += NEOVERSEN1 +DYNAMIC_CORE += CORTEXA55 DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 diff --git a/TargetList.txt b/TargetList.txt index d19964916..f93a629d8 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -92,6 +92,7 @@ CORTEXA57 CORTEXA72 CORTEXA73 NEOVERSEN1 +CORTEXA55 EMAG8180 FALKOR THUNDERX diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 5f5d7771b..a150301d1 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -36,6 +36,7 @@ size_t length=sizeof(value); #define CPU_ARMV8 1 // Arm #define CPU_CORTEXA53 2 +#define CPU_CORTEXA55 14 #define CPU_CORTEXA57 3 #define CPU_CORTEXA72 4 #define CPU_CORTEXA73 5 @@ -67,7 +68,8 @@ static char *cpuname[] = { "EMAG8180", "NEOVERSEN1", "THUNDERX3T110", - "VORTEX" + "VORTEX", + "CORTEXA55" }; static char *cpuname_lower[] = { @@ -84,7 +86,8 @@ static char *cpuname_lower[] = { "emag8180", "neoversen1", "thunderx3t110", - "vortex" + "vortex", + "cortexa55" }; int get_feature(char *search) @@ -161,6 +164,8 @@ int detect(void) return CPU_CORTEXA73; else if (strstr(cpu_part, "0xd0c")) return CPU_NEOVERSEN1; + else if (strstr(cpu_part, "0xd05")) + return CPU_CORTEXA55; } // Qualcomm else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) @@ -284,6 +289,8 @@ void get_cpuconfig(void) printf("#define %s\n", cpuname[d]); // Fall-through case CPU_ARMV8: + // case CPU_CORTEXA53; + // case CPU_CORTEXA55; // Minimum parameters for ARMv8 (based on A53) printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 64\n"); diff --git a/getarch.c b/getarch.c index f48944f36..3bc8a0c3d 100644 --- a/getarch.c +++ b/getarch.c @@ -1159,6 +1159,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #endif +#ifdef FORCE_CORTEXA55 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXA55" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA55 " \ + "-DL1_CODE_SIZE=16384 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=65536 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "cortexa55" +#define CORENAME "CORTEXA55" +#else +#endif #ifdef FORCE_FALKOR #define FORCE diff --git a/param.h b/param.h index ddad2fb36..01048023f 100644 --- a/param.h +++ b/param.h @@ -2959,7 +2959,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 2048 -#elif defined(CORTEXA53) +#elif defined(CORTEXA53) || defined(CORTEXA55) #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 From 39ef0880aea439d199c99031b59dd9bd9225b69d Mon Sep 17 00:00:00 2001 From: User User-User Date: Sat, 19 Jun 2021 21:49:58 +0200 Subject: [PATCH 275/681] copy conf --- kernel/arm64/KERNEL.CORTEXA55 | 196 ++++++++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100644 kernel/arm64/KERNEL.CORTEXA55 diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55 new file mode 100644 index 000000000..db322dd0d --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA55 @@ -0,0 +1,196 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = 
swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +else +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +endif +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = 
../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) From 9335d427406b5f14f545a17408d1e1ae0d666790 Mon Sep 17 00:00:00 2001 From: User User-User Date: Sat, 19 Jun 2021 22:21:39 +0200 Subject: [PATCH 276/681] add gcc8 version matching --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index 0cd3e3a7c..bb8c60e91 100644 --- a/Makefile.system +++ b/Makefile.system @@ -333,6 +333,7 @@ GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) +GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) From 6423b282a1f95bdaa69d87b4a6302532a0ef1f83 Mon Sep 17 00:00:00 2001 From: User User-User Date: Sun, 20 Jun 2021 14:19:41 +0200 Subject: [PATCH 277/681] dynamic_arch --- driver/others/dynamic_arm64.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 0b623c3ac..1bec91462 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -99,6 +99,11 @@ extern gotoblas_t gotoblas_NEOVERSEN1; #else #define gotoblas_NEOVERSEN1 gotoblas_ARMV8 #endif +#ifdef DYN_CORTEX_A55 +extern gotoblas_t gotoblas_CORTEXA55; +#else +#define gotoblas_NEOVERSEN1 gotoblas_ARMV8 +#endif #else extern gotoblas_t gotoblas_CORTEXA53; extern gotoblas_t gotoblas_CORTEXA57; @@ -111,11 +116,12 @@ extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_EMAG8180; extern gotoblas_t gotoblas_NEOVERSEN1; extern gotoblas_t gotoblas_THUNDERX3T110; +extern gotoblas_t gotoblas_CORTEXA55; #endif extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 12 +#define NUM_CORETYPES 13 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -142,6 +148,7 @@ static char *corename[] = { "emag8180", "neoversen1", "thunderx3t110", + "cortexa55", "unknown" }; @@ -158,6 +165,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; + if (gotoblas == &gotoblas_CORTEXA55) return corename[12]; return corename[NUM_CORETYPES]; } @@ -189,6 +197,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 9: return (&gotoblas_EMAG8180); case 10: return (&gotoblas_NEOVERSEN1); case 11: return (&gotoblas_THUNDERX3T110); + case 12: return (&gotoblas_CORTEXA55); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -247,6 +256,8 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_CORTEXA73; case 0xd0c: // Neoverse N1 return &gotoblas_NEOVERSEN1; + case 0xd05: // Cortex A55 + return &gotoblas_CORTEXA55; } break; case 0x42: // Broadcom From 548aa522e5be12f21eabeb66154f315047c92dc2 Mon Sep 17 00:00:00 2001 From: User User-User Date: Sun, 20 Jun 2021 15:29:25 +0200 Subject: [PATCH 278/681] remove misplaced file --- KERNEL.CORTEXA55 | 196 ----------------------------------------------- 1 file 
changed, 196 deletions(-) delete mode 100644 KERNEL.CORTEXA55 diff --git a/KERNEL.CORTEXA55 b/KERNEL.CORTEXA55 deleted file mode 100644 index db322dd0d..000000000 --- a/KERNEL.CORTEXA55 +++ /dev/null @@ -1,196 +0,0 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif -DSDOTKERNEL = dot.S - -DGEMM_BETA = dgemm_beta.S -SGEMM_BETA = sgemm_beta.S - -ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S -else -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -endif -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -ifeq ($(SGEMM_UNROLL_M), 16) -SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S -else -SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -endif -ifeq ($(SGEMM_UNROLL_M), 4) -SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S -else -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c -endif -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) 
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S -SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) From 91e2b11d3cf423fd16e8081d34ea12e5fb032bdb Mon Sep 17 00:00:00 2001 From: User User-User Date: Sun, 20 Jun 2021 15:32:42 +0200 Subject: [PATCH 279/681] add to cmake listings too --- cmake/arch.cmake | 2 +- cmake/prebuild.cmake | 2 +- cmake/system.cmake | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 4451f9eaa..154e59db6 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,7 +44,7 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) endif () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index da7686c33..d86e10035 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -177,7 +177,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) 
- elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53") + elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53" OR "${TCORE}" STREQUAL "CORTEXA55") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" diff --git a/cmake/system.cmake b/cmake/system.cmake index d6c71b774..34874827c 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -39,7 +39,7 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") set(TARGET "BARCELONA") endif () - if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53") + if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55") set(TARGET "ARMV7") endif () endif () From 750719528a624295e708fdd8ca31c42e5186120c Mon Sep 17 00:00:00 2001 From: User User-User Date: Sun, 20 Jun 2021 16:40:43 +0200 Subject: [PATCH 280/681] bugz --- driver/others/dynamic_arm64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 1bec91462..04ceaaf6d 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -102,7 +102,7 @@ extern gotoblas_t gotoblas_NEOVERSEN1; #ifdef DYN_CORTEX_A55 extern gotoblas_t gotoblas_CORTEXA55; #else -#define gotoblas_NEOVERSEN1 gotoblas_ARMV8 +#define gotoblas_CORTEXA55 gotoblas_ARMV8 #endif #else extern gotoblas_t gotoblas_CORTEXA53; From 130327e9af42ee405afe69cd63eef7707bc454a8 Mon Sep 17 00:00:00 2001 From: User User-User Date: Tue, 22 Jun 2021 23:58:59 +0200 Subject: [PATCH 281/681] OK --- cpuid_arm64.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index a150301d1..041b04311 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -289,8 +289,6 @@ void get_cpuconfig(void) printf("#define %s\n", cpuname[d]); // Fall-through case CPU_ARMV8: - // case CPU_CORTEXA53; - // case CPU_CORTEXA55; // Minimum parameters for ARMv8 (based on A53) printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_LINESIZE 64\n"); From f0b822a7094e62fa187426029305acfc30772d8e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Jun 2021 10:11:01 +0200 Subject: [PATCH 282/681] Update cpuid_arm64.c --- cpuid_arm64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 041b04311..2a9399f7d 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -286,6 +286,7 @@ void get_cpuconfig(void) { case CPU_CORTEXA53: + case CPU_CORTEXA55: printf("#define %s\n", cpuname[d]); // Fall-through case CPU_ARMV8: From 3be660c0000606743ec0e747228f73435d190e8b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Jun 2021 23:44:56 +0200 Subject: [PATCH 283/681] Add interface declarations for ?potri --- common_interface.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common_interface.h b/common_interface.h index b9ebb2772..318827920 100644 --- a/common_interface.h +++ b/common_interface.h @@ -709,6 +709,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(spotri)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qpotri)(char *, blasint *, 
xdouble *, blasint *, blasint *); +int BLASFUNC(cpotri)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *); + int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *); int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); From 1f8bda71b9b07afccb5ab255d4a6156da60420fc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Jun 2021 23:46:00 +0200 Subject: [PATCH 284/681] Add OPENBLAS_LOOPS support to potrf/potrs/potri benchmark --- benchmark/potrf.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/benchmark/potrf.c b/benchmark/potrf.c index 116d0cca5..8808203a5 100644 --- a/benchmark/potrf.c +++ b/benchmark/potrf.c @@ -99,14 +99,15 @@ int main(int argc, char *argv[]){ char *p; char btest = 'F'; - blasint m, i, j, info, uplos=0; - double flops; + blasint m, i, j, l, info, uplos=0; + double flops = 0.; int from = 1; int to = 200; int step = 1; + int loops = 1; - double time1; + double time1, timeg; argc--;argv++; @@ -119,6 +120,8 @@ int main(int argc, char *argv[]){ if ((p = getenv("OPENBLAS_TEST"))) btest=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ @@ -129,19 +132,21 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - for(m = from; m <= to; m += step){ + for(m = from; m <= to; m += step){ + timeg=0.; + for (l = 0; l < loops; l++) { #ifndef COMPLEX if (uplos & 1) { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.; - a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; + a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; } } else { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; - a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; + a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = 0.; } } @@ -192,8 +197,8 @@ int main(int argc, char *argv[]){ exit(1); } - time1 = getsec(); - flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; + if ( btest == 'F') + timeg += getsec(); if ( btest == 'S' ) { @@ -214,9 +219,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, "Potrs info = %d\n", info); exit(1); } - time1 = getsec(); - flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; - + timeg += getsec(); } if ( btest == 'I' ) @@ -232,11 +235,17 @@ int main(int argc, char *argv[]){ fprintf(stderr, "Potri info = %d\n", info); exit(1); } - - time1 = getsec(); - flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; + timeg += getsec(); } - + } // loops + + time1 = timeg/(double)loops; + if ( btest == 'F') + 
flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; + if ( btest == 'S') + flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; + if ( btest == 'I') + flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; fprintf(stderr, "%8d : %10.2f MFlops : %10.3f Sec : Test=%c\n",m,flops ,time1,btest); From 1b5620b66e3a834932fb527cdaef6ce22ce07ed0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Jun 2021 23:47:41 +0200 Subject: [PATCH 285/681] Add lower threshold for multithreading in ?potrf and ?potri --- interface/lapack/potrf.c | 7 +++++++ interface/lapack/potri.c | 3 +++ interface/lapack/zpotrf.c | 7 +++++++ interface/lapack/zpotri.c | 9 +++++++++ 4 files changed, 26 insertions(+) diff --git a/interface/lapack/potrf.c b/interface/lapack/potrf.c index dbd55f62f..3abc80133 100644 --- a/interface/lapack/potrf.c +++ b/interface/lapack/potrf.c @@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP args.common = NULL; +#ifndef DOUBLE + if (args.n <128) +#else + if (args.n <64) +#endif + args.nthreads = 1; + else args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { diff --git a/interface/lapack/potri.c b/interface/lapack/potri.c index 2c0c64b6f..eb0fcbe70 100644 --- a/interface/lapack/potri.c +++ b/interface/lapack/potri.c @@ -121,6 +121,9 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP args.common = NULL; + if (args.n < 180) + args.nthreads = 1; + else args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { diff --git a/interface/lapack/zpotrf.c b/interface/lapack/zpotrf.c index c4cd99bf6..298efbbc1 100644 --- a/interface/lapack/zpotrf.c +++ b/interface/lapack/zpotrf.c @@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP args.common = NULL; +#ifndef DOUBLE + if (args.n < 64) +#else + if (args.n < 64) +#endif + args.nthreads = 1; + else args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { diff --git a/interface/lapack/zpotri.c b/interface/lapack/zpotri.c index 8da211683..8748c6352 100644 --- a/interface/lapack/zpotri.c +++ b/interface/lapack/zpotri.c @@ -121,6 +121,15 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP args.nthreads = num_cpu_avail(4); +#ifndef DOUBLE + if (args.n < 200) +#else + if (args.n < 150) +#endif + args.nthreads=1; + else +#endif + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif From 6ebcce229fdbc960795fd77488dcd84baffcc205 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 29 Jun 2021 17:17:34 +0200 Subject: [PATCH 286/681] Work around current conda/tqdm auto-update problem --- appveyor.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/appveyor.yml b/appveyor.yml index c9b2fa3a1..d575c5b7f 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -47,6 +47,7 @@ environment: install: - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force + - if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1 - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 - if 
[%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%" From 06e3b07ecb8d06b1a30b650b00891d58294bb865 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Jul 2021 17:38:45 +0200 Subject: [PATCH 287/681] Handle OPENBLAS_LOOPS and OPENBLAS_TEST options --- benchmark/getri.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/benchmark/getri.c b/benchmark/getri.c index 98a860906..4c8891226 100644 --- a/benchmark/getri.c +++ b/benchmark/getri.c @@ -72,13 +72,17 @@ int main(int argc, char *argv[]){ FLOAT *a,*work; FLOAT wkopt[4]; blasint *ipiv; - blasint m, i, j, info,lwork; + blasint m, i, j, l, info,lwork; int from = 1; int to = 200; int step = 1; + int loops = 1; - double time1; + double time1,timeg; + + char *p; + char btest = 'I'; argc--;argv++; @@ -86,6 +90,9 @@ int main(int argc, char *argv[]){ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} + if ((p = getenv("OPENBLAS_TEST"))) btest=*p; + + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); @@ -124,32 +131,41 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE FLops Time Lwork\n"); for(m = from; m <= to; m += step){ - + timeg = 0.; fprintf(stderr, " %6d : ", (int)m); - GETRF (&m, &m, a, &m, ipiv, &info); + for (l = 0; l < loops; l++) { + if (btest == 'F') begin(); + GETRF (&m, &m, a, &m, ipiv, &info); + if (btest == 'F') { + end(); + timeg += getsec(); + } if (info) { fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } - begin(); + if (btest == 'I') begin(); lwork = -1; GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info); lwork = (blasint)wkopt[0]; GETRI(&m, a, &m, ipiv, work, &lwork, &info); - end(); + if (btest == 'I') end(); if (info) { fprintf(stderr, "failed compute inverse matrix .. 
%d\n", info); exit(1); } - time1 = getsec(); - + if (btest == 'I') + timeg += getsec(); + + } // loops + time1 = timeg/(double)loops; fprintf(stderr, " %10.2f MFlops : %10.2f Sec : %d\n", COMPSIZE * COMPSIZE * (4.0/3.0 * (double)m * (double)m *(double)m - (double)m *(double)m + 5.0/3.0* (double)m) / time1 * 1.e-6,time1,lwork); From dcfc5cf714923f6d9981c9fc2cdb5ce5b846c0ee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Jul 2021 17:39:37 +0200 Subject: [PATCH 288/681] Handle OPENBLAS_LOOPS for more stable results --- benchmark/linpack.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/benchmark/linpack.c b/benchmark/linpack.c index 202035245..32ccb0386 100644 --- a/benchmark/linpack.c +++ b/benchmark/linpack.c @@ -72,17 +72,21 @@ int main(int argc, char *argv[]){ FLOAT *a, *b; blasint *ipiv; - blasint m, i, j, info; + blasint m, i, j, l, info; blasint unit = 1; int from = 1; int to = 200; int step = 1; + int loops = 1; FLOAT maxerr; - double time1, time2; + double time1, time2, timeg1,timeg2; + char *p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; + argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} @@ -110,9 +114,9 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Residual Decompose Solve Total\n"); for(m = from; m <= to; m += step){ - + timeg1 = timeg2 = 0.; fprintf(stderr, " %6d : ", (int)m); - + for (l = 0; l < loops; l++) { for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; @@ -138,7 +142,7 @@ int main(int argc, char *argv[]){ exit(1); } - time1 = getsec(); + timeg1 += getsec(); begin(); @@ -151,8 +155,10 @@ int main(int argc, char *argv[]){ exit(1); } - time2 = getsec(); - + timeg2 += getsec(); + } //loops + time1=timeg1/(double)loops; + time2=timeg2/(double)loops; maxerr = 0.; for(i = 0; i < m; i++){ From 726c44242b6d565577e00a3c6591ffee5db005ee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Jul 2021 17:41:05 +0200 Subject: [PATCH 289/681] Add lower threshold for multithreading --- interface/lapack/getrf.c | 9 ++++++++- interface/lapack/zgetrf.c | 5 ++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/interface/lapack/getrf.c b/interface/lapack/getrf.c index 02bb124b3..323370ebc 100644 --- a/interface/lapack/getrf.c +++ b/interface/lapack/getrf.c @@ -95,7 +95,14 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint #ifdef SMP args.common = NULL; - args.nthreads = num_cpu_avail(4); +#ifndef DOUBLE + if (args.m*args.n < 40000) +#else + if (args.m*args.n < 10000) +#endif + args.nthreads=1; + else + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif diff --git a/interface/lapack/zgetrf.c b/interface/lapack/zgetrf.c index 7f8db94f6..d03541fad 100644 --- a/interface/lapack/zgetrf.c +++ b/interface/lapack/zgetrf.c @@ -95,7 +95,10 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint #ifdef SMP args.common = NULL; - args.nthreads = num_cpu_avail(4); + if (args.m*args.n <10000) + args.nthreads = 1; + else + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif From 4620f988126d2e98b82fb28511fda29d27ef8bc4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Jul 2021 19:24:35 +0200 Subject: [PATCH 290/681] Mention availability of the Windows binaries in the Releases section --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 174f951f4..d7e0d60a7 
100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ We provide official binary packages for the following platform:

 * Windows x86/x86_64

-You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
+You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/xianyi/OpenBLAS/releases](https://github.com/xianyi/OpenBLAS/releases).

 ## Installation from Source

From a4543e4918f9c732d4701315d5b22de31a79f737 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 4 Jul 2021 16:59:43 +0200
Subject: [PATCH 291/681] Handle OPENBLAS_LOOPS

---
 benchmark/syrk.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/benchmark/syrk.c b/benchmark/syrk.c
index 82606a21a..fa0f24666 100644
--- a/benchmark/syrk.c
+++ b/benchmark/syrk.c
@@ -56,17 +56,20 @@ int main(int argc, char *argv[]){

   char uplo='U';
   char trans='N';
-
+
   if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
   if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;

-  blasint m, i, j;
+  blasint m, i, j, l;

   int from =   1;
   int to   = 200;
   int step =   1;
+  int loops =  1;
+
+  if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;

-  double time1;
+  double time1,timeg;

   argc--;argv++;
@@ -95,9 +98,12 @@

   for(m = from; m <= to; m += step)
   {
+    timeg = 0.;
     fprintf(stderr, " %6d : ", (int)m);

+    for(l = 0; l < loops; l++) {
+
     for(j = 0; j < m; j++){
       for(i = 0; i < m * COMPSIZE; i++){
	a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
       }
     }
@@ -111,8 +117,10 @@ int main(int argc, char *argv[]){

     end();

-    time1 = getsec();
-
+    timeg += getsec();
+
+    } //loops
+    time1 = timeg / (double)loops;
     fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 1. 
* (double)m * (double)m * (double)m / time1 * 1.e-6); From 8186963d8c454ba65325053eebe0a4328421755f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Jul 2021 17:00:26 +0200 Subject: [PATCH 292/681] Add lower limit for multithreading --- interface/syrk.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/interface/syrk.c b/interface/syrk.c index 7699db683..edb113d6c 100644 --- a/interface/syrk.c +++ b/interface/syrk.c @@ -354,6 +354,17 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr #endif args.common = NULL; +#ifndef COMPLEX +#ifdef DOUBLE + if (args.n < 100) +#else + if (args.n < 200) +#endif +#else + if (args.n < 65) +#endif + args.nthreads = 1; + else args.nthreads = num_cpu_avail(3); if (args.nthreads == 1) { From 3cfdb1770c0a405e3d976184a46dc4a394dc9030 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 6 Jul 2021 20:21:07 +0200 Subject: [PATCH 293/681] Remove code that disabled EXTRALIB on RISCV C910V --- test/Makefile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/Makefile b/test/Makefile index 54fa60533..6c5f041c2 100644 --- a/test/Makefile +++ b/test/Makefile @@ -259,10 +259,6 @@ endif FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) -ifeq ($(CORE), C910V) -EXTRALIB = -CEXTRALIB = -endif ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) From 0d8d261dd4936da6a11673ecaae54acb4e16ecad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Jul 2021 12:20:19 +0200 Subject: [PATCH 294/681] Recognize newer Zhaoxin/Centaur cpus as Nehalem --- cpuid_x86.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 18ff122e5..4553b89f1 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1631,7 +1631,9 @@ int get_cpuname(void){ case 0x6: return CPUTYPE_NANO; break; - + case 0x7: + return CPUTYPE_NEHALEM; + break; } return CPUTYPE_VIAC3; } @@ -2285,6 +2287,9 @@ int get_coretype(void){ case 0x6: return CORE_NANO; break; + case 0x7: + return CORE_NEHALEM; + break; } return CORE_VIAC3; } From eb2fdd3af0241759576988a4672dc76ab298538f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Jul 2021 12:23:15 +0200 Subject: [PATCH 295/681] Recognize newer Zhaoxin/Centaur processors as Nehalem --- driver/others/dynamic.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 46ad06a7c..4212e868c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -824,6 +824,9 @@ static gotoblas_t *get_coretype(void){ switch (family) { case 0x6: return &gotoblas_NANO; + break; + case 0x7: + return &gotoblas_NEHALEM; } } From da623ae838ef8277a230004d88270b1fdb37235a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Jul 2021 18:26:23 +0200 Subject: [PATCH 296/681] Add vendor string Shanghai as the successor to Centaur --- cpuid_x86.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 4553b89f1..4737b1851 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -283,6 +283,7 @@ int get_vendor(void){ if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX; if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN; if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR; + if (!strcmp(vendor, " Shanghai ")) return VENDOR_CENTAUR; if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE; if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; From 8f22ac552befbc414ce56db5c5142a7f0a5038ab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 
Jul 2021 18:28:49 +0200
Subject: [PATCH 297/681] Add vendor string Shanghai as successor to Centaur

---
 driver/others/dynamic.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 4212e868c..1a33870db 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -404,6 +404,7 @@ static int get_vendor(void){
   if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
   if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
   if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
+  if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_CENTAUR;
   if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;

   if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;

From 2f6326a630f074489e8dcc0a53afce88fc800151 Mon Sep 17 00:00:00 2001
From: River Dillon
Date: Sat, 10 Jul 2021 00:36:07 -0700
Subject: [PATCH 298/681] Remove

---
 driver/others/memory.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index 63fa6a566..6e654ccf2 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -1702,7 +1702,6 @@ inline int atoi(const char *str) { return 0; }
 #include
 #include
 #include
-#include
 #include
 #include
 #include

From 220f6a1c556a5cc94eb7bc230e64074bdc0a6d04 Mon Sep 17 00:00:00 2001
From: River Dillon
Date: Sat, 10 Jul 2021 00:38:02 -0700
Subject: [PATCH 299/681] Add feature test macro for proper inclusion of <sched.h>

---
 openblas_config_template.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/openblas_config_template.h b/openblas_config_template.h
index 858b8c5cb..1e17c9a16 100644
--- a/openblas_config_template.h
+++ b/openblas_config_template.h
@@ -99,5 +99,6 @@ typedef int blasint;
 /* Inclusion of Linux-specific header is needed for definition of cpu_set_t. */
 #ifdef OPENBLAS_OS_LINUX
+#define _GNU_SOURCE
 #include <sched.h>
 #endif

From cecc2c65aad40f8f4a261ae503b92936c0b147f4 Mon Sep 17 00:00:00 2001
From: River Dillon
Date: Sat, 10 Jul 2021 00:39:52 -0700
Subject: [PATCH 300/681] Add test of installed

---
 Makefile                          | 10 ++++++++--
 test_install/Makefile             | 15 +++++++++++++++
 test_install/test_sched_include.c |  5 +++++
 3 files changed, 28 insertions(+), 2 deletions(-)
 create mode 100644 test_install/Makefile
 create mode 100644 test_install/test_sched_include.c

diff --git a/Makefile b/Makefile
index 555d1c467..d31cc9c83 100644
--- a/Makefile
+++ b/Makefile
@@ -34,9 +34,9 @@ endif

 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))

-SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
+SUBDIRS_ALL = $(SUBDIRS) test ctest utest test_install exports benchmark ../laswp ../bench cpp_thread_test

-.PHONY : all libs netlib $(RELA) test ctest shared install
+.PHONY : all libs netlib $(RELA) test ctest test_install shared install
 .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test

 all :: libs netlib $(RELA) tests shared
@@ -155,6 +155,11 @@ endif
 endif
 endif

+test_install :
+	mkdir -p install
+	PREFIX=install $(MAKE) install
+	$(MAKE) -C test_install all
+
 libs :
 ifeq ($(CORE), UNKNOWN)
 	$(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
@@ -399,4 +404,5 @@ endif
 	@$(MAKE) -C relapack clean
 	@rm -f *.grd Makefile.conf_last config_last.h
 	@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)
+	@rm -rf install
 	@echo Done.
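For context on the two musl-related patches above (PATCH 298 and PATCH 299): cpu_set_t and the CPU_* affinity macros are GNU extensions, so sched.h only declares them when _GNU_SOURCE is defined before the first include. glibc-based builds often picked the macro up indirectly (g++ on Linux predefines it, for example), but a plain C compile against musl does not, which is what broke the header for musl users. A minimal standalone sketch of the pattern the template change guarantees, as a hypothetical test program that is not part of the patch series:

/* Hypothetical illustration, not from the OpenBLAS tree: _GNU_SOURCE must
   precede the include for sched.h to declare cpu_set_t on glibc and musl. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void) {
    cpu_set_t set;                /* declared only because of _GNU_SOURCE */
    CPU_ZERO(&set);
    CPU_SET(0, &set);             /* request execution on CPU 0 */
    if (sched_setaffinity(0, sizeof(set), &set) != 0)
        perror("sched_setaffinity");
    return 0;
}

The test_sched_include.c file added below performs the compile-time half of this check against the installed openblas_config.h, which is expected to pull in sched.h on Linux.
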
diff --git a/test_install/Makefile b/test_install/Makefile new file mode 100644 index 000000000..29cd81322 --- /dev/null +++ b/test_install/Makefile @@ -0,0 +1,15 @@ +# +# tests of installed headers and libs +# + +INSTALLDIR = ../install + +.PHONY: all +all: test_sched_include + +test_sched_include: test_sched_include.c + $(CC) -c -I$(INSTALLDIR)/include $< + +.PHONY: clean +clean: + rm -f *.o diff --git a/test_install/test_sched_include.c b/test_install/test_sched_include.c new file mode 100644 index 000000000..aea35680d --- /dev/null +++ b/test_install/test_sched_include.c @@ -0,0 +1,5 @@ +// tests that inclusion of openblas_config.h works with musl + +#include + +cpu_set_t* cpu_set = NULL; From ddb6cee0d542464cef38c4b6532b4928df8807cc Mon Sep 17 00:00:00 2001 From: River Dillon Date: Sat, 10 Jul 2021 01:34:47 -0700 Subject: [PATCH 301/681] Contribution note --- CONTRIBUTORS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index be9a32a7c..6be41960c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -194,3 +194,6 @@ In chronological order: * PingTouGe Semiconductor Co., Ltd. * [2020-10] Add RISC-V Vector (0.7.1) support. Optimize BLAS kernels for Xuantie C910 + +* River Dillon + * [2021-07-10] fix compilation with musl libc From 4f4e286bf67aeb92132f06dd1637e437d3ec759d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 18:20:40 +0200 Subject: [PATCH 302/681] Fix copy-paste error in LIBCORE assignment for Tiger Lake --- cpuid_x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 4737b1851..00fc8baa0 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -2164,13 +2164,13 @@ int get_coretype(void){ case 8: if (model == 12) { // Tiger Lake if(support_avx512()) - return CPUTYPE_SKYLAKEX; + return CORE_SKYLAKEX; if(support_avx2()) - return CPUTYPE_HASWELL; + return CORE_HASWELL; if(support_avx()) - return CPUTYPE_SANDYBRIDGE; + return CORE_SANDYBRIDGE; else - return CPUTYPE_NEHALEM; + return CORE_NEHALEM; } if (model == 14) { // Kaby Lake if(support_avx()) From d5110630986c89ee88560b2204b7c157533a979e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 18:52:44 +0200 Subject: [PATCH 303/681] Move Alpine Linux build job from Travis to Azure --- azure-pipelines.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4b6b2b0e6..8bc27eb08 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -147,3 +147,15 @@ jobs: export ANDROID_NDK_HOME=/usr/local/share/android-ndk make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 +- job: ALPINE_MUSL + pool: + vmImage: 'ubuntu-16.04' + steps: + - script | + wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ + && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1 + alpine() { /alpine/enter-chroot -u "$USER" "$@"; } + sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' + alpine make DYNAMIC_ARCH=1 BINARY=64 + alpine make DYNAMIC_ARCH=1 BINARY=64 install + From 89429fdaa2a859c5a1e44fc782a20a03b7fa6540 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 19:03:42 +0200 Subject: [PATCH 304/681] fix typo --- azure-pipelines.yml | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 8bc27eb08..65bc8e680 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -149,9 +149,9 @@ jobs: - job: ALPINE_MUSL pool: - vmImage: 'ubuntu-16.04' + vmImage: 'ubuntu-latest' steps: - - script | + - script: | wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1 alpine() { /alpine/enter-chroot -u "$USER" "$@"; } From d86290edf0edcc5f931c52dce3955348c40949f5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 19:52:04 +0200 Subject: [PATCH 305/681] add sudo for install in Alpine --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 65bc8e680..6a7cc73e4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -157,5 +157,5 @@ jobs: alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' alpine make DYNAMIC_ARCH=1 BINARY=64 - alpine make DYNAMIC_ARCH=1 BINARY=64 install + alpine sudo make DYNAMIC_ARCH=1 BINARY=64 install From c9304199cfe6f7aa9d98b4d397e91edaf9a2929c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 20:12:33 +0200 Subject: [PATCH 306/681] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6a7cc73e4..cf43c0647 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -155,7 +155,7 @@ jobs: wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1 alpine() { /alpine/enter-chroot -u "$USER" "$@"; } - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' + sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 alpine sudo make DYNAMIC_ARCH=1 BINARY=64 install From db57c449dc387d68b247ae0fe73bbb178a71118c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 20:57:21 +0200 Subject: [PATCH 307/681] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cf43c0647..47579aa2a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -157,5 +157,5 @@ jobs: alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 - alpine sudo make DYNAMIC_ARCH=1 BINARY=64 install + alpine echo ""|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install From 14e33e0f7e05e26b2b1cc2ced015c7722b0adc31 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 21:27:53 +0200 Subject: [PATCH 308/681] Handle OPENBLAS_LOOPS in SYR2 benchmark --- benchmark/syr2.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/benchmark/syr2.c b/benchmark/syr2.c index acbc86987..61d1036ea 100644 --- a/benchmark/syr2.c +++ b/benchmark/syr2.c @@ -46,14 +46,17 @@ int main(int argc, char *argv[]){ if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; - blasint m, i, j; + blasint m, i, j, l; blasint inc_x= 1; blasint inc_y= 1; int from = 1; int to = 200; int step = 1; + int loops = 1; - double 
time1; + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; + + double time1,timeg; argc--;argv++; @@ -85,8 +88,9 @@ int main(int argc, char *argv[]){ for(m = from; m <= to; m += step) { - + timeg = 0.; fprintf(stderr, " %6d : ", (int)m); + for (l = 0; l < loops; l++) { for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } @@ -107,8 +111,10 @@ int main(int argc, char *argv[]){ end(); - time1 = getsec(); + timeg += getsec(); + } // loops + time1 = timeg/(double)loops; fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6); From 7e09570e04dc715f98bfcbc2c9374707b29f7d94 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 22:41:49 +0200 Subject: [PATCH 309/681] Update azure-pipelines.yml --- azure-pipelines.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 47579aa2a..261b6877f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -158,4 +158,8 @@ jobs: sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 alpine echo ""|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install - + alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c + alpine echo "#include " >>test_install.c + alpine echo "cpu_set_t* cpu_set = NULL;" >>test_install.c + alpine gcc -I/opt/OpenBLAS/include test_install.c -lopenblas -lpthread -lgfortran -o test_install + From 0266ba7cb67aa3e31dae140442bf38841207cfe4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 Jul 2021 23:21:58 +0200 Subject: [PATCH 310/681] Update azure-pipelines.yml --- azure-pipelines.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 261b6877f..2d7f597c1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -158,6 +158,7 @@ jobs: sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 alpine echo ""|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install + alpine ls -l /opt/OpenBLAS/include alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c alpine echo "#include " >>test_install.c alpine echo "cpu_set_t* cpu_set = NULL;" >>test_install.c From 69560ad3cec3bee4d1dbc7ceeeb9f345f3bfc46c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 07:25:07 +0200 Subject: [PATCH 311/681] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 2d7f597c1..734c50d67 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -157,7 +157,7 @@ jobs: alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 - alpine echo ""|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install + alpine (echo ""|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install) alpine ls -l /opt/OpenBLAS/include alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c alpine echo "#include " >>test_install.c From a27a61bb9adfc0b7adc36ea1945106feb0e03ccf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 08:24:20 +0200 Subject: [PATCH 312/681] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 
734c50d67..368f4120e 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -157,7 +157,7 @@ jobs: alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 - alpine (echo ""|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install) + alpine bash -c "echo ''|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install" alpine ls -l /opt/OpenBLAS/include alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c alpine echo "#include " >>test_install.c From c47e35acee00eb195175ec926aae7aebd7fa1dc9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 09:38:48 +0200 Subject: [PATCH 313/681] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 368f4120e..a9bb43da4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -157,7 +157,7 @@ jobs: alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 - alpine bash -c "echo ''|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install" + alpine sh -c "echo ''|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install" alpine ls -l /opt/OpenBLAS/include alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c alpine echo "#include " >>test_install.c From 8acb6fe3a86c093f993f97d8be14a98c80d10a2c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 11:29:52 +0200 Subject: [PATCH 314/681] Update azure-pipelines.yml --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a9bb43da4..6b4d6fad0 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -157,10 +157,10 @@ jobs: alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 - alpine sh -c "echo ''|sudo -S make DYNAMIC_ARCH=1 BINARY=64 install" - alpine ls -l /opt/OpenBLAS/include + alpine make DYNAMIC_ARCH=1 BINARY=64 PREFIX=mytestdir install" + alpine ls -l mytestdir/include alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c alpine echo "#include " >>test_install.c alpine echo "cpu_set_t* cpu_set = NULL;" >>test_install.c - alpine gcc -I/opt/OpenBLAS/include test_install.c -lopenblas -lpthread -lgfortran -o test_install + alpine gcc -Imytestdir/include test_install.c -lopenblas -lpthread -lgfortran -o test_install From d2693eac04c568bb7201371603ec3c46f657d1c8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 11:54:02 +0200 Subject: [PATCH 315/681] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6b4d6fad0..fa37e46a1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -157,7 +157,7 @@ jobs: alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 - alpine make DYNAMIC_ARCH=1 BINARY=64 PREFIX=mytestdir install" + alpine make DYNAMIC_ARCH=1 BINARY=64 PREFIX=mytestdir install alpine ls -l mytestdir/include alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c alpine echo "#include " 
>>test_install.c From 836c7fb9f5dc52402dad37ef7db8ff47d3870bda Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 14:37:38 +0200 Subject: [PATCH 316/681] Revert addition of test_install target --- Makefile | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index d31cc9c83..555d1c467 100644 --- a/Makefile +++ b/Makefile @@ -34,9 +34,9 @@ endif LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) -SUBDIRS_ALL = $(SUBDIRS) test ctest utest test_install exports benchmark ../laswp ../bench cpp_thread_test +SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test -.PHONY : all libs netlib $(RELA) test ctest test_install shared install +.PHONY : all libs netlib $(RELA) test ctest shared install .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test all :: libs netlib $(RELA) tests shared @@ -155,11 +155,6 @@ endif endif endif -test_install : - mkdir -p install - PREFIX=install $(MAKE) install - $(MAKE) -C test_install all - libs : ifeq ($(CORE), UNKNOWN) $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) @@ -404,5 +399,4 @@ endif @$(MAKE) -C relapack clean @rm -f *.grd Makefile.conf_last config_last.h @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) - @rm -rf install @echo Done. From eba2cd951e5851060dfbf1a2843b967b657b393f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 14:38:49 +0200 Subject: [PATCH 317/681] Revert addition of test_install --- test_install/Makefile | 15 --------------- test_install/test_sched_include.c | 5 ----- 2 files changed, 20 deletions(-) delete mode 100644 test_install/Makefile delete mode 100644 test_install/test_sched_include.c diff --git a/test_install/Makefile b/test_install/Makefile deleted file mode 100644 index 29cd81322..000000000 --- a/test_install/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -# -# tests of installed headers and libs -# - -INSTALLDIR = ../install - -.PHONY: all -all: test_sched_include - -test_sched_include: test_sched_include.c - $(CC) -c -I$(INSTALLDIR)/include $< - -.PHONY: clean -clean: - rm -f *.o diff --git a/test_install/test_sched_include.c b/test_install/test_sched_include.c deleted file mode 100644 index aea35680d..000000000 --- a/test_install/test_sched_include.c +++ /dev/null @@ -1,5 +0,0 @@ -// tests that inclusion of openblas_config.h works with musl - -#include - -cpu_set_t* cpu_set = NULL; From 7bb59fceb73431ab06b49f6c0e19a028ef2f82d7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 16:00:29 +0200 Subject: [PATCH 318/681] Clean up some warnings --- interface/gemm.c | 2 ++ interface/gemv.c | 2 +- interface/ger.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index cd5d00589..10426fd8f 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -126,6 +126,7 @@ void NAME(char *TRANSA, char *TRANSB, #ifdef SMP double MNK; +#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; @@ -144,6 +145,7 @@ void NAME(char *TRANSA, char *TRANSB, #endif #endif #endif +#endif #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) int nodes; diff --git a/interface/gemv.c b/interface/gemv.c index b6c2e6095..1f14cdb2c 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -203,7 +203,7 @@ void CNAME(enum CBLAS_ORDER 
order, if (alpha == ZERO) return; if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { - GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, buffer); + GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL); return; } diff --git a/interface/ger.c b/interface/ger.c index 1c72d51ec..af6ae8606 100644 --- a/interface/ger.c +++ b/interface/ger.c @@ -165,7 +165,7 @@ void CNAME(enum CBLAS_ORDER order, if (alpha == 0.) return; if (incx == 1 && incy == 1 && 1L*m*n <= 2048 *GEMM_MULTITHREAD_THRESHOLD) { - GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer); + GER(m, n, 0, alpha, x, incx, y, incy, a, lda, NULL); return; } From b4cbfe66775063f55eea58c24446b8e8301fcf16 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 18:08:30 +0200 Subject: [PATCH 319/681] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index fa37e46a1..0e806dc91 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -162,5 +162,5 @@ jobs: alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c alpine echo "#include " >>test_install.c alpine echo "cpu_set_t* cpu_set = NULL;" >>test_install.c - alpine gcc -Imytestdir/include test_install.c -lopenblas -lpthread -lgfortran -o test_install + alpine gcc -Imytestdir/include test_install.c -Lmytestdir/lib -lopenblas -lpthread -lgfortran -o test_install From 498479b13e257dcfbbc5600ad405639f378aaf70 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 18:29:17 +0200 Subject: [PATCH 320/681] Update azure-pipelines.yml --- azure-pipelines.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0e806dc91..889b920e3 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -161,6 +161,7 @@ jobs: alpine ls -l mytestdir/include alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c alpine echo "#include " >>test_install.c - alpine echo "cpu_set_t* cpu_set = NULL;" >>test_install.c + alpine echo "int main(){" >> test_install.c + alpine echo "cpu_set_t* cpu_set = NULL;}" >>test_install.c alpine gcc -Imytestdir/include test_install.c -Lmytestdir/lib -lopenblas -lpthread -lgfortran -o test_install From 239ff330f822a3057ff657d11d084cd6e095aa4f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Jul 2021 23:48:39 +0200 Subject: [PATCH 321/681] Update Changelog for 0.3.16 --- Changelog.txt | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 6c5cf573e..8cd101699 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,52 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.16 + 11-Jul-2021 + +common: + - drastically reduced the stack size requirements for running the LAPACK + testsuite (Reference-LAPACK PR 553) + - fixed spurious test failures in the LAPACK testsuite (Reference-LAPACK + PR 564) + - expressly setting DYNAMIC_ARCH=0 no longer enables dynamic_arch mode + - improved performance of xGER, xSPR, xSPR2, xSYR, xSYR2, xTRSV, SGEMV_N + and DGEMV_N, for small input sizes and consecutive arguments + - improved performance of xGETRF, xPORTF and xPOTRI for small input sizes + by disabling multithreading + - fixed installing with BSD versions of the "install" utility + +RISCV: + - fixed the implementation of xIMIN + - improved the performance of 
DSDOT + - fixed linking of the tests on C910V with current vendor gcc + +POWER: +- fixed SBGEMM computation for some odd value inputs +- fixed compilation for PPCG4, PPC970, POWER3, POWER4 and POWER5 + +x86_64: + - improved performance of SGEMV_N and SGEMV_T for small N on AVX512-capable cpus + - worked around a miscompilation of ZGEMM/ZTRMM on Sandybridge with old gcc + versions + - fixed compilation with MS Visual Studio versions older than 2017 + - fixed macro name collision with winnt.h from the latest Win10 SDK + - added cpu type autodetection for Intel Ice Lake SP + - fixed cpu type autodetection for Intel Tiger Lake + - added cpu type autodetection for recent Centaur/Zhaoxin models + - fixed compilation with musl libc + +ARM64: +- fixed compilation with gcc/gfortran on the Apple M1 +- fixed linking of the tests on FreeBSD +- fixed missing restore of a register in the recently rewritten DNRM2 kernel + for ThunderX2 and Neoverse N1 that could cause spurious failures in e.g. + DGEEV +- added compiler optimization flags for the EMAG8180 +- added initial support for Cortex A55 + +ARM: +- fixed linking of the tests on FreeBSD + ==================================================================== Version 0.3.15 2-May-2021 From db4908ebfad311a00e5e2168bee1de5bafb94ca3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Jul 2021 00:08:55 +0200 Subject: [PATCH 322/681] Update version to 0.3.16 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 15f6ba2c2..5810f7965 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 15) +set(OpenBLAS_PATCH_VERSION 16) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 4c81d1c3fe3f10872a2d0dede97a963406b1574c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Jul 2021 00:09:35 +0200 Subject: [PATCH 323/681] Update version to 0.3.16 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 0c138331e..19dd32919 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.15 +VERSION = 0.3.16 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From 649213179297ee2e2edaf21ad5cc557e65da0bb0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Jul 2021 00:16:29 +0200 Subject: [PATCH 324/681] Update version to 0.3.16.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5810f7965..4fffbd39e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 16) +set(OpenBLAS_PATCH_VERSION 16.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 44cc7cdeccb77a2bc928401f1526d787c3b8fd6c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Jul 2021 00:16:59 +0200 Subject: [PATCH 325/681] Update version to 0.3.16.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 19dd32919..bdc9b69e4 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.16 +VERSION = 0.3.16.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 0fca36c8c3c8bcadd8d084294cd84bf16376f0b9 Mon Sep 17 00:00:00 2001 From: JonasZhou Date: Mon, 12 Jul 2021 13:43:45 +0800 Subject: [PATCH 326/681] Add cpu detection support for Zhaoxin processors Signed-off-by: JonasZhou --- cpuid.h | 1 + cpuid_x86.c | 44 +++++++++++++++++++++++++++-------------- driver/others/dynamic.c | 18 ++++++++++++----- 3 files changed, 43 insertions(+), 20 deletions(-) diff --git a/cpuid.h b/cpuid.h index 824e0bc70..2c43922e7 100644 --- a/cpuid.h +++ b/cpuid.h @@ -54,6 +54,7 @@ #define VENDOR_TRANSMETA 9 #define VENDOR_NSC 10 #define VENDOR_HYGON 11 +#define VENDOR_ZHAOXIN 12 #define VENDOR_UNKNOWN 99 #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) diff --git a/cpuid_x86.c b/cpuid_x86.c index 00fc8baa0..5aa49055a 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -283,7 +283,7 @@ int get_vendor(void){ if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX; if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN; if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR; - if (!strcmp(vendor, " Shanghai ")) return VENDOR_CENTAUR; + if (!strcmp(vendor, " Shanghai ")) return VENDOR_ZHAOXIN; if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE; if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; @@ -1067,7 +1067,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_HYGON) || - (get_vendor() == VENDOR_CENTAUR)) { + (get_vendor() == VENDOR_CENTAUR) || + (get_vendor() == VENDOR_ZHAOXIN)) { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); LDTB.size = 4096; @@ -1190,7 +1191,7 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ int get_cpuname(void){ - int family, exfamily, model, vendor, exmodel; + int family, exfamily, model, vendor, exmodel, stepping; if (!have_cpuid()) return CPUTYPE_80386; @@ -1198,6 +1199,7 @@ int get_cpuname(void){ exfamily = get_cputype(GET_EXFAMILY); model = get_cputype(GET_MODEL); exmodel = get_cputype(GET_EXMODEL); + stepping = get_cputype(GET_STEPPING); vendor = get_vendor(); @@ -1628,15 +1630,20 @@ int get_cpuname(void){ switch (family) { case 0x5: 
return CPUTYPE_CENTAURC6; - break; case 0x6: - return CPUTYPE_NANO; - break; - case 0x7: + if (model == 0xf && stepping < 0xe) + return CPUTYPE_NANO; return CPUTYPE_NEHALEM; - break; + default: + if (family >= 0x7) + return CPUTYPE_NEHALEM; + else + return CPUTYPE_VIAC3; } - return CPUTYPE_VIAC3; + } + + if (vendor == VENDOR_ZHAOXIN){ + return CPUTYPE_NEHALEM; } if (vendor == VENDOR_RISE){ @@ -1869,7 +1876,7 @@ char *get_lower_cpunamechar(void){ int get_coretype(void){ - int family, exfamily, model, exmodel, vendor; + int family, exfamily, model, exmodel, vendor, stepping; if (!have_cpuid()) return CORE_80486; @@ -1877,6 +1884,7 @@ int get_coretype(void){ exfamily = get_cputype(GET_EXFAMILY); model = get_cputype(GET_MODEL); exmodel = get_cputype(GET_EXMODEL); + stepping = get_cputype(GET_STEPPING); vendor = get_vendor(); @@ -2286,13 +2294,19 @@ int get_coretype(void){ if (vendor == VENDOR_CENTAUR) { switch (family) { case 0x6: - return CORE_NANO; - break; - case 0x7: + if (model == 0xf && stepping < 0xe) + return CORE_NANO; return CORE_NEHALEM; - break; + default: + if (family >= 0x7) + return CORE_NEHALEM; + else + return CORE_VIAC3; } - return CORE_VIAC3; + } + + if (vendor == VENDOR_ZHAOXIN) { + return CORE_NEHALEM; } return CORE_UNKNOWN; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 1a33870db..071788a9b 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -292,6 +292,7 @@ extern gotoblas_t gotoblas_COOPERLAKE; #define VENDOR_AMD 2 #define VENDOR_CENTAUR 3 #define VENDOR_HYGON 4 +#define VENDOR_ZHAOXIN 5 #define VENDOR_UNKNOWN 99 #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) @@ -404,7 +405,7 @@ static int get_vendor(void){ if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; - if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_CENTAUR; + if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_ZHAOXIN; if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; @@ -415,7 +416,7 @@ static int get_vendor(void){ static gotoblas_t *get_coretype(void){ int eax, ebx, ecx, edx; - int family, exfamily, model, vendor, exmodel; + int family, exfamily, model, vendor, exmodel, stepping; cpuid(1, &eax, &ebx, &ecx, &edx); @@ -423,6 +424,7 @@ static gotoblas_t *get_coretype(void){ exfamily = BITMASK(eax, 20, 0xff); model = BITMASK(eax, 4, 0x0f); exmodel = BITMASK(eax, 16, 0x0f); + stepping = BITMASK(eax, 0, 0x0f); vendor = get_vendor(); @@ -824,13 +826,19 @@ static gotoblas_t *get_coretype(void){ if (vendor == VENDOR_CENTAUR) { switch (family) { case 0x6: - return &gotoblas_NANO; - break; - case 0x7: + if (model == 0xf && stepping < 0xe) + return &gotoblas_NANO; return &gotoblas_NEHALEM; + default: + if (family >= 0x7) + return &gotoblas_NEHALEM; } } + if (vendor == VENDOR_ZHAOXIN) { + return &gotoblas_NEHALEM; + } + return NULL; } From 029d1e16b9a55e77872dac4fab87692b8c7dc2d0 Mon Sep 17 00:00:00 2001 From: Jerome Robert Date: Wed, 14 Jul 2021 12:20:57 +0200 Subject: [PATCH 327/681] Avoid redefinition of _GNU_SOURCE * _GNU_SOURCE may have been set by the application and redefinition trigger warnings or error with -Werror * Fix for 220f6a1c5 --- openblas_config_template.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/openblas_config_template.h b/openblas_config_template.h index 1e17c9a16..6a7382108 100644 --- 
a/openblas_config_template.h +++ b/openblas_config_template.h @@ -99,6 +99,8 @@ typedef int blasint; /* Inclusion of Linux-specific header is needed for definition of cpu_set_t. */ #ifdef OPENBLAS_OS_LINUX -#define _GNU_SOURCE +#ifndef _GNU_SOURCE + #define _GNU_SOURCE +#endif #include #endif From 1dea57ab255c0dbb60228965b8a3249f8f5294e7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Jul 2021 20:32:57 +0200 Subject: [PATCH 328/681] Revert PR #3250 (shortcut without buffer allocation) as it is unsafe on some x86_64 --- interface/gemv.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/interface/gemv.c b/interface/gemv.c index 1f14cdb2c..1f0763579 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -201,12 +201,14 @@ void CNAME(enum CBLAS_ORDER order, if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; - + +#if 0 +/* this optimization causes stack corruption on x86_64 under OSX, Windows and FreeBSD */ if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL); return; } - +#endif IDEBUG_START; FUNCTION_PROFILE_START(); From 5b4b385ecfe9453951f78d4f4efd51518af25bc5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Jul 2021 20:50:14 +0200 Subject: [PATCH 329/681] Temporarily disable the SkylakeX sgemv_t microkernel due to LAPACK testsuite failures --- kernel/x86_64/sgemv_t_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index a36c8ace9..76236cd16 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_haswell-4.c" #elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "sgemv_t_microk_haswell-4.c" -#include "sgemv_t_microk_skylakex.c" +/*#include "sgemv_t_microk_skylakex.c"*/ #endif #if defined(STEAMROLLER) || defined(EXCAVATOR) From a6351e32f0d41056e6d2b6631fb4e5d6655be5c9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Jul 2021 21:09:36 +0200 Subject: [PATCH 330/681] Remove BLASLONG casts from SPARC entries in response to https://github.com/xianyi/OpenBLAS/pull/3266#issuecomment-878637675 --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index 01048023f..965b97466 100644 --- a/param.h +++ b/param.h @@ -2502,7 +2502,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 -#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL +#define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -2534,7 +2534,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 2048 -#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL +#define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 From fe0e66564ecab9627ba9313ab7c116b586b7cf19 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Jul 2021 22:39:20 +0200 Subject: [PATCH 331/681] Declare N_THREADS as *4 for compatibility of INTERFACE64 builds with LLVM libomp --- lapack-netlib/TESTING/EIG/cchkee.F | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/cchkee.F b/lapack-netlib/TESTING/EIG/cchkee.F index de4aed696..ab54078a3 100644 --- a/lapack-netlib/TESTING/EIG/cchkee.F +++ b/lapack-netlib/TESTING/EIG/cchkee.F @@ -1075,7 +1075,8 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. From 2b9443b7e78aa4b5f77e5d4d4cb03205bcdd52fc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Jul 2021 22:40:29 +0200 Subject: [PATCH 332/681] Declare N_THREADS as *4 for compatibility of INTERFACE64 builds with LLVM libomp --- lapack-netlib/TESTING/EIG/dchkee.F | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/dchkee.F b/lapack-netlib/TESTING/EIG/dchkee.F index 00e8eb57f..6399fecef 100644 --- a/lapack-netlib/TESTING/EIG/dchkee.F +++ b/lapack-netlib/TESTING/EIG/dchkee.F @@ -1081,7 +1081,8 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. From f4d4abd423ecf998faa70e09847fd99cdac8888a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Jul 2021 22:41:45 +0200 Subject: [PATCH 333/681] Declare N_THREADS as *4 for compatibility of INTERFACE64 builds with LLVM libomp --- lapack-netlib/TESTING/EIG/schkee.F | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/schkee.F b/lapack-netlib/TESTING/EIG/schkee.F index c3f9ca162..5484a7c26 100644 --- a/lapack-netlib/TESTING/EIG/schkee.F +++ b/lapack-netlib/TESTING/EIG/schkee.F @@ -1081,7 +1081,8 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. From f176ff90af6b1d16f940575ea2f03edc13e5f444 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Jul 2021 22:42:43 +0200 Subject: [PATCH 334/681] Declare N_THREADS as *4 for compatibility of INTERFACE64 builds with LLVM libomp --- lapack-netlib/TESTING/EIG/zchkee.F | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/zchkee.F b/lapack-netlib/TESTING/EIG/zchkee.F index 908b7d651..7e9144d15 100644 --- a/lapack-netlib/TESTING/EIG/zchkee.F +++ b/lapack-netlib/TESTING/EIG/zchkee.F @@ -1075,7 +1075,8 @@ CHARACTER*80 LINE INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, - $ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS + $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. 
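The four N_THREADS patches above (331/681 through 334/681) all guard against the same ABI mismatch: with INTERFACE64=1 the LAPACK test drivers are compiled with -fdefault-integer-8, so a default-kind INTEGER is 64 bits wide, while the Fortran entry point of LLVM's libomp still reads a 32-bit integer for OMP_SET_NUM_THREADS. Declaring N_THREADS as INTEGER*4 keeps both sides of the call at the same width. A minimal C sketch of the mismatch, assuming only the standard libomp signature void omp_set_num_threads(int) — the variable names are illustrative, not part of any patch:

#include <omp.h>      /* declares: void omp_set_num_threads(int); */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* Under -fdefault-integer-8 a plain INTEGER is 64 bits, but the
     * OpenMP runtime interprets only 32 of them - the wrong 32 on a
     * big-endian target. Pinning the declaration to INTEGER*4 (or
     * truncating explicitly, as below) keeps the widths matched.   */
    int64_t wide   = 4;               /* default INTEGER under INTERFACE64 */
    int32_t narrow = 4;               /* INTEGER*4, as in the patches      */

    omp_set_num_threads((int)wide);   /* explicit truncation               */
    omp_set_num_threads(narrow);      /* already the right width           */
    printf("max threads now: %d\n", omp_get_max_threads());
    return 0;
}

Build with the host compiler's OpenMP flag (e.g. cc -fopenmp) for the sketch to link.
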
From da8435dc362a38a71925483081a5694f13240b1f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Jul 2021 14:44:17 +0200 Subject: [PATCH 335/681] Update Changelog for 0.3.17 --- Changelog.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 8cd101699..ee0484e2b 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,5 +1,20 @@ OpenBLAS ChangeLog ==================================================================== +Version 0.3.17 + 15-Jul-2021 + +common: + - reverted the optimization of SGEMV_N/DGEMV_N for small input sizes + and consecutive arguments as it led to stack overflows on x86_64 + with some operating systems (notably OSX and Windows) + + x86_64: + - reverted the performance patch for SGEMV_T on AVX512 as it caused + wrong results in some applications + + SPARC: + - fixed compilation with compilers other than gcc +==================================================================== Version 0.3.16 11-Jul-2021 From 4777eb678faccd7af014289966898009940731a8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Jul 2021 14:46:24 +0200 Subject: [PATCH 336/681] Update version to 0.3.17 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5810f7965..37191a42b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 16) +set(OpenBLAS_PATCH_VERSION 17) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From d909f9f3d4fc4ccff36d69f178558df154ba1002 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Jul 2021 14:52:54 +0200 Subject: [PATCH 337/681] Update version to 0.3.17 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index bdc9b69e4..2e0980fa9 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.16.dev +VERSION = 0.3.17 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From 6e3fbe8ac5a405149ebd6acaad6a4c88d3e07215 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Jul 2021 14:59:15 +0200 Subject: [PATCH 338/681] Update version to 0.3.17.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 37191a42b..0330b2ce7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 17) +set(OpenBLAS_PATCH_VERSION 17.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 38d5b4b1241f60ab533f136b2d8e61eef1f5062e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Jul 2021 15:00:01 +0200 Subject: [PATCH 339/681] Update version to 0.3.17.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 2e0980fa9..7c04a3101 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.17 +VERSION = 0.3.17.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 49bbf330ca592f439a07f24f137e61af1cc9c616 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 18 Jul 2021 22:19:19 +0200 Subject: [PATCH 340/681] Empirical workaround for numpy SVD NaN problem from issue 3318 --- kernel/Makefile.L2 | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 index 888a9b959..ac53c29c3 100644 --- a/kernel/Makefile.L2 +++ b/kernel/Makefile.L2 @@ -1,3 +1,10 @@ +FMAFLAG= +ifndef OLDGCC +ifdef HAVE_FMA3 +FMAFLAG = -mfma +endif +endif + ### GEMV ### ifndef SGEMVNKERNEL @@ -263,7 +270,7 @@ $(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KER $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@ $(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) - $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ + $(CC) -c $(CFLAGS) $(FMAFLAG) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ endif $(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL) From 30f23be0f94c7041b7e3bb53a4a0236355cdabad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Jul 2021 12:00:03 +0200 Subject: [PATCH 341/681] Rework setting of -mfma to only apply it where necessary --- cmake/cc.cmake | 6 +++--- cmake/system.cmake | 10 +++++----- cmake/utils.cmake | 10 +++++++++- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 76952152b..ac5e455d5 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -124,9 +124,9 @@ if (NOT DYNAMIC_ARCH) if (HAVE_AVX) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") endif () - if (HAVE_FMA3) - set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") - endif () + # if (HAVE_FMA3) + #set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") + #endif () if (HAVE_SSE) set (CCOMMON_OPT "${CCOMMON_OPT} -msse") endif () diff --git a/cmake/system.cmake b/cmake/system.cmake index 34874827c..f8bd6678e 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -186,11 +186,11 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() - if (DEFINED HAVE_FMA3) - if (NOT 
NO_AVX2) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") - endif() - endif() + # if (DEFINED HAVE_FMA3) + # if (NOT NO_AVX2) + # set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") + # endif() + # endif() if (DEFINED HAVE_SSE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") endif() diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 794d73d06..2c1a1c763 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -311,7 +311,15 @@ function(GenerateNamedObjects sources_in) configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY) file(REMOVE ${new_source_file}.tmp) list(APPEND SRC_LIST_OUT ${new_source_file}) - + message (STATUS ${new_source_file}) + if (DEFINED HAVE_FMA3) + if ( ${new_source_file} MATCHES "(s|d?)rot_k.c") + set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") + endif () + if ( ${new_source_file} MATCHES "dgemv_t_k.c") + set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") + endif () + endif () endforeach () endforeach () From 47ba85f314808476c8254779389607f9af60231f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 22 Jul 2021 17:24:15 +0200 Subject: [PATCH 342/681] Fix regex to match kernels suffixed with cpuname too --- cmake/utils.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 2c1a1c763..6b54092ea 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -313,10 +313,10 @@ function(GenerateNamedObjects sources_in) list(APPEND SRC_LIST_OUT ${new_source_file}) message (STATUS ${new_source_file}) if (DEFINED HAVE_FMA3) - if ( ${new_source_file} MATCHES "(s|d?)rot_k.c") + if ( ${new_source_file} MATCHES "(s|d?)rot_k.*c") set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") endif () - if ( ${new_source_file} MATCHES "dgemv_t_k.c") + if ( ${new_source_file} MATCHES "dgemv_t_k.*c") set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") endif () endif () From efbd7c7840f01f6479fb0224ff473c3166eee669 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 23 Jul 2021 13:42:52 +0200 Subject: [PATCH 343/681] GCC did not support -mtune for ARM64 before 5.1 --- Makefile.arm64 | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Makefile.arm64 b/Makefile.arm64 index c23a0876e..2656a17f9 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -1,4 +1,15 @@ ifneq ($(C_COMPILER), PGI) + +ifneq ($(GCCVERSIONGT4), 1) +CCOMMON_OPT += -march=armv8-a +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a +endif + + +else + + ifeq ($(CORE), ARMV8) CCOMMON_OPT += -march=armv8-a ifneq ($(F_COMPILER), NAG) @@ -138,4 +149,7 @@ FCOMMON_OPT += -march=armv8-a -mtune=emag endif endif endif + endif + +endif \ No newline at end of file From af0a69f355a086d70cc08ccda8bde7a48b3133c4 Mon Sep 17 00:00:00 2001 From: gxw Date: Mon, 26 Jul 2021 15:44:54 +0800 Subject: [PATCH 344/681] Add support for LOONGARCH64 --- Makefile.loongarch64 | 3 + Makefile.system | 12 + TargetList.txt | 2 + c_check | 53 +- common.h | 6 +- common_loongarch64.h | 199 ++ common_macro.h | 3 +- cpuid_loongarch64.c | 110 + ctest.c | 4 + getarch.c | 24 +- kernel/loongarch64/KERNEL | 236 ++ kernel/loongarch64/KERNEL.LOONGSON3R5 | 1 + kernel/loongarch64/KERNEL.generic | 167 ++ kernel/loongarch64/Makefile | 1 + kernel/loongarch64/amax.S | 230 ++ kernel/loongarch64/amin.S | 186 ++ kernel/loongarch64/asum.S | 232 ++ kernel/loongarch64/cnrm2.S | 159 ++ kernel/loongarch64/copy.S | 225 ++ 
kernel/loongarch64/dnrm2.S | 314 +++ kernel/loongarch64/dot.S | 391 ++++ kernel/loongarch64/gemm_kernel.S | 1859 ++++++++++++++++ kernel/loongarch64/gemv_n.S | 531 +++++ kernel/loongarch64/gemv_t.S | 436 ++++ kernel/loongarch64/iamax.S | 233 ++ kernel/loongarch64/iamin.S | 233 ++ kernel/loongarch64/izamax.S | 217 ++ kernel/loongarch64/izamin.S | 217 ++ kernel/loongarch64/max.S | 174 ++ kernel/loongarch64/min.S | 174 ++ kernel/loongarch64/scal.S | 330 +++ kernel/loongarch64/snrm2.S | 249 +++ kernel/loongarch64/swap.S | 330 +++ kernel/loongarch64/trsm_kernel_LN.S | 2863 +++++++++++++++++++++++++ kernel/loongarch64/trsm_kernel_LT.S | 2854 ++++++++++++++++++++++++ kernel/loongarch64/trsm_kernel_RT.S | 2850 ++++++++++++++++++++++++ kernel/loongarch64/zamax.S | 190 ++ kernel/loongarch64/zamin.S | 198 ++ kernel/loongarch64/zasum.S | 158 ++ kernel/loongarch64/zcopy.S | 217 ++ kernel/loongarch64/zdot.S | 330 +++ kernel/loongarch64/zgemm3m_kernel.S | 1359 ++++++++++++ kernel/loongarch64/zgemm_kernel.S | 1047 +++++++++ kernel/loongarch64/zgemv_n.S | 648 ++++++ kernel/loongarch64/zgemv_t.S | 556 +++++ kernel/loongarch64/znrm2.S | 304 +++ kernel/loongarch64/zscal.S | 356 +++ kernel/loongarch64/ztrsm_kernel_LT.S | 1344 ++++++++++++ kernel/loongarch64/ztrsm_kernel_RT.S | 1343 ++++++++++++ lapack/laswp/loongarch64/Makefile | 12 + param.h | 46 + 51 files changed, 24189 insertions(+), 27 deletions(-) create mode 100644 Makefile.loongarch64 create mode 100644 common_loongarch64.h create mode 100644 cpuid_loongarch64.c create mode 100644 kernel/loongarch64/KERNEL create mode 100644 kernel/loongarch64/KERNEL.LOONGSON3R5 create mode 100644 kernel/loongarch64/KERNEL.generic create mode 100644 kernel/loongarch64/Makefile create mode 100644 kernel/loongarch64/amax.S create mode 100644 kernel/loongarch64/amin.S create mode 100644 kernel/loongarch64/asum.S create mode 100644 kernel/loongarch64/cnrm2.S create mode 100644 kernel/loongarch64/copy.S create mode 100644 kernel/loongarch64/dnrm2.S create mode 100644 kernel/loongarch64/dot.S create mode 100644 kernel/loongarch64/gemm_kernel.S create mode 100644 kernel/loongarch64/gemv_n.S create mode 100644 kernel/loongarch64/gemv_t.S create mode 100644 kernel/loongarch64/iamax.S create mode 100644 kernel/loongarch64/iamin.S create mode 100644 kernel/loongarch64/izamax.S create mode 100644 kernel/loongarch64/izamin.S create mode 100644 kernel/loongarch64/max.S create mode 100644 kernel/loongarch64/min.S create mode 100644 kernel/loongarch64/scal.S create mode 100644 kernel/loongarch64/snrm2.S create mode 100644 kernel/loongarch64/swap.S create mode 100644 kernel/loongarch64/trsm_kernel_LN.S create mode 100644 kernel/loongarch64/trsm_kernel_LT.S create mode 100644 kernel/loongarch64/trsm_kernel_RT.S create mode 100644 kernel/loongarch64/zamax.S create mode 100644 kernel/loongarch64/zamin.S create mode 100644 kernel/loongarch64/zasum.S create mode 100644 kernel/loongarch64/zcopy.S create mode 100644 kernel/loongarch64/zdot.S create mode 100644 kernel/loongarch64/zgemm3m_kernel.S create mode 100644 kernel/loongarch64/zgemm_kernel.S create mode 100644 kernel/loongarch64/zgemv_n.S create mode 100644 kernel/loongarch64/zgemv_t.S create mode 100644 kernel/loongarch64/znrm2.S create mode 100644 kernel/loongarch64/zscal.S create mode 100644 kernel/loongarch64/ztrsm_kernel_LT.S create mode 100644 kernel/loongarch64/ztrsm_kernel_RT.S create mode 100644 lapack/laswp/loongarch64/Makefile diff --git a/Makefile.loongarch64 b/Makefile.loongarch64 new file mode 100644 index 
000000000..05ea9c679 --- /dev/null +++ b/Makefile.loongarch64 @@ -0,0 +1,3 @@ +ifdef BINARY64 +else +endif diff --git a/Makefile.system b/Makefile.system index bb8c60e91..4084390db 100644 --- a/Makefile.system +++ b/Makefile.system @@ -780,6 +780,11 @@ NO_BINARY_MODE = 1 BINARY_DEFINED = 1 endif +ifeq ($(ARCH), loongarch64) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + # # C Compiler dependent settings @@ -850,6 +855,13 @@ ifeq ($(OSNAME), AIX) BINARY_DEFINED = 1 endif +ifeq ($(ARCH), loongarch64) +ifeq ($(CORE), LOONGSONG3R5) +CCOMMON_OPT += -march=loongarch64 -mabi=lp64 +FCOMMON_OPT += -march=loongarch64 -mabi=lp64 +endif +endif + endif ifndef BINARY_DEFINED diff --git a/TargetList.txt b/TargetList.txt index f93a629d8..963545cdd 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -110,3 +110,5 @@ Z14 RISCV64_GENERIC C910V +11.LOONGARCH64: +LOONGSON3R5 diff --git a/c_check b/c_check index e24943a29..030f5e632 100644 --- a/c_check +++ b/c_check @@ -82,18 +82,19 @@ $os = Interix if ($data =~ /OS_INTERIX/); $os = Android if ($data =~ /OS_ANDROID/); $os = Haiku if ($data =~ /OS_HAIKU/); -$architecture = x86 if ($data =~ /ARCH_X86/); -$architecture = x86_64 if ($data =~ /ARCH_X86_64/); -$architecture = power if ($data =~ /ARCH_POWER/); -$architecture = mips if ($data =~ /ARCH_MIPS/); -$architecture = mips64 if ($data =~ /ARCH_MIPS64/); -$architecture = alpha if ($data =~ /ARCH_ALPHA/); -$architecture = sparc if ($data =~ /ARCH_SPARC/); -$architecture = ia64 if ($data =~ /ARCH_IA64/); -$architecture = arm if ($data =~ /ARCH_ARM/); -$architecture = arm64 if ($data =~ /ARCH_ARM64/); -$architecture = zarch if ($data =~ /ARCH_ZARCH/); -$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); +$architecture = x86 if ($data =~ /ARCH_X86/); +$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = power if ($data =~ /ARCH_POWER/); +$architecture = mips if ($data =~ /ARCH_MIPS/); +$architecture = mips64 if ($data =~ /ARCH_MIPS64/); +$architecture = alpha if ($data =~ /ARCH_ALPHA/); +$architecture = sparc if ($data =~ /ARCH_SPARC/); +$architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); +$architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); +$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); $defined = 0; @@ -143,6 +144,11 @@ if ($architecture eq "riscv64") { $binary = 64; } +if ($architecture eq "loongarch64") { + $defined = 1; + $binary = 64; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); @@ -215,17 +221,18 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { } } -$architecture = x86 if ($data =~ /ARCH_X86/); -$architecture = x86_64 if ($data =~ /ARCH_X86_64/); -$architecture = power if ($data =~ /ARCH_POWER/); -$architecture = mips if ($data =~ /ARCH_MIPS/); -$architecture = mips64 if ($data =~ /ARCH_MIPS64/); -$architecture = alpha if ($data =~ /ARCH_ALPHA/); -$architecture = sparc if ($data =~ /ARCH_SPARC/); -$architecture = ia64 if ($data =~ /ARCH_IA64/); -$architecture = arm if ($data =~ /ARCH_ARM/); -$architecture = arm64 if ($data =~ /ARCH_ARM64/); -$architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = x86 if ($data =~ /ARCH_X86/); +$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = power if ($data =~ /ARCH_POWER/); +$architecture = mips if ($data =~ /ARCH_MIPS/); +$architecture = mips64 if ($data =~ 
/ARCH_MIPS64/); +$architecture = alpha if ($data =~ /ARCH_ALPHA/); +$architecture = sparc if ($data =~ /ARCH_SPARC/); +$architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); +$architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); diff --git a/common.h b/common.h index ac795937c..ff5254a5c 100644 --- a/common.h +++ b/common.h @@ -449,7 +449,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips.h" #endif - + #ifdef ARCH_RISCV64 #include "common_riscv64.h" #endif @@ -470,6 +470,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_zarch.h" #endif +#ifdef ARCH_LOONGARCH64 +#include "common_loongarch64.h" +#endif + #ifndef ASSEMBLER #ifdef OS_WINDOWSSTORE typedef char env_var_t[MAX_PATH]; diff --git a/common_loongarch64.h b/common_loongarch64.h new file mode 100644 index 000000000..959e7e58a --- /dev/null +++ b/common_loongarch64.h @@ -0,0 +1,199 @@ +/***************************************************************************** +Copyright (c) 2011-2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_LOONGARCH64 +#define COMMON_LOONGARCH64 + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() +#define RMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#ifdef DOUBLE +#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#else + +#ifdef DOUBLE +#define LD fld.d +#define ST fst.d +#define MADD fmadd.d +#define NMADD fnmadd.d +#define MSUB fmsub.d +#define NMSUB fnmsub.d +#define ADD fadd.d +#define SUB fsub.d +#define MUL fmul.d +#define MOV fmov.d +#define CMOVT fsel +#define MTC movgr2fr.d +#define FABS fabs.d +#define CMPEQ fcmp.ceq.d +#define CMPLE fcmp.cle.d +#define CMPLT fcmp.clt.d +#define NEG fneg.d +#else +#define LD fld.s +#define ST fst.s +#define MADD fmadd.s +#define NMADD fnmadd.s +#define MSUB fmsub.s +#define NMSUB fnmsub.s +#define ADD fadd.s +#define SUB fsub.s +#define MUL fmul.s +#define MOV fmov.s +#define CMOVT fsel +#define MTC movgr2fr.w +#define FABS fabs.s +#define CMPEQ fcmp.ceq.s +#define CMPLE fcmp.cle.s +#define CMPLT fcmp.clt.s +#define NEG fneg.s +#endif /* defined(DOUBLE) */ + +#if defined(__64BIT__) && defined(USE64BITINT) +#define LDINT ld.d +#define LDARG ld.d +#define SDARG st.d +#elif defined(__64BIT__) && !defined(USE64BITINT) +#define LDINT ld.w +#define LDARG ld.d +#define SDARG st.d +#else +#define LDINT ld.w +#define LDARG ld.w +#define SDARG st.w +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif /* defined(F_INTERFACE) */ + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .text ;\ + .align 5 ;\ + .globl REALNAME ;\ + .type REALNAME, @function ;\ +REALNAME: ;\ + +#if defined(__linux__) && defined(__ELF__) +#define GNUSTACK .section .note.GNU-stack,"",@progbits +#else +#define GNUSTACK +#endif /* defined(__linux__) && defined(__ELF__) */ + +#define EPILOGUE \ + .end REALNAME ;\ + GNUSTACK + +#define PROFCODE + +#define MOVT(dst, src, cc) \ + bceqz cc, 1f; \ + add.d dst, src, $r0; \ + 1: + +#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */ + +#endif /* defined(ASSEMBLER) */ + +#define 
SEEK_ADDRESS + +#define BUFFER_SIZE ( 32 << 20) + +#define PAGESIZE (16UL << 1) +#define FIXED_PAGESIZE (16UL << 10) +#define HUGE_PAGESIZE ( 2 << 20) + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +#endif diff --git a/common_macro.h b/common_macro.h index c6ea1bfd9..0136f18ab 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2490,7 +2490,8 @@ #endif #ifndef ASSEMBLER -#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ +|| defined(ARCH_LOONGARCH64) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; extern BLASLONG sbgemm_p; diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c new file mode 100644 index 000000000..79b186bf1 --- /dev/null +++ b/cpuid_loongarch64.c @@ -0,0 +1,110 @@ +/***************************************************************************** +Copyright (c) 2011-2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +**********************************************************************************/ + +#include + +#define CPU_UNKNOWN 0 +#define CPU_LOONGSON3R5 1 + +#define LOONGARCH_CFG2 0x02 +#define LOONGARCH_LASX 1<<7 + +static char *cpuname[] = { + "UNKNOWN", + "LOONGSON3R5" +}; + +int detect(void) { + uint32_t reg = 0; + + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg) + : "r"(LOONGARCH_CFG2) + ); + + if (reg & LOONGARCH_LASX) + return CPU_LOONGSON3R5; + else + return CPU_UNKNOWN; +} + +char *get_corename(void) { + return cpuname[detect()]; +} + +void get_architecture(void) { + printf("LOONGARCH64"); +} + +void get_subarchitecture(void) { + if (detect() == CPU_LOONGSON3R5) { + printf("LOONGSON3R5"); + } else { + printf("UNKNOWN"); + } +} + +void get_subdirname(void) { + printf("loongarch64"); +} + +void get_cpuconfig(void) { + if (detect() == CPU_LOONGSON3R5) { + printf("#define LOONGSON3R5\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + } else { + printf("#define LOONGSON3R5\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + } +} + +void get_libname(void){ + if (detect() == CPU_LOONGSON3R5) { + printf("loongson3r5\n"); + } else { + printf("loongarch64\n"); + } +} diff --git a/ctest.c b/ctest.c index d674a8cbd..4f18918f5 100644 --- a/ctest.c +++ b/ctest.c @@ -157,6 +157,10 @@ ARCH_ARM64 ARCH_RISCV64 #endif +#ifdef __loongarch64 +ARCH_LOONGARCH64 +#endif + #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) HAVE_C11 #endif diff --git a/getarch.c b/getarch.c index 3bc8a0c3d..6e43616f7 100644 --- a/getarch.c +++ b/getarch.c @@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_SICORTEX */ /* #define FORCE_LOONGSON3R3 */ /* #define FORCE_LOONGSON3R4 */ +/* #define FORCE_LOONGSON3R5 */ /* #define FORCE_I6400 */ /* #define FORCE_P6600 */ /* #define FORCE_P5600 */ @@ -842,6 +843,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_LOONGSON3R5 +#define FORCE +#define ARCHITECTURE "LOONGARCH" +#define SUBARCHITECTURE "LOONGSON3R5" +#define SUBDIRNAME "loongarch64" +#define ARCHCONFIG "-DLOONGSON3R5 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " +#define LIBNAME "loongson3r5" +#define CORENAME "LOONGSON3R5" +#else +#endif + #ifdef FORCE_I6400 #define FORCE #define ARCHITECTURE "MIPS" @@ -1388,6 +1403,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define OPENBLAS_SUPPORTED #endif +#ifdef __loongarch64 +#include "cpuid_loongarch64.c" +#define OPENBLAS_SUPPORTED +#endif + #ifdef __riscv #include "cpuid_riscv64.c" #define OPENBLAS_SUPPORTED @@ -1463,7 +1483,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -1611,7 +1631,7 @@ printf("ELF_VERSION=2\n"); #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif diff --git a/kernel/loongarch64/KERNEL b/kernel/loongarch64/KERNEL new file mode 100644 index 000000000..e96a90e72 --- /dev/null +++ b/kernel/loongarch64/KERNEL @@ -0,0 +1,236 @@ +ifndef SAXPYKERNEL +SAXPYKERNEL = ../arm/axpy.c +endif + +ifndef DAXPYKERNEL +DAXPYKERNEL = ../arm/axpy.c +endif + +ifndef CAXPYKERNEL +CAXPYKERNEL = ../arm/zaxpy.c +endif + +ifndef ZAXPYKERNEL +ZAXPYKERNEL = ../arm/zaxpy.c +endif + +ifndef SROTKERNEL +SROTKERNEL = ../arm/rot.c +endif + +ifndef DROTKERNEL +DROTKERNEL = ../arm/rot.c +endif + +ifndef CROTKERNEL +CROTKERNEL = ../arm/zrot.c +endif + +ifndef ZROTKERNEL +ZROTKERNEL = ../arm/zrot.c +endif + +ifndef CSWAPKERNEL +CSWAPKERNEL = ../arm/zswap.c +endif + +ifndef ZSWAPKERNEL +ZSWAPKERNEL = ../arm/zswap.c +endif + +ifndef SSUMKERNEL +SSUMKERNEL = ../arm/sum.c +endif + +ifndef DSUMKERNEL +DSUMKERNEL = ../arm/sum.c +endif + +ifndef CSUMKERNEL +CSUMKERNEL = ../arm/zsum.c +endif + +ifndef ZSUMKERNEL +ZSUMKERNEL = ../arm/zsum.c +endif + +ifndef ISMAXKERNEL +ISMAXKERNEL = ../arm/imax.c +endif + +ifndef IDMAXKERNEL +IDMAXKERNEL = ../arm/imax.c +endif + +ifndef ISMINKERNEL +ISMINKERNEL = ../arm/imin.c +endif + +ifndef IDMINKERNEL +IDMINKERNEL = ../arm/imin.c +endif + +ifndef SNRM2KERNEL +SNRM2KERNEL = snrm2.S +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = dnrm2.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = cnrm2.S +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.S +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +ifndef SGEMMKERNEL +SGEMMKERNEL = gemm_kernel.S +SGEMMINCOPY = ../generic/gemm_ncopy_2.c +SGEMMITCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o +endif + +ifndef DGEMMKERNEL +DGEMMKERNEL = gemm_kernel.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_8.c +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o +endif + 
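+# The ifndef guards above and below let a core-specific file such as
+# KERNEL.LOONGSON3R5 pre-set any of these kernel variables; whatever it
+# leaves unset falls back to the generic implementation named here.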
+ifndef CGEMMKERNEL +CGEMMKERNEL = zgemm_kernel.S +CGEMMINCOPY = ../generic/zgemm_ncopy_1.c +CGEMMITCOPY = ../generic/zgemm_tcopy_1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o +endif + +ifndef ZGEMMKERNEL +ZGEMMKERNEL = zgemm_kernel.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o +endif + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + +ifndef STRSMKERNEL_LN +STRSMKERNEL_LN = trsm_kernel_LN.S +endif + +ifndef STRSMKERNEL_LT +STRSMKERNEL_LT = trsm_kernel_LT.S +endif + +ifndef STRSMKERNEL_RN +STRSMKERNEL_RN = trsm_kernel_LT.S +endif + +ifndef STRSMKERNEL_RT +STRSMKERNEL_RT = trsm_kernel_RT.S +endif + +ifndef DTRSMKERNEL_LN +DTRSMKERNEL_LN = trsm_kernel_LN.S +endif + +ifndef DTRSMKERNEL_LT +DTRSMKERNEL_LT = trsm_kernel_LT.S +endif + +ifndef DTRSMKERNEL_RN +DTRSMKERNEL_RN = trsm_kernel_LT.S +endif + +ifndef DTRSMKERNEL_RT +DTRSMKERNEL_RT = trsm_kernel_RT.S +endif + +ifndef CTRSMKERNEL_LN +CTRSMKERNEL_LN = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_LT +CTRSMKERNEL_LT = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_RN +CTRSMKERNEL_RN = ztrsm_kernel_LT.S +endif + +ifndef CTRSMKERNEL_RT +CTRSMKERNEL_RT = ztrsm_kernel_RT.S +endif + +ifndef ZTRSMKERNEL_LN +ZTRSMKERNEL_LN = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_LT +ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_RN +ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +endif + +ifndef ZTRSMKERNEL_RT +ZTRSMKERNEL_RT = ztrsm_kernel_RT.S +endif + +ifndef CGEMM3MKERNEL +CGEMM3MKERNEL = zgemm3m_kernel.S +endif + +ifndef ZGEMM3MKERNEL +ZGEMM3MKERNEL = zgemm3m_kernel.S +endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 new file mode 100644 index 000000000..cce4093e3 --- /dev/null +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -0,0 +1 @@ +#TODO: Add loongarch64 SIMD optimizations diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic new file mode 100644 index 000000000..105b2f6fd --- /dev/null +++ b/kernel/loongarch64/KERNEL.generic @@ -0,0 +1,167 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + 
+ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Pure C for other kernels +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../generic/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = 
../generic/zhemv_k.c + +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/loongarch64/Makefile b/kernel/loongarch64/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/loongarch64/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/kernel/loongarch64/amax.S b/kernel/loongarch64/amax.S new file mode 100644 index 000000000..4b135c522 --- /dev/null +++ b/kernel/loongarch64/amax.S @@ -0,0 +1,230 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 + +#define I $r17 +#define TEMP $r18 + +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 + +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 + +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + + LD a1, X, 0 * SIZE + addi.d N, N, -1 + + add.d X, X, INCX + FABS s1, a1 + + FABS s2, a1 + bge $r0, N, .L999 + + FABS s3, a1 + srai.d I, N, 3 + + FABS s4, a1 + bge $r0, I, .L15 + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + + FABS t1, a1 + + CMPLT $fcc0, s1, t1 + + CMOVT s1, s1, t1, $fcc0 + + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/amin.S b/kernel/loongarch64/amin.S new file mode 100644 index 000000000..ff9978f26 --- /dev/null +++ b/kernel/loongarch64/amin.S @@ -0,0 +1,186 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + FABS s1, a1 + FABS s2, a1 + bge $r0, N, .L999 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, t1, s1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, 
a4 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 +.L15: + andi I, N, 7 +NOP + bge $r0, I, .L998 + .align 3 +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + .align 3 +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/asum.S b/kernel/loongarch64/asum.S new file mode 100644 index 000000000..e4c717085 --- /dev/null +++ b/kernel/loongarch64/asum.S @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f23 +#define a2 $f9 +#define a3 $f10 +#define a4 $f11 +#define a5 $f12 +#define a6 $f13 +#define a7 $f14 +#define a8 $f15 +#define t1 $f16 +#define t2 $f17 +#define t3 $f0 +#define t4 $f1 +#define s1 $f22 +#define s2 $f8 + PROLOGUE +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + MTC s1, $r0 + MTC s2, $r0 + slli.d INCX, INCX, BASE_SHIFT + li TEMP, SIZE + bge $r0, N, .L999 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + FABS t1, a1 + LD a6, X, 5 * SIZE + FABS t2, a2 + LD a7, X, 6 * SIZE + FABS t3, a3 + FABS t4, a4 + addi.d I, I, -1 + LD a8, X, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + ADD s1, s1, t1 + LD a1, X, 8 * SIZE + FABS t1, a5 + addi.d I, I, -1 + ADD s2, s2, t2 + LD a2, X, 9 * SIZE + FABS t2, a6 + NOP + ADD s1, s1, t3 + LD a3, X, 10 * SIZE + FABS t3, a7 + NOP + ADD s2, s2, t4 + LD a4, X, 11 * SIZE + FABS t4, a8 + addi.d X, X, 8 * SIZE + ADD s1, s1, t1 + LD a5, X, 4 * SIZE + FABS t1, a1 + NOP + ADD s2, s2, t2 + LD a6, X, 5 * SIZE + FABS t2, a2 + NOP + ADD s1, s1, t3 + LD a7, X, 6 * SIZE + FABS t3, a3 + NOP + ADD s2, s2, t4 + LD a8, X, 7 * SIZE + FABS t4, a4 + blt $r0, I, .L12 + .align 3 +.L13: + ADD s1, s1, t1 + addi.d X, X, 8 * SIZE + FABS t1, a5 + NOP + ADD s2, s2, t2 + FABS t2, a6 + ADD s1, s1, t3 + FABS t3, a7 + ADD s2, s2, t4 + FABS t4, a8 + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + ADD s1, s1, t1 + addi.d X, X, SIZE + blt $r0, I, .L16 + b .L999 + .align 3 +.L20: + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + FABS t1, a1 + LD a7, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a8, X, 0 * SIZE + FABS t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 +.L23: + ADD s1, s1, t1 + LD a1, X, 0 * SIZE + FABS t1, a5 + add.d X, X, INCX + ADD s2, s2, t2 + LD a2, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + ADD s1, s1, t3 + LD a3, X, 0 * SIZE + FABS t3, a7 + add.d X, X, INCX + ADD s2, s2, t4 + LD a4, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + ADD s1, s1, t1 + LD a5, X, 0 * SIZE + FABS t1, a1 + add.d X, X, INCX + ADD s2, s2, t2 + LD a6, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + ADD s1, s1, t3 + LD a7, X, 0 * SIZE + FABS t3, a3 + add.d X, X, INCX + ADD s2, s2, t4 + LD a8, X, 0 * SIZE + FABS t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + .align 3 +.L24: + ADD s1, s1, t1 + FABS t1, a5 + ADD s2, s2, t2 + FABS t2, a6 + ADD s1, s1, t3 + FABS t3, a7 + ADD s2, s2, t4 + FABS t4, a8 + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L26: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + add.d X, X, INCX + ADD s1, s1, t1 + blt $r0, I, .L26 + .align 3 +.L999: + ADD s1, s1, s2 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/cnrm2.S b/kernel/loongarch64/cnrm2.S new file mode 100644 index 
000000000..c4b2555d3 --- /dev/null +++ b/kernel/loongarch64/cnrm2.S @@ -0,0 +1,159 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define a5 $f16 +#define a6 $f17 +#define a7 $f0 +#define a8 $f1 +#define s1 $f22 +#define s2 $f8 +#define t1 $f23 +#define t2 $f9 +#define t3 $f10 +#define t4 $f11 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + movgr2fr.d s1, $r0 + li TEMP, 2 * SIZE + fmov.d s2, s1 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + srai.d I, N, 2 + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + fcvt.d.s t1, a1 + LD a7, X, 0 * SIZE + fcvt.d.s t2, a2 + LD a8, X, 1 * SIZE + fcvt.d.s t3, a3 + addi.d I, I, -1 + fcvt.d.s t4, a4 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 + +.L23: + fmadd.d s1, t1, t1, s1 + LD a1, X, 0 * SIZE + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + LD a2, X, 1 * SIZE + fcvt.d.s t2, a6 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a3, X, 0 * SIZE + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + LD a4, X, 1 * SIZE + fcvt.d.s t4, a8 + add.d X, X, INCX + fmadd.d s1, t1, t1, s1 + LD a5, X, 0 * SIZE + fcvt.d.s t1, a1 + addi.d I, I, -1 + fmadd.d s2, t2, t2, s2 + LD a6, X, 1 * SIZE + fcvt.d.s t2, a2 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a7, X, 0 * SIZE + fcvt.d.s t3, a3 + LD a8, X, 1 * SIZE + fmadd.d s2, t4, t4, s2 + add.d X, X, INCX + fcvt.d.s t4, a4 + blt $r0, I, .L23 + .align 3 + +.L24: + fmadd.d s1, t1, t1, s1 + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + fcvt.d.s t2, a6 + fmadd.d s1, t3, t3, s1 + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + fcvt.d.s 
t4, a8 + fmadd.d s1, t1, t1, s1 + fmadd.d s2, t2, t2, s2 + fmadd.d s1, t3, t3, s1 + fmadd.d s2, t4, t4, s2 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + fcvt.d.s t2, a2 + fmadd.d s1, t1, t1, s1 + add.d X, X, INCX + fmadd.d s2, t2, t2, s2 + blt $r0, I, .L26 + .align 3 + +.L999: + fadd.d s1, s1, s2 + fsqrt.d s1, s1 + move $r4, $r17 + fcvt.s.d $f0, s1 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/copy.S b/kernel/loongarch64/copy.S new file mode 100644 index 000000000..28b7bce4c --- /dev/null +++ b/kernel/loongarch64/copy.S @@ -0,0 +1,225 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + li TEMP, SIZE + NOP + slli.d INCX, INCX, BASE_SHIFT + bge $r0, N, .L999 + slli.d INCY, INCY, BASE_SHIFT + bne INCX, TEMP, .L20 + srai.d I, N, 3 + bne INCY, TEMP, .L20 + addi.d I, I, -1 + blt I, $r0, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + LD a7, X, 6 * SIZE + LD a8, X, 7 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: + ST a1, Y, 0 * SIZE + LD a1, X, 8 * SIZE + ST a2, Y, 1 * SIZE + LD a2, X, 9 * SIZE + ST a3, Y, 2 * SIZE + LD a3, X, 10 * SIZE + ST a4, Y, 3 * SIZE + LD a4, X, 11 * SIZE + ST a5, Y, 4 * SIZE + LD a5, X, 12 * SIZE + ST a6, Y, 5 * SIZE + LD a6, X, 13 * SIZE + ST a7, Y, 6 * SIZE + LD a7, X, 14 * SIZE + ST a8, Y, 7 * SIZE + LD a8, X, 15 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, Y, 2 * SIZE + ST a4, Y, 3 * SIZE + ST a5, Y, 4 * SIZE + ST a6, Y, 5 * SIZE + ST a7, Y, 6 * SIZE + ST a8, Y, 7 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d X, X, SIZE + addi.d I, I, -1 + addi.d Y, Y, SIZE + ST a1, Y, -1 * SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: + srai.d I, N, 3 + addi.d I, I, -1 + blt I, $r0, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + add.d X, X, INCX + bge $r0, I, .L23 + .align 3 + +.L22: + ST a1, Y, 0 * SIZE + add.d Y, Y, INCY + LD a1, X, 0 * SIZE + add.d X, X, INCX + ST a2, Y, 0 * SIZE + add.d Y, Y, INCY + LD a2, X, 0 * SIZE + add.d X, X, INCX + ST a3, Y, 0 * SIZE + add.d Y, Y, INCY + LD a3, X, 0 * SIZE + add.d X, X, INCX + ST a4, Y, 0 * SIZE + add.d Y, Y, INCY + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST a5, Y, 0 * SIZE + add.d Y, Y, INCY + LD a5, X, 0 * SIZE + add.d X, X, INCX + ST a6, Y, 0 * SIZE + add.d Y, Y, INCY + LD a6, X, 0 * SIZE + add.d X, X, INCX + ST a7, Y, 0 * SIZE + add.d Y, Y, INCY + LD a7, X, 0 * SIZE + add.d X, X, INCX + ST a8, Y, 0 * SIZE + add.d Y, Y, INCY + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L23: + ST a1, Y, 0 * SIZE + add.d Y, Y, INCY + ST a2, Y, 0 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + add.d Y, Y, INCY + ST a4, Y, 0 * SIZE + add.d Y, Y, INCY + ST a5, Y, 0 * SIZE + add.d Y, Y, INCY + ST a6, Y, 0 * SIZE + add.d Y, Y, INCY + ST a7, Y, 0 * SIZE + add.d Y, Y, INCY + ST a8, Y, 0 * SIZE + add.d Y, Y, INCY + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L26 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dnrm2.S 
b/kernel/loongarch64/dnrm2.S new file mode 100644 index 000000000..41db48bdf --- /dev/null +++ b/kernel/loongarch64/dnrm2.S @@ -0,0 +1,314 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r7 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define ALPHA $f4 +#define max $f5 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + move XX, X + NOP + LD a1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + FABS s1, a1 + FABS s2, a1 + bge $r0, N, .L999 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, 
s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L100 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L100: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + addi.d N, N, 1 + lu12i.w TEMP, 0x3f800 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, s1, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, s1 + MOV max, s1 + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 + MOV s4, a1 + srai.d I, N, 3 + bge $r0, I, .L105 + LD a1, XX, 0 * SIZE + add.d XX, XX, INCX + LD a2, XX, 0 * SIZE + add.d XX, XX, INCX + LD a3, XX, 0 * SIZE + add.d XX, XX, INCX + LD a4, XX, 0 * SIZE + add.d XX, XX, INCX + LD a5, XX, 0 * SIZE + add.d XX, XX, INCX + LD a6, XX, 0 * SIZE + add.d XX, XX, INCX + LD a7, XX, 0 * SIZE + add.d XX, XX, INCX + LD a8, XX, 0 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + bge $r0, I, .L104 + .align 3 + +.L103: + MUL t1, ALPHA, a1 + LD a1, XX, 0 * SIZE + MUL t2, ALPHA, a2 + add.d XX, XX, INCX + MUL t3, ALPHA, a3 + LD a2, XX, 0 * SIZE + MUL t4, ALPHA, a4 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a3, XX, 0 * SIZE + MADD s2, t2, t2, s2 + add.d XX, XX, INCX + MADD s3, t3, t3, s3 + LD a4, XX, 0 * SIZE + MADD s4, t4, t4, s4 + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, XX, 0 * SIZE + MUL t2, ALPHA, a6 + add.d XX, XX, INCX + MUL t3, ALPHA, a7 + LD a6, XX, 0 * SIZE + MUL t4, ALPHA, a8 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a7, XX, 0 * SIZE + MADD s2, t2, t2, s2 + add.d XX, XX, INCX + MADD s3, t3, t3, s3 + LD a8, XX, 0 * SIZE + MADD s4, t4, t4, s4 + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L103 + .align 3 + +.L104: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + .align 3 + +.L105: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L106: + LD a1, XX, 0 * SIZE + addi.d I, I, -1 + MUL t1, ALPHA, a1 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + blt $r0, I, .L106 + .align 3 + +.L998: + ADD s1, s1, s2 + ADD s3, s3, s4 + ADD s1, s1, s3 + fsqrt.d s1, s1 + move $r4, $r17 + MUL $f0, max, s1 + jirl $r0, $r1, 0x0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dot.S b/kernel/loongarch64/dot.S new file mode 100644 index 000000000..4fcd569c8 --- /dev/null +++ b/kernel/loongarch64/dot.S @@ -0,0 
+1,391 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f23 +#define a2 $f9 +#define a3 $f10 +#define a4 $f11 +#define b1 $f12 +#define b2 $f13 +#define b3 $f14 +#define b4 $f15 +#define s1 $f22 +#define s2 $f8 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + MTC s1, $r0 + MTC s2, $r0 + slli.d INCX, INCX, BASE_SHIFT + li TEMP, SIZE + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + LD a2, X, 1 * SIZE + LD b2, Y, 1 * SIZE + LD a3, X, 2 * SIZE + LD b3, Y, 2 * SIZE + LD a4, X, 3 * SIZE + addi.d I, I, -1 + LD b4, Y, 3 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 4 * SIZE + LD b1, Y, 4 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 5 * SIZE + LD b2, Y, 5 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif + LD a3, X, 6 * SIZE + LD b3, Y, 6 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 7 * SIZE + LD b4, Y, 7 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 8 * SIZE + LD b1, Y, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 9 * SIZE + LD b2, Y, 9 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + 
MADD s1, b3, a3, s1 +#endif + LD a3, X, 10 * SIZE + LD b3, Y, 10 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 11 * SIZE + LD b4, Y, 11 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE +addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 +.L13: +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 4 * SIZE + LD b1, Y, 4 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 5 * SIZE + LD b2, Y, 5 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif + LD a3, X, 6 * SIZE + LD b3, Y, 6 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 7 * SIZE + LD b4, Y, 7 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + addi.d X, X, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + addi.d Y, Y, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + .align 3 +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + addi.d I, I, -1 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: +#ifdef F_INTERFACE + bgez INCX, .L21 + addi.d TEMP, N, -1 + mult TEMP, INCX + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + addi.d TEMP, N, -1 + mult TEMP, INCY + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + bge $r0, I, .L25 + .align 3 + +.L23: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + 
fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + blt $r0, I, .L23 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + blt $r0, I, .L26 + .align 3 + +.L999: +#ifdef DSDOT + fadd.d $f0, s1, s2 +#else + ADD $f0, s1, s2 +#endif + move $r4, $r17 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/gemm_kernel.S b/kernel/loongarch64/gemm_kernel.S new file mode 100644 index 000000000..8926bf123 --- /dev/null +++ b/kernel/loongarch64/gemm_kernel.S @@ -0,0 +1,1859 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r30 +#define PREFETCHSIZE (4 * 10) +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define BB $r29 + +#if defined(TRMMKERNEL) +#define OFFSET $r11 +#define KK $r20 +#define TEMP $r16 +#endif + +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -160 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + SDARG $r30, $sp, 96 + fst.d $f24, $sp, 56 + fst.d $f25, $sp, 64 + fst.d $f26, $sp, 72 + fst.d $f27, $sp, 80 + fst.d $f28, $sp, 88 +#if defined(TRMMKERNEL) + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#endif +#ifndef __64BIT__ + fst.d $f18, $sp, 120 + fst.d $f19, $sp, 128 + fst.d $f20, $sp, 136 + fst.d $f21, $sp, 144 +#endif + slli.d LDC, LDC, BASE_SHIFT +#if defined(TRMMKERNEL) && !defined(LEFT) + sub.d KK, $r0, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 + add.d C, CO8, LDC + slli.d BB, K, 2 + BASE_SHIFT + add.d BB, B, BB +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 8 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + preld 1, CO1, 3 * SIZE + preld 1, CO2, 3 * SIZE + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 
* SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + preld 1, CO3, 2 * SIZE + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD a4, AO, 2 * SIZE + MADD c61, b2, a1, c61 + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD a4, AO, 6 * SIZE + MADD c61, b2, a3, c61 + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + preld 1, CO4, 3 * SIZE + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE 
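+ /* Drain code for the software-pipelined .L12 loop above: the last
+ in-flight multiply-accumulates are retired here, and the preld
+ hints prefetch the C rows consumed by the write-back at .L18. */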
+ MADD c51, b7, a4, c51 + preld 1, CO5, 3 * SIZE + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + preld 1, CO6, 3 * SIZE + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + preld 1, CO7, 3 * SIZE + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + preld 1, CO8, 3 * SIZE + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d CO3,CO3, 2 * SIZE + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + LD $f23, CO2, 0 * SIZE + addi.d CO4,CO4, 2 * SIZE + LD $f9, CO2, 1 * SIZE + addi.d CO2,CO2, 2 * SIZE + LD $f10, CO3, -2 * SIZE + addi.d CO5,CO5, 2 * SIZE + LD $f11, CO3, -1 * SIZE + addi.d CO6,CO6, 2 * SIZE + LD $f12, CO4, -2 * SIZE + addi.d CO7,CO7, 2 * SIZE + LD $f13, CO4, -1 * SIZE + addi.d I, I, -1 + MADD c11, c11, ALPHA, $f22 + LD $f22, CO5, -2 * SIZE + MADD c12, c12, ALPHA, $f8 + LD $f8, CO5, -1 * SIZE + MADD c21, c21, ALPHA, $f23 + LD $f23, CO6, -2 * SIZE + MADD c22, c22, ALPHA, $f9 + LD $f9, CO6, -1 * SIZE + MADD c31, c31, ALPHA, $f10 + LD $f10, CO7, -2 * SIZE + MADD c32, c32, ALPHA, $f11 + LD $f11, CO7, -1 * SIZE + MADD c41, c41, ALPHA, $f12 + LD $f12, CO8, 0 * SIZE + MADD c42, c42, ALPHA, $f13 + LD $f13, CO8, 1 * SIZE + preld 0, BB, 0 * SIZE + preld 0, BB, 8 * SIZE + ST c11, CO1, -2 * SIZE + MTC c11, $r0 + ST c12, CO1, -1 * SIZE + addi.d CO8,CO8, 2 * SIZE + ST c21, CO2, -2 * SIZE + MOV c21, c11 + ST c22, CO2, -1 * SIZE + addi.d BB, BB, 16 * 
SIZE + MADD c51, c51, ALPHA, $f22 + ST c31, CO3, -2 * SIZE + MADD c52, c52, ALPHA, $f8 + ST c32, CO3, -1 * SIZE + MADD c61, c61, ALPHA, $f23 + ST c41, CO4, -2 * SIZE + MADD c62, c62, ALPHA, $f9 + ST c42, CO4, -1 * SIZE + MADD c71, c71, ALPHA, $f10 + ST c51, CO5, -2 * SIZE + MADD c72, c72, ALPHA, $f11 + ST c52, CO5, -1 * SIZE + MADD c81, c81, ALPHA, $f12 + ST c61, CO6, -2 * SIZE + MADD c82, c82, ALPHA, $f13 + ST c62, CO6, -1 * SIZE + ST c71, CO7, -2 * SIZE + MOV c31, c11 + ST c72, CO7, -1 * SIZE + MOV c41, c11 + ST c81, CO8, -2 * SIZE + MOV c51, c11 + ST c82, CO8, -1 * SIZE +MOV c61, c11 + blt $r0, I, .L11 +#else + addi.d CO4,CO4, 2 * SIZE + addi.d CO5,CO5, 2 * SIZE + addi.d CO6,CO6, 2 * SIZE + addi.d CO7,CO7, 2 * SIZE + preld 0, BB, 0 * SIZE + preld 0, BB, 8 * SIZE + MUL c11, ALPHA, c11 + addi.d CO1,CO1, 2 * SIZE + MUL c12, ALPHA, c12 + MTC a1, $r0 + MUL c21, ALPHA, c21 + addi.d CO2,CO2, 2 * SIZE + MUL c22, ALPHA, c22 + addi.d CO3,CO3, 2 * SIZE + ST c11, CO1, -2 * SIZE + MUL c31, ALPHA, c31 + ST c12, CO1, -1 * SIZE + MUL c32, ALPHA, c32 + ST c21, CO2, -2 * SIZE + MUL c41, ALPHA, c41 + ST c22, CO2, -1 * SIZE + MUL c42, ALPHA, c42 + ST c31, CO3, -2 * SIZE + MUL c51, ALPHA, c51 + ST c32, CO3, -1 * SIZE + MUL c52, ALPHA, c52 + ST c41, CO4, -2 * SIZE + MUL c61, ALPHA, c61 + ST c42, CO4, -1 * SIZE + MUL c62, ALPHA, c62 + ST c51, CO5, -2 * SIZE + MUL c71, ALPHA, c71 + ST c52, CO5, -1 * SIZE + MUL c72, ALPHA, c72 + ST c61, CO6, -2 * SIZE + MUL c81, ALPHA, c81 + ST c62, CO6, -1 * SIZE + MUL c82, ALPHA, c82 + ST c71, CO7, -2 * SIZE + MOV c11, a1 + ST c72, CO7, -1 * SIZE + MOV c21, a1 + addi.d CO8,CO8, 2 * SIZE + addi.d BB, BB, 16 * SIZE + ST c81, CO8, -2 * SIZE + MOV c31, a1 + ST c82, CO8, -1 * SIZE + MOV c41, a1 + addi.d I, I, -1 + MOV c51, a1 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif +MOV c61, a1 + blt $r0, I, .L11 +#endif + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 8 +#endif + srai.d L, TEMP, 2 +MOV c81, c11 + bge $r0, L, .L25 +#else + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD 
c71, b3, a1, c71
+ LD b3, BO, 10 * SIZE
+ MADD c81, b4, a1, c81
+ LD b4, BO, 11 * SIZE
+ LD a1, AO, 4 * SIZE
+ addi.d L, L, -1
+ MADD c11, b6, a2, c11
+ LD b6, BO, 24 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 13 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 14 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 15 * SIZE
+ MADD c51, b7, a2, c51
+ LD b7, BO, 28 * SIZE
+ MADD c61, b2, a2, c61
+ LD b2, BO, 17 * SIZE
+ MADD c71, b3, a2, c71
+ LD b3, BO, 18 * SIZE
+ MADD c81, b4, a2, c81
+ LD b4, BO, 19 * SIZE
+ LD a2, AO, 5 * SIZE
+ addi.d AO, AO, 4 * SIZE
+ MADD c11, b1, a3, c11
+ LD b1, BO, 32 * SIZE
+ MADD c21, b2, a3, c21
+ LD b2, BO, 21 * SIZE
+ MADD c31, b3, a3, c31
+ LD b3, BO, 22 * SIZE
+ MADD c41, b4, a3, c41
+ LD b4, BO, 23 * SIZE
+ MADD c51, b5, a3, c51
+ LD b5, BO, 36 * SIZE
+ MADD c61, b2, a3, c61
+ LD b2, BO, 25 * SIZE
+ MADD c71, b3, a3, c71
+ LD b3, BO, 26 * SIZE
+ MADD c81, b4, a3, c81
+ LD b4, BO, 27 * SIZE
+ LD a3, AO, 2 * SIZE
+ addi.d BO, BO, 32 * SIZE
+ MADD c11, b6, a4, c11
+ LD b6, BO, 8 * SIZE
+ MADD c21, b2, a4, c21
+ LD b2, BO, -3 * SIZE
+ MADD c31, b3, a4, c31
+ LD b3, BO, -2 * SIZE
+ MADD c41, b4, a4, c41
+ LD b4, BO, -1 * SIZE
+ MADD c51, b7, a4, c51
+ LD b7, BO, 12 * SIZE
+ MADD c61, b2, a4, c61
+ LD b2, BO, 1 * SIZE
+ MADD c71, b3, a4, c71
+ LD b3, BO, 2 * SIZE
+ MADD c81, b4, a4, c81
+ LD b4, BO, 3 * SIZE
+ LD a4, AO, 3 * SIZE
+ blt $r0, L, .L22
+ .align 3
+
+.L25:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L28
+ .align 3
+.L26:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 8 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ addi.d L, L, -1
+ MOV a2, a2
+ addi.d AO, AO, 1 * SIZE
+ addi.d BO, BO, 8 * SIZE
+ MADD c51, b5, a1, c51
+ LD b5, BO, 4 * SIZE
+ MADD c61, b2, a1, c61
+ LD b2, BO, 1 * SIZE
+ MADD c71, b3, a1, c71
+ LD b3, BO, 2 * SIZE
+ MADD c81, b4, a1, c81
+ LD a1, AO, 0 * SIZE
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L26
+.L28:
+#ifndef TRMMKERNEL
+ LD $f22, CO1, 0 * SIZE
+ LD $f8, CO2, 0 * SIZE
+ LD $f23, CO3, 0 * SIZE
+ LD $f9, CO4, 0 * SIZE
+ MADD c11, c11, ALPHA, $f22
+ LD $f10, CO5, 0 * SIZE
+ MADD c21, c21, ALPHA, $f8
+ LD $f11, CO6, 0 * SIZE
+ MADD c31, c31, ALPHA, $f23
+ LD $f12, CO7, 0 * SIZE
+ MADD c41, c41, ALPHA, $f9
+ LD $f13, CO8, 0 * SIZE
+ MADD c51, c51, ALPHA, $f10
+ ST c11, CO1, 0 * SIZE
+ MADD c61, c61, ALPHA, $f11
+ ST c21, CO2, 0 * SIZE
+ MADD c71, c71, ALPHA, $f12
+ ST c31, CO3, 0 * SIZE
+ MADD c81, c81, ALPHA, $f13
+ ST c41, CO4, 0 * SIZE
+ ST c51, CO5, 0 * SIZE
+ ST c61, CO6, 0 * SIZE
+ ST c71, CO7, 0 * SIZE
+ ST c81, CO8, 0 * SIZE
+#else
+ MUL c11, ALPHA, c11
+ MUL c21, ALPHA, c21
+ MUL c31, ALPHA, c31
+ MUL c41, ALPHA, c41
+ ST c11, CO1, 0 * SIZE
+ MUL c51, ALPHA, c51
+ ST c21, CO2, 0 * SIZE
+ MUL c61, ALPHA, c61
+ ST c31, CO3, 0 * SIZE
+ MUL c71, ALPHA, c71
+ ST c41, CO4, 0 * SIZE
+ MUL c81, ALPHA, c81
+ ST c51, CO5, 0 * SIZE
+ ST c61, CO6, 0 * SIZE
+ ST c71, CO7, 0 * SIZE
+ ST c81, CO8, 0 * SIZE
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ sub.d TEMP, K, KK
+#ifdef LEFT
+ addi.d TEMP, TEMP, -1
+#else
+ addi.d TEMP, TEMP, -8
+#endif
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 3 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LEFT
+ addi.d KK, KK, 1
+#endif
+#endif
+ .align 3
+
+.L29:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi.d KK, KK, 8
+#endif
+move B, BO
+ blt $r0, J, .L10
+ .align 3
+
+.L30:
+ andi J, N, 4
+move AO, A
+ bge $r0, J, .L50
+ move CO1, C
+ MTC c11, $r0
+ add.d CO2, C, LDC
+ add.d CO3, CO2, LDC
+ add.d CO4, CO3, LDC
+ MOV c21, c11
+ add.d C, CO4, LDC
+ MOV c31, c11
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET
+#endif
+ srai.d I, M, 1
+MOV c41, c11
+ bge $r0, I, .L40
+.L31:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 2 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, B, TEMP
+#endif
+ LD a1, AO, 0 * SIZE
+ LD a3, AO, 4 * SIZE
+ LD b1, BO, 0 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ LD b3, BO, 2 * SIZE
+ MOV c32, c11
+ LD b4, BO, 3 * SIZE
+ MOV c42, c11
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 2
+#else
+ addi.d TEMP, KK, 4
+#endif
+ srai.d L, TEMP, 2
+ bge $r0, L, .L35
+#else
+ LD a1, AO, 0 * SIZE
+ LD a3, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ MOV c32, c11
+ LD b4, B, 3 * SIZE
+ MOV c42, c11
+ LD b5, B, 4 * SIZE
+ srai.d L, K, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+move BO, B
+ bge $r0, L, .L35
+#endif
+ .align 3
+.L32:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ LD a1, AO, 2 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 16 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+ MADD c11, b5, a1, c11
+ LD a2, AO, 3 * SIZE
+ MADD c21, b2, a1, c21
+ MADD c31, b3, a1, c31
+ MADD c41, b4, a1, c41
+ LD a1, AO, 8 * SIZE
+ MADD c12, b5, a2, c12
+ LD b5, BO, 20 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 9 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 10 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 11 * SIZE
+ MADD c11, b6, a3, c11
+ LD a2, AO, 5 * SIZE
+ MADD c21, b2, a3, c21
+ MADD c31, b3, a3, c31
+ MADD c41, b4, a3, c41
+ LD a3, AO, 6 * SIZE
+ MADD c12, b6, a2, c12
+ LD b6, BO, 24 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 13 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 14 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 15 * SIZE
+ MADD c11, b7, a3, c11
+ LD a2, AO, 7 * SIZE
+ MADD c21, b2, a3, c21
+ addi.d AO, AO, 8 * SIZE
+ MADD c31, b3, a3, c31
+ addi.d BO, BO, 16 * SIZE
+ MADD c41, b4, a3, c41
+ LD a3, AO, 4 * SIZE
+ MADD c12, b7, a2, c12
+ LD b7, BO, 12 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 1 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 2 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 3 * SIZE
+ blt $r0, L, .L32
+ .align 3
+
+.L35:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L38
+ .align 3
+.L36:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ addi.d L, L, -1
+ MADD c31, b3, a1, c31
+ addi.d AO, AO, 2 * SIZE
+ MADD c41, b4, a1, c41
+ LD a1, AO, 0 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 4 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 5 * SIZE
+ MADD c32, b3, a2, c32
+ LD b3, BO, 6 * SIZE
+ MADD c42, b4, a2, c42
+ LD b4, BO, 7 * SIZE
+addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L36
+.L38:
+#ifndef TRMMKERNEL
+ LD $f22, CO1, 0 * SIZE
+ addi.d CO3,CO3, 2 * SIZE
+ LD $f8, CO1, 1 * SIZE
+ addi.d CO1,CO1, 2 * SIZE
+ LD $f23, CO2, 0 * SIZE
+ addi.d CO4,CO4, 2 * SIZE
+ LD $f9, CO2, 1 * SIZE
+ addi.d CO2,CO2, 2 * SIZE
+ LD $f10, CO3, -2 * SIZE
+ MADD c11, c11, ALPHA, $f22
+ LD $f11, CO3, -1 * SIZE
+ MADD c12, c12, ALPHA, $f8
+ LD $f12, CO4, -2 * SIZE
+ MADD c21, c21, ALPHA, $f23
+ LD $f13, CO4, -1 * SIZE
+ MADD c22, c22, ALPHA, $f9
+ MADD c31, c31, ALPHA, $f10
+ ST c11, CO1, -2 * SIZE
+ MADD c32, c32, ALPHA, $f11
+ ST c12, CO1, -1 * SIZE
+ MADD c41, c41, ALPHA, $f12
+ ST c21, CO2, -2 * SIZE
+ MADD c42, c42, ALPHA, $f13
+ ST c22, CO2, -1 * SIZE
+ ST c31, CO3, -2 * SIZE
+ MTC c11, $r0
+ ST c32, CO3, -1 * SIZE
+ addi.d I, I, -1
+ ST c41, CO4, -2 * SIZE
+ MOV c21, c11
+ ST c42, CO4, -1 * SIZE
+ MOV c31, c11
+#else
+ MUL c11, ALPHA, c11
+ addi.d CO3,CO3, 2 * SIZE
+ MUL c12, ALPHA, c12
+ addi.d CO1,CO1, 2 * SIZE
+ MUL c21, ALPHA, c21
+ addi.d CO4,CO4, 2 * SIZE
+ MUL c22, ALPHA, c22
+ addi.d CO2,CO2, 2 * SIZE
+ ST c11, CO1, -2 * SIZE
+ MUL c31, ALPHA, c31
+ ST c12, CO1, -1 * SIZE
+ MUL c32, ALPHA, c32
+ ST c21, CO2, -2 * SIZE
+ MUL c41, ALPHA, c41
+ ST c22, CO2, -1 * SIZE
+ MUL c42, ALPHA, c42
+ ST c31, CO3, -2 * SIZE
+ MTC c11, $r0
+ ST c32, CO3, -1 * SIZE
+ addi.d I, I, -1
+ ST c41, CO4, -2 * SIZE
+ MOV c21, c11
+ ST c42, CO4, -1 * SIZE
+ MOV c31, c11
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ sub.d TEMP, K, KK
+#ifdef LEFT
+ addi.d TEMP, TEMP, -2
+#else
+ addi.d TEMP, TEMP, -4
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 2 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LEFT
+ addi.d KK, KK, 2
+#endif
+#endif
+MOV c41, c11
+ blt $r0, I, .L31
+ .align 3
+
+.L40:
+ andi I, M, 1
+MOV c61, c11
+ bge $r0, I, .L49
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d L, KK, 0 + BASE_SHIFT
+ slli.d TEMP, KK, 2 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, B, TEMP
+#endif
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD a2, AO, 1 * SIZE
+ MOV c81, c11
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 1
+#else
+ addi.d TEMP, KK, 4
+#endif
+ srai.d L, TEMP, 2
+ bge $r0, L, .L45
+#else
+ LD a1, AO, 0 * SIZE
+ MOV c71, c11
+ LD a2, AO, 1 * SIZE
+ MOV c81, c11
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, K, 2
+move BO, B
+ bge $r0, L, .L45
+#endif
+ .align 3
+.L42:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 16 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD b4, BO, 7 * SIZE
+ LD a1, AO, 4 * SIZE
+ addi.d L, L, -1
+ MADD c11, b5, a2, c11
+ LD b5, BO, 20 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 9 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 10 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 11 * SIZE
+ LD a2, AO, 2 * SIZE
+ addi.d AO, AO, 4 * SIZE
+ MADD c11, b6, a2, c11
+ LD b6, BO, 24 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 13 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 14 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 15 * SIZE
+ LD a2, AO, -1 * SIZE
+ addi.d BO, BO, 16 * SIZE
+ MADD c11, b7, a2, c11
+ LD b7, BO, 12 * SIZE
+ MADD c21, b2, a2, c21
+ LD b2, BO, 1 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 2 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 3 * SIZE
+ LD a2, AO, 1 * SIZE
+ blt $r0, L, .L42
+ .align 3
+
+.L45:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L48
+ .align 3
+.L46:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 4 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a1, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a1, c41
+ LD a1, AO, 1 * SIZE
+ LD b4, BO, 7 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+ MOV a2, a2
+addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L46
+.L48:
+#ifndef TRMMKERNEL
+ LD $f22, CO1, 0 * SIZE
+ LD $f8, CO2, 0 * SIZE
+ LD $f23, CO3, 0 * SIZE
+ LD $f9, CO4, 0 * SIZE
+ MADD c11, c11, ALPHA, $f22
+ MADD c21, c21, ALPHA, $f8
+ MADD c31, c31, ALPHA, $f23
+ MADD c41, c41, ALPHA, $f9
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c31, CO3, 0 * SIZE
+ ST c41, CO4, 0 * SIZE
+#else
+ MUL c11, ALPHA, c11
+ MUL c21, ALPHA, c21
+ MUL c31, ALPHA, c31
+ MUL c41, ALPHA, c41
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+ ST c31, CO3, 0 * SIZE
+ ST c41, CO4, 0 * SIZE
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ sub.d TEMP, K, KK
+#ifdef LEFT
+ addi.d TEMP, TEMP, -1
+#else
+ addi.d TEMP, TEMP, -4
+#endif
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 2 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LEFT
+ addi.d KK, KK, 1
+#endif
+#endif
+ .align 3
+
+.L49:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi.d KK, KK, 4
+#endif
+ move B, BO
+ .align 3
+
+.L50:
+ andi J, N, 2
+move AO, A
+ bge $r0, J, .L70
+ move CO1, C
+ add.d CO2, C, LDC
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET
+#endif
+ srai.d I, M, 1
+add.d C, CO2, LDC
+ bge $r0, I, .L60
+.L51:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 1 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, B, TEMP
+#endif
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, BO, 0 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ LD b3, BO, 2 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 2
+#else
+ addi.d TEMP, KK, 2
+#endif
+ srai.d L, TEMP, 2
+ bge $r0, L, .L55
+#else
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ LD b5, B, 4 * SIZE
+ srai.d L, K, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+move BO, B
+ bge $r0, L, .L55
+#endif
+ .align 3
+.L52:
+ MADD c11, b1, a1, c11
+ LD a3, AO, 2 * SIZE
+ MADD c21, b2, a1, c21
+ LD b4, BO, 3 * SIZE
+ MADD c12, b1, a2, c12
+ LD a4, AO, 3 * SIZE
+ MADD c22, b2, a2, c22
+ LD b1, BO, 8 * SIZE
+ MADD c11, b3, a3, c11
+ LD a1, AO, 8 * SIZE
+ MADD c21, b4, a3, c21
+ LD b2, BO, 5 * SIZE
+ MADD c12, b3, a4, c12
+ LD a2, AO, 5 * SIZE
+ MADD c22, b4, a4, c22
+ LD b3, BO, 6 * SIZE
+ MADD c11, b5, a5, c11
+ LD a3, AO, 6 * SIZE
+ MADD c21, b2, a5, c21
+ LD b4, BO, 7 * SIZE
+ MADD c12, b5, a2, c12
+ LD a4, AO, 7 * SIZE
+ MADD c22, b2, a2, c22
+ LD b5, BO, 12 * SIZE
+ MADD c11, b3, a3, c11
+ LD a5, AO, 12 * SIZE
+ MADD c21, b4, a3, c21
+ LD b2, BO, 9 * SIZE
+ MADD c12, b3, a4, c12
+ LD a2, AO, 9 * SIZE
+ MADD c22, b4, a4, c22
+ LD b3, BO, 10 * SIZE
+ addi.d AO, AO, 8 * SIZE
+ addi.d L, L, -1
+addi.d BO, BO, 8 * SIZE
+ blt $r0, L, .L52
+ .align 3
+
+.L55:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L58
+ .align 3
+.L56:
+ MADD c11, b1, a1, c11
+ LD a2, AO, 1 * SIZE
+ MADD c21, b2, a1, c21
+ LD a1, AO, 2 * SIZE
+ MADD c12, b1, a2, c12
+ LD b1, BO, 2 * SIZE
+ MADD c22, b2, a2, c22
+ LD b2, BO, 3 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 2 * SIZE
+addi.d BO, BO, 2 * SIZE
+ blt $r0, L, .L56
+.L58:
+#ifndef TRMMKERNEL
+ LD $f22, CO1, 0 * SIZE
+ addi.d I, I, -1
+ LD $f8, CO1, 1 * SIZE
+ addi.d CO1,CO1, 2 * SIZE
+ LD $f23, CO2, 0 * SIZE
+ LD $f9, CO2, 1 * SIZE
+ addi.d CO2,CO2, 2 * SIZE
+ MADD c11, c11, ALPHA, $f22
+ MADD c12, c12, ALPHA, $f8
+ MADD c21, c21, ALPHA, $f23
+ MADD c22, c22, ALPHA, $f9
+ ST c11, CO1, -2 * SIZE
+ ST c12, CO1, -1 * SIZE
+ ST c21, CO2, -2 * SIZE
+ ST c22, CO2, -1 * SIZE
+ blt $r0, I, .L51
+#else
+ addi.d I, I, -1
+ addi.d CO1,CO1, 2 * SIZE
+ addi.d CO2,CO2, 2 * SIZE
+ MUL c11, ALPHA, c11
+ MUL c12, ALPHA, c12
+ MUL c21, ALPHA, c21
+ MUL c22, ALPHA, c22
+ ST c11, CO1, -2 * SIZE
+ ST c12, CO1, -1 * SIZE
+ ST c21, CO2, -2 * SIZE
+ ST c22, CO2, -1 * SIZE
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ sub.d TEMP, K, KK
+#ifdef LEFT
+ addi.d TEMP, TEMP, -2
+#else
+ addi.d TEMP, TEMP, -2
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 1 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LEFT
+ addi.d KK, KK, 2
+#endif
+ blt $r0, I, .L51
+#endif
+ .align 3
+
+.L60:
+ andi I, M, 1
+ bge $r0, I, .L69
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d L, KK, 0 + BASE_SHIFT
+ slli.d TEMP, KK, 1 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, B, TEMP
+#endif
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ MOV c31, c11
+ LD a4, AO, 3 * SIZE
+ MOV c41, c11
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 1
+#else
+ addi.d TEMP, KK, 2
+#endif
+ srai.d L, TEMP, 2
+ bge $r0, L, .L65
+#else
+ srai.d L, K, 2
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ MOV c31, c11
+ LD a4, AO, 3 * SIZE
+ MOV c41, c11
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+move BO, B
+ bge $r0, L, .L65
+#endif
+ .align 3
+.L62:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 4 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 5 * SIZE
+ MADD c31, b3, a2, c31
+ LD b3, BO, 6 * SIZE
+ MADD c41, b4, a2, c41
+ LD b4, BO, 7 * SIZE
+ LD a1, AO, 4 * SIZE
+ LD a2, AO, 5 * SIZE
+ MADD c11, b1, a3, c11
+ LD b1, BO, 8 * SIZE
+ MADD c21, b2, a3, c21
+ LD b2, BO, 9 * SIZE
+ MADD c31, b3, a4, c31
+ LD b3, BO, 10 * SIZE
+ MADD c41, b4, a4, c41
+ LD b4, BO, 11 * SIZE
+ LD a3, AO, 6 * SIZE
+ LD a4, AO, 7 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 4 * SIZE
+addi.d BO, BO, 8 * SIZE
+ blt $r0, L, .L62
+ .align 3
+
+.L65:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L68
+ .align 3
+.L66:
+ MADD c11, b1, a1, c11
+ LD b1, BO, 2 * SIZE
+ MADD c21, b2, a1, c21
+ LD b2, BO, 3 * SIZE
+ LD a1, AO, 1 * SIZE
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+addi.d BO, BO, 2 * SIZE
+ blt $r0, L, .L66
+.L68:
+#ifndef TRMMKERNEL
+ LD $f22, CO1, 0 * SIZE
+ LD $f8, CO2, 0 * SIZE
+ ADD c11, c11, c31
+ ADD c21, c21, c41
+ MADD c11, c11, ALPHA, $f22
+ MADD c21, c21, ALPHA, $f8
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+#else
+ ADD c11, c11, c31
+ ADD c21, c21, c41
+ MUL c11, ALPHA, c11
+ MUL c21, ALPHA, c21
+ ST c11, CO1, 0 * SIZE
+ ST c21, CO2, 0 * SIZE
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ sub.d TEMP, K, KK
+#ifdef LEFT
+ addi.d TEMP, TEMP, -1
+#else
+ addi.d TEMP, TEMP, -2
+#endif
+ slli.d L, TEMP, 0 + BASE_SHIFT
+ slli.d TEMP, TEMP, 1 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LEFT
+ addi.d KK, KK, 1
+#endif
+#endif
+ .align 3
+
+.L69:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi.d KK, KK, 2
+#endif
+ move B, BO
+ .align 3
+
+.L70:
+ andi J, N, 1
+move AO, A
+ bge $r0, J, .L999
+ move CO1, C
+#if defined(TRMMKERNEL) && defined(LEFT)
+ move KK, OFFSET
+#endif
+ srai.d I, M, 1
+add.d C, CO1, LDC
+ bge $r0, I, .L80
+.L71:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 0 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, B, TEMP
+#endif
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, BO, 0 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ LD b3, BO, 2 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 2
+#else
+ addi.d TEMP, KK, 1
+#endif
+ srai.d L, TEMP, 2
+ bge $r0, L, .L75
+#else
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ LD b5, B, 4 * SIZE
+ srai.d L, K, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+move BO, B
+ bge $r0, L, .L75
+#endif
+ .align 3
+.L72:
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 2 * SIZE
+ LD a2, AO, 3 * SIZE
+ LD b1, BO, 1 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 4 * SIZE
+ LD a2, AO, 5 * SIZE
+ LD b1, BO, 2 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 6 * SIZE
+ LD a2, AO, 7 * SIZE
+ LD b1, BO, 3 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ addi.d L, L, -1
+ addi.d AO, AO, 8 * SIZE
+addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L72
+ .align 3
+
+.L75:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L78
+ .align 3
+.L76:
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ addi.d L, L, -1
+ addi.d AO, AO, 2 * SIZE
+addi.d BO, BO, 1 * SIZE
+ blt $r0, L, .L76
+.L78:
+#ifndef TRMMKERNEL
+ LD $f22, CO1, 0 * SIZE
+ addi.d I, I, -1
+ LD $f8, CO1, 1 * SIZE
+ addi.d CO1,CO1, 2 * SIZE
+ ADD c11, c11, c21
+ ADD c12, c12, c22
+ MADD c11, c11, ALPHA, $f22
+ MADD c12, c12, ALPHA, $f8
+ ST c11, CO1, -2 * SIZE
+ ST c12, CO1, -1 * SIZE
+ blt $r0, I, .L71
+#else
+ ADD c11, c11, c21
+ addi.d I, I, -1
+ ADD c12, c12, c22
+ addi.d CO1,CO1, 2 * SIZE
+ MUL c11, ALPHA, c11
+ MUL c12, ALPHA, c12
+ ST c11, CO1, -2 * SIZE
+ ST c12, CO1, -1 * SIZE
+#if ( defined(LEFT) && defined(TRANSA)) || \
+ (!defined(LEFT) && !defined(TRANSA))
+ sub.d TEMP, K, KK
+#ifdef LEFT
+ addi.d TEMP, TEMP, -2
+#else
+ addi.d TEMP, TEMP, -1
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 0 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LEFT
+ addi.d KK, KK, 2
+#endif
+ blt $r0, I, .L71
+#endif
+ .align 3
+
+.L80:
+ andi I, M, 1
+ bge $r0, I, .L89
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ move BO, B
+#else
+ slli.d L, KK, 0 + BASE_SHIFT
+ slli.d TEMP, KK, 0 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, B, TEMP
+#endif
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub.d TEMP, K, KK
+#elif defined(LEFT)
+ addi.d TEMP, KK, 1
+#else
+ addi.d TEMP, KK, 1
+#endif
+ srai.d L, TEMP, 2
+ bge $r0, L, .L85
+#else
+ LD a1, AO, 0 * SIZE
+ MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, K, 2
+move BO, B
+ bge $r0, L, .L85
+#endif
+ .align 3
+.L82:
+ LD a1, AO, 0 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ LD a1, AO, 1 * SIZE
+ LD b1, BO, 1 * SIZE
+ MADD c21, b1, a1, c21
+ LD a1, AO, 2 * SIZE
+ LD b1, BO, 2 * SIZE
+ MADD c11, b1, a1, c11
+ LD a1, AO, 3 * SIZE
+ LD b1, BO, 3 * SIZE
+ MADD c21, b1, a1, c21
+ addi.d L, L, -1
+ addi.d AO, AO, 4 * SIZE
+addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L82
+ .align 3
+
+.L85:
+#ifndef TRMMKERNEL
+ andi L, K, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L88
+ .align 3
+.L86:
+ LD a1, AO, 0 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+addi.d BO, BO, 1 * SIZE
+ blt $r0, L, .L86
+.L88:
+#ifndef TRMMKERNEL
+ LD $f22, CO1, 0 * SIZE
+ ADD c11, c11, c21
+ MADD c11, c11, ALPHA, $f22
+ ST c11, CO1, 0 * SIZE
+#else
+ ADD c11, c11, c21
+ MUL c11, ALPHA, c11
+ ST c11, CO1, 0 * SIZE
+#endif
+ .align 3
+
+.L89:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ addi.d KK, KK, 1
+#endif
+ move B, BO
+ .align 3
+
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+ LDARG $r25, $sp, 16
+ LDARG $r26, $sp, 24
+ LDARG $r27, $sp, 32
+ LDARG $r28, $sp, 40
+ LDARG $r29, $sp, 48
+ LDARG $r30, $sp, 96
+ fld.d $f24, $sp, 56
+ fld.d $f25, $sp, 64
+ fld.d $f26, $sp, 72
+ fld.d $f27, $sp, 80
+ fld.d $f28, $sp, 88
+#if defined(TRMMKERNEL)
+ LDARG $r20, $sp, 104
+ LDARG $r16, $sp, 112
+#endif
+#ifndef __64BIT__
+ fld.d $f18, $sp, 120
+ fld.d $f19, $sp, 128
+ fld.d $f20, $sp, 136
+ fld.d $f21, $sp, 144
+#endif
+ addi.d $sp, $sp, 160
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/gemv_n.S b/kernel/loongarch64/gemv_n.S
new file mode 100644
index 000000000..334a2991f
--- /dev/null
+++ b/kernel/loongarch64/gemv_n.S
@@ -0,0 +1,531 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Unused param dummy1 */
+#define M $r4
+#define N $r5
+#define A $r7
+#define LDA $r8
+#define X $r9
+#define INCX $r10
+#define Y $r11
+#define INCY $r6
+#define BUFFER $r16
+#define YORIG $r18
+#define XX $r12
+#define YY $r13
+#define I $r14
+#define J $r15
+#define AO1 $r23
+#define AO2 $r24
+#define ALPHA $f0
+#define a1 $f22
+#define a2 $f8
+#define a3 $f23
+#define a4 $f9
+#define a5 $f10
+#define a6 $f11
+#define a7 $f12
+#define a8 $f13
+#define x1 $f14
+#define x2 $f15
+#define y1 $f16
+#define y2 $f17
+#define y3 $f3
+#define y4 $f1
+#define y5 $f2
+#define y6 $f4
+#define y7 $f5
+#define y8 $f6
+#define t1 $f7
+#define t2 $f18
+#define t3 $f19
+#define t4 $f20
+
+ PROLOGUE
+
+ LDARG INCY, $sp, 0
+ LDARG BUFFER, $sp, 8
+#ifdef __64BIT__
+ addi.d $sp, $sp, -16
+#else
+ addi.d $sp, $sp, -48
+#endif
+ SDARG $r23, $sp, 0
+ SDARG $r24, $sp, 8
+ slli.d LDA, LDA, BASE_SHIFT
+#ifndef __64BIT__
+ fst.d $f18, $sp, 16
+ fst.d $f19, $sp, 24
+ fst.d $f20, $sp, 32
+#endif
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, M, .L999
+ slli.d INCY, INCY, BASE_SHIFT
+ bge $r0, N, .L999
+ li I, SIZE
+ move YORIG, Y
+ beq INCY, I, .L10
+ srai.d I, M, 2
+ move YORIG, BUFFER
+ move XX, Y
+ move YY, BUFFER
+ bge $r0, I, .L05
+ .align 3
+
+.L02:
+ LD a1, XX, 0 * SIZE
+ add.d XX, XX, INCY
+ LD a2, XX, 0 * SIZE
+ add.d XX, XX, INCY
+ LD a3, XX, 0 * SIZE
+ add.d XX, XX, INCY
+ LD a4, XX, 0 * SIZE
+ add.d XX, XX, INCY
+ ST a1, YY, 0 * SIZE
+ ST a2, YY, 1 * SIZE
+ ST a3, YY, 2 * SIZE
+ ST a4, YY, 3 * SIZE
+ addi.d I, I, -1
+ addi.d YY, YY, 4 * SIZE
+ blt $r0, I, .L02
+ .align 3
+
+.L05:
+ andi I, M, 3
+ bge $r0, I, .L10
+ .align 3
+
+.L06:
+ LD a1, XX, 0 * SIZE
+ add.d XX, XX, INCY
+ ST a1, YY, 0 * SIZE
+ addi.d I, I, -1
+ addi.d YY, YY, 1 * SIZE
+ blt $r0, I, .L06
+ .align 3
+
+.L10:
+ srai.d J, N, 1
+ bge $r0, J, .L20
+ .align 3
+
+.L11:
+ LD x1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD x2, X, 0 * SIZE
+ add.d X, X, INCX
+ move AO1, A
+ add.d AO2, A, LDA
+ add.d A, AO2, LDA
+ move YY, YORIG
+ MUL x1, ALPHA, x1
+ srai.d I, M, 3
+ MUL x2, ALPHA, x2
+ bge $r0, I, .L15
+ LD a1, AO1, 0 * SIZE
+ LD y1, YY, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD y2, YY, 1 * SIZE
+ LD a3, AO1, 2 * SIZE
+ LD y3, YY, 2 * SIZE
+ LD a4, AO1, 3 * SIZE
+ LD y4, YY, 3 * SIZE
+ LD a5, AO2, 0 * SIZE
+ LD y5, YY, 4 * SIZE
+ LD a6, AO2, 1 * SIZE
+ LD y6, YY, 5 * SIZE
+ LD a7, AO2, 2 * SIZE
+ LD y7, YY, 6 * SIZE
+ LD a8, AO2, 3 * SIZE
+ addi.d I, I, -1
+ LD y8, YY, 7 * SIZE
+ bge $r0, I, .L13
+ .align 3
+.L12:
+ MADD t1, a1, x1, y1
+ LD a1, AO1, 4 * SIZE
+ MADD t2, a2, x1, y2
+ LD a2, AO1, 5 * SIZE
+ LD y1, YY, 8 * SIZE
+ LD y2, YY, 9 * SIZE
+ MADD t3, a3, x1, y3
+ LD a3, AO1, 6 * SIZE
+ MADD t4, a4, x1, y4
+ LD a4, AO1, 7 * SIZE
+ LD y3, YY, 10 * SIZE
+ LD y4, YY, 11 * SIZE
+ MADD t1, a5, x2, t1
+ LD a5, AO2, 4 * SIZE
+ MADD t2, a6, x2, t2
+ LD a6, AO2, 5 * SIZE
+ MADD t3, a7, x2, t3
+ LD a7, AO2, 6 * SIZE
+ MADD t4, a8, x2, t4
+ LD a8, AO2, 7 * SIZE
+ ST t1, YY, 0 * SIZE
+ ST t2, YY, 1 * SIZE
+ ST t3, YY, 2 * SIZE
+ ST t4, YY, 3 * SIZE
+ MADD t1, a1, x1, y5
+ LD a1, AO1, 8 * SIZE
+ MADD t2, a2, x1, y6
+ LD a2, AO1, 9 * SIZE
+ LD y5, YY, 12 * SIZE
+ LD y6, YY, 13 * SIZE
+ MADD t3, a3, x1, y7
+ LD a3, AO1, 10 * SIZE
+ MADD t4, a4, x1, y8
+ LD a4, AO1, 11 * SIZE
+ LD y7, YY, 14 * SIZE
+ LD y8, YY, 15 * SIZE
+ MADD t1, a5, x2, t1
+ LD a5, AO2, 8 * SIZE
+ MADD t2, a6, x2, t2
+ LD a6, AO2, 9 * SIZE
+ MADD t3, a7, x2, t3
+ LD a7, AO2, 10 * SIZE
+ MADD t4, a8, x2, t4
+ LD a8, AO2, 11 * SIZE
+ ST t1, YY, 4 * SIZE
+ ST t2, YY, 5 * SIZE
+ ST t3, YY, 6 * SIZE
+ ST t4, YY, 7 * SIZE
+ addi.d I, I, -1
+ addi.d YY, YY, 8 * SIZE
+ addi.d AO1, AO1, 8 * SIZE
+ addi.d AO2, AO2, 8 * SIZE
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ MADD t1, a1, x1, y1
+ LD a1, AO1, 4 * SIZE
+ MADD t2, a2, x1, y2
+ LD a2, AO1, 5 * SIZE
+ MADD t3, a3, x1, y3
+ LD a3, AO1, 6 * SIZE
+ MADD t4, a4, x1, y4
+ LD a4, AO1, 7 * SIZE
+ MADD t1, a5, x2, t1
+ LD a5, AO2, 4 * SIZE
+ MADD t2, a6, x2, t2
+ LD a6, AO2, 5 * SIZE
+ MADD t3, a7, x2, t3
+ LD a7, AO2, 6 * SIZE
+ MADD t4, a8, x2, t4
+ LD a8, AO2, 7 * SIZE
+ ST t1, YY, 0 * SIZE
+ MADD t1, a1, x1, y5
+ ST t2, YY, 1 * SIZE
+ MADD t2, a2, x1, y6
+ ST t3, YY, 2 * SIZE
+ MADD t3, a3, x1, y7
+ ST t4, YY, 3 * SIZE
+ MADD t4, a4, x1, y8
+ MADD t1, a5, x2, t1
+ addi.d AO1, AO1, 8 * SIZE
+ MADD t2, a6, x2, t2
+ addi.d AO2, AO2, 8 * SIZE
+ MADD t3, a7, x2, t3
+ addi.d YY, YY, 8 * SIZE
+ MADD t4, a8, x2, t4
+ ST t1, YY, -4 * SIZE
+ ST t2, YY, -3 * SIZE
+ ST t3, YY, -2 * SIZE
+ ST t4, YY, -1 * SIZE
+ .align 3
+
+.L15:
+ andi I, M, 4
+ bge $r0, I, .L16
+ LD a1, AO1, 0 * SIZE
+ LD y1, YY, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD y2, YY, 1 * SIZE
+ LD a3, AO1, 2 * SIZE
+ LD y3, YY, 2 * SIZE
+ LD a4, AO1, 3 * SIZE
+ LD y4, YY, 3 * SIZE
+ LD a5, AO2, 0 * SIZE
+ MADD y1, a1, x1, y1
+ LD a6, AO2, 1 * SIZE
+ MADD y2, a2, x1, y2
+ LD a7, AO2, 2 * SIZE
+ MADD y3, a3, x1, y3
+ LD a8, AO2, 3 * SIZE
+ MADD y4, a4, x1, y4
+ MADD y1, a5, x2, y1
+ addi.d YY, YY, 4 * SIZE
+ MADD y2, a6, x2, y2
+ addi.d AO1, AO1, 4 * SIZE
+ MADD y3, a7, x2, y3
+ addi.d AO2, AO2, 4 * SIZE
+ MADD y4, a8, x2, y4
+ ST y1, YY, -4 * SIZE
+ ST y2, YY, -3 * SIZE
+ ST y3, YY, -2 * SIZE
+ ST y4, YY, -1 * SIZE
+ .align 3
+
+.L16:
+ andi I, M, 2
+ bge $r0, I, .L17
+ LD a1, AO1, 0 * SIZE
+ LD y1, YY, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD y2, YY, 1 * SIZE
+ LD a5, AO2, 0 * SIZE
+ LD a6, AO2, 1 * SIZE
+ MADD y1, a1, x1, y1
+ MADD y2, a2, x1, y2
+ addi.d YY, YY, 2 * SIZE
+ MADD y1, a5, x2, y1
+ addi.d AO1, AO1, 2 * SIZE
+ MADD y2, a6, x2, y2
+ addi.d AO2, AO2, 2 * SIZE
+ ST y1, YY, -2 * SIZE
+ ST y2, YY, -1 * SIZE
+ .align 3
+
+.L17:
+ andi I, M, 1
+ bge $r0, I, .L19
+ LD y1, YY, 0 * SIZE
+ LD a1, AO1, 0 * SIZE
+ LD a5, AO2, 0 * SIZE
+ MADD y1, a1, x1, y1
+ MADD y1, a5, x2, y1
+ ST y1, YY, 0 * SIZE
+ .align 3
+
+.L19:
+ addi.d J, J, -1
+ blt $r0, J, .L11
+ .align 3
+
+.L20:
+ andi J, N, 1
+ bge $r0, J, .L900
+ .align 3
+
+.L21:
+ LD x1, X, 0 * SIZE
+ add.d X, X, INCX
+ move YY, YORIG
+ move AO1, A
+ srai.d I, M, 3
+ MUL x1, ALPHA, x1
+ bge $r0, I, .L25
+ LD a1, AO1, 0 * SIZE
+ LD y1, YY, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD y2, YY, 1 * SIZE
+ LD a3, AO1, 2 * SIZE
+ LD y3, YY, 2 * SIZE
+ LD a4, AO1, 3 * SIZE
+ LD y4, YY, 3 * SIZE
+ LD y5, YY, 4 * SIZE
+ LD y6, YY, 5 * SIZE
+ LD y7, YY, 6 * SIZE
+ addi.d I, I, -1
+ LD y8, YY, 7 * SIZE
+ bge $r0, I, .L23
+ .align 3
+.L22:
+ MADD t1, a1, x1, y1
+ LD a1, AO1, 4 * SIZE
+ MADD t2, a2, x1, y2
+ LD a2, AO1, 5 * SIZE
+ LD y1, YY, 8 * SIZE
+ LD y2, YY, 9 * SIZE
+ MADD t3, a3, x1, y3
+ LD a3, AO1, 6 * SIZE
+ MADD t4, a4, x1, y4
+ LD a4, AO1, 7 * SIZE
+ LD y3, YY, 10 * SIZE
+ LD y4, YY, 11 * SIZE
+ ST t1, YY, 0 * SIZE
+ ST t2, YY, 1 * SIZE
+ ST t3, YY, 2 * SIZE
+ ST t4, YY, 3 * SIZE
+ MADD t1, a1, x1, y5
+ LD a1, AO1, 8 * SIZE
+ MADD t2, a2, x1, y6
+ LD a2, AO1, 9 * SIZE
+ LD y5, YY, 12 * SIZE
+ LD y6, YY, 13 * SIZE
+ MADD t3, a3, x1, y7
+ LD a3, AO1, 10 * SIZE
+ MADD t4, a4, x1, y8
+ LD a4, AO1, 11 * SIZE
+ LD y7, YY, 14 * SIZE
+ LD y8, YY, 15 * SIZE
+ ST t1, YY, 4 * SIZE
+ ST t2, YY, 5 * SIZE
+ ST t3, YY, 6 * SIZE
+ ST t4, YY, 7 * SIZE
+ addi.d I, I, -1
+ addi.d YY, YY, 8 * SIZE
+ addi.d AO1, AO1, 8 * SIZE
+ blt $r0, I, .L22
+ .align 3
+
+.L23:
+ MADD t1, a1, x1, y1
+ LD a1, AO1, 4 * SIZE
+ MADD t2, a2, x1, y2
+ LD a2, AO1, 5 * SIZE
+ MADD t3, a3, x1, y3
+ LD a3, AO1, 6 * SIZE
+ MADD t4, a4, x1, y4
+ LD a4, AO1, 7 * SIZE
+ ST t1, YY, 0 * SIZE
+ MADD t1, a1, x1, y5
+ ST t2, YY, 1 * SIZE
+ MADD t2, a2, x1, y6
+ ST t3, YY, 2 * SIZE
+ MADD t3, a3, x1, y7
+ ST t4, YY, 3 * SIZE
+ MADD t4, a4, x1, y8
+ ST t1, YY, 4 * SIZE
+ ST t2, YY, 5 * SIZE
+ ST t3, YY, 6 * SIZE
+ ST t4, YY, 7 * SIZE
+ addi.d AO1, AO1, 8 * SIZE
+ addi.d YY, YY, 8 * SIZE
+ .align 3
+
+.L25:
+ andi I, M, 4
+ bge $r0, I, .L26
+ LD a1, AO1, 0 * SIZE
+ LD y1, YY, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD y2, YY, 1 * SIZE
+ LD a3, AO1, 2 * SIZE
+ LD y3, YY, 2 * SIZE
+ LD a4, AO1, 3 * SIZE
+ LD y4, YY, 3 * SIZE
+ MADD y1, a1, x1, y1
+ MADD y2, a2, x1, y2
+ MADD y3, a3, x1, y3
+ addi.d YY, YY, 4 * SIZE
+ MADD y4, a4, x1, y4
+ addi.d AO1, AO1, 4 * SIZE
+ ST y1, YY, -4 * SIZE
+ ST y2, YY, -3 * SIZE
+ ST y3, YY, -2 * SIZE
+ ST y4, YY, -1 * SIZE
+ .align 3
+
+.L26:
+ andi I, M, 2
+ bge $r0, I, .L27
+ LD a1, AO1, 0 * SIZE
+ LD y1, YY, 0 * SIZE
+ LD a2, AO1, 1 * SIZE
+ LD y2, YY, 1 * SIZE
+ MADD y1, a1, x1, y1
+ addi.d YY, YY, 2 * SIZE
+ MADD y2, a2, x1, y2
+ addi.d AO1, AO1, 2 * SIZE
+ ST y1, YY, -2 * SIZE
+ ST y2, YY, -1 * SIZE
+ .align 3
+
+.L27:
+ andi I, M, 1
+ bge $r0, I, .L900
+ LD y1, YY, 0 * SIZE
+ LD a1, AO1, 0 * SIZE
+ MADD y1, a1, x1, y1
+ ST y1, YY, 0 * SIZE
+ .align 3
+
+.L900:
+ li YORIG, SIZE
+ srai.d I, M, 2
+ beq INCY, YORIG, .L999
+ move XX, BUFFER
+ bge $r0, I, .L905
+ .align 3
+
+.L902:
+ LD a1, XX, 0 * SIZE
+ LD a2, XX, 1 * SIZE
+ LD a3, XX, 2 * SIZE
+ LD a4, XX, 3 * SIZE
+ ST a1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a2, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a3, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ ST a4, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ addi.d I, I, -1
+ addi.d XX, XX, 4 * SIZE
+ blt $r0, I, .L902
+ .align 3
+
+.L905:
+ andi I, M, 3
+ bge $r0, I, .L999
+ .align 3
+
+.L906:
+ LD a1, XX, 0 * SIZE
+ addi.d XX, XX, 1 * SIZE
+ ST a1, Y, 0 * SIZE
+ addi.d I, I, -1
+ add.d Y, Y, INCY
+ blt $r0, I, .L906
+ .align 3
+
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+#ifndef __64BIT__
+ fld.d $f18, $sp, 16
+ fld.d $f19, $sp, 24
+ fld.d $f20, $sp, 32
+#endif
+#ifdef __64BIT__
+ addi.d $sp, $sp, 16
+#else
+ addi.d $sp, $sp, 48
+#endif
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/gemv_t.S b/kernel/loongarch64/gemv_t.S
new file mode 100644
index 000000000..19333ed4a
--- /dev/null
+++ b/kernel/loongarch64/gemv_t.S
@@ -0,0 +1,436 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Unused param dummy1 */
+#define M $r4
+#define N $r5
+#define A $r7
+#define LDA $r8
+#define X $r9
+#define INCX $r10
+#define Y $r11
+#define INCY $r6
+#define BUFFER $r16
+#define XORIG $r18
+#define XX $r12
+#define YY $r13
+#define I $r14
+#define J $r15
+#define AO1 $r23
+#define AO2 $r24
+#define ALPHA $f0
+#define a1 $f22
+#define a2 $f8
+#define a3 $f23
+#define a4 $f9
+#define a5 $f10
+#define a6 $f11
+#define a7 $f12
+#define a8 $f13
+#define y1 $f14
+#define y2 $f15
+#define y3 $f16
+#define y4 $f17
+#define x1 $f3
+#define x2 $f1
+#define x3 $f2
+#define x4 $f4
+#define x5 $f5
+#define x6 $f6
+#define x7 $f7
+#define x8 $f18
+
+ PROLOGUE
+
+ LDARG INCY, $sp, 0
+ LDARG BUFFER, $sp, 8
+#ifdef __64BIT__
+ addi.d $sp, $sp, -16
+#else
+ addi.d $sp, $sp, -32
+#endif
+ MTC y1, $r0
+ SDARG $r23, $sp, 0
+ SDARG $r24, $sp, 8
+ slli.d LDA, LDA, BASE_SHIFT
+#ifndef __64BIT__
+ fst.d $f18, $sp, 16
+#endif
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, M, .L999
+ slli.d INCY, INCY, BASE_SHIFT
+ bge $r0, N, .L999
+ li I, SIZE
+ move XORIG, X
+ beq INCX, I, .L10
+ srai.d I, M, 2
+ move XORIG, BUFFER
+ move YY, BUFFER
+ bge $r0, I, .L05
+ .align 3
+
+.L02:
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a1, YY, 0 * SIZE
+ ST a2, YY, 1 * SIZE
+ ST a3, YY, 2 * SIZE
+ ST a4, YY, 3 * SIZE
+ addi.d I, I, -1
+ addi.d YY, YY, 4 * SIZE
+ blt $r0, I, .L02
+ .align 3
+
+.L05:
+ andi I, M, 3
+ bge $r0, I, .L10
+ .align 3
+
+.L06:
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ ST a1, YY, 0 * SIZE
+ addi.d I, I, -1
+ addi.d YY, YY, 1 * SIZE
+ blt $r0, I, .L06
+ .align 3
+
+.L10:
+ srai.d J, N, 1
+ move YY, Y
+ bge $r0, J, .L20
+ .align 3
+
+.L11:
+ move AO1, A
+ MOV y2, y1
+ add.d AO2, A, LDA
+ MOV y3, y1
+ add.d A, AO2, LDA
+ MOV y4, y1
+ srai.d I, M, 3
+ move XX, XORIG
+ bge $r0, I, .L15
+ LD a1, AO1, 0 * SIZE
+ LD x1, XX, 0 * SIZE
+ LD a2, AO2, 0 * SIZE
+ LD x2, XX, 1 * SIZE
+ LD a3, AO1, 1 * SIZE
+ LD x3, XX, 2 * SIZE
+ LD a4, AO2, 1 * SIZE
+ LD x4, XX, 3 * SIZE
+ LD a5, AO1, 2 * SIZE
+ LD x5, XX, 4 * SIZE
+ LD a6, AO2, 2 * SIZE
+ LD x6, XX, 5 * SIZE
+ LD a7, AO1, 3 * SIZE
+ LD x7, XX, 6 * SIZE
+ LD a8, AO2, 3 * SIZE
+ addi.d I, I, -1
+ LD x8, XX, 7 * SIZE
+ bge $r0, I, .L13
+ .align 3
+.L12:
+ MADD y1, a1, x1, y1
+ LD a1, AO1, 4 * SIZE
+ MADD y2, a2, x1, y2
+ LD a2, AO2, 4 * SIZE
+ MADD y3, a3, x2, y3
+ LD a3, AO1, 5 * SIZE
+ MADD y4, a4, x2, y4
+ LD a4, AO2, 5 * SIZE
+ LD x1, XX, 8 * SIZE
+ LD x2, XX, 9 * SIZE
+ MADD y1, a5, x3, y1
+ LD a5, AO1, 6 * SIZE
+ MADD y2, a6, x3, y2
+ LD a6, AO2, 6 * SIZE
+ MADD y3, a7, x4, y3
+ LD a7, AO1, 7 * SIZE
+ MADD y4, a8, x4, y4
+ LD a8, AO2, 7 * SIZE
+ LD x3, XX, 10 * SIZE
+ LD x4, XX, 11 * SIZE
+ MADD y1, a1, x5, y1
+ LD a1, AO1, 8 * SIZE
+ MADD y2, a2, x5, y2
+ LD a2, AO2, 8 * SIZE
+ MADD y3, a3, x6, y3
+ LD a3, AO1, 9 * SIZE
+ MADD y4, a4, x6, y4
+ LD a4, AO2, 9 * SIZE
+ LD x5, XX, 12 * SIZE
+ LD x6, XX, 13 * SIZE
+ MADD y1, a5, x7, y1
+ LD a5, AO1, 10 * SIZE
+ MADD y2, a6, x7, y2
+ LD a6, AO2, 10 * SIZE
+ MADD y3, a7, x8, y3
+ LD a7, AO1, 11 * SIZE
+ MADD y4, a8, x8, y4
+ LD a8, AO2, 11 * SIZE
+ LD x7, XX, 14 * SIZE
+ LD x8, XX, 15 * SIZE
+ addi.d I, I, -1
+ addi.d XX, XX, 8 * SIZE
+ addi.d AO1, AO1, 8 * SIZE
+ addi.d AO2, AO2, 8 * SIZE
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ MADD y1, a1, x1, y1
+ LD a1, AO1, 4 * SIZE
+ MADD y2, a2, x1, y2
+ LD a2, AO2, 4 * SIZE
+ MADD y3, a3, x2, y3
+ LD a3, AO1, 5 * SIZE
+ MADD y4, a4, x2, y4
+ LD a4, AO2, 5 * SIZE
+ MADD y1, a5, x3, y1
+ LD a5, AO1, 6 * SIZE
+ MADD y2, a6, x3, y2
+ LD a6, AO2, 6 * SIZE
+ MADD y3, a7, x4, y3
+ LD a7, AO1, 7 * SIZE
+ MADD y4, a8, x4, y4
+ LD a8, AO2, 7 * SIZE
+ MADD y1, a1, x5, y1
+ MADD y2, a2, x5, y2
+ MADD y3, a3, x6, y3
+ MADD y4, a4, x6, y4
+ MADD y1, a5, x7, y1
+ addi.d XX, XX, 8 * SIZE
+ MADD y2, a6, x7, y2
+ addi.d AO1, AO1, 8 * SIZE
+ MADD y3, a7, x8, y3
+ addi.d AO2, AO2, 8 * SIZE
+ MADD y4, a8, x8, y4
+ .align 3
+
+.L15:
+ andi I, M, 4
+ bge $r0, I, .L17
+ LD a1, AO1, 0 * SIZE
+ LD x1, XX, 0 * SIZE
+ LD a2, AO2, 0 * SIZE
+ LD a3, AO1, 1 * SIZE
+ LD x2, XX, 1 * SIZE
+ LD a4, AO2, 1 * SIZE
+ LD a5, AO1, 2 * SIZE
+ LD x3, XX, 2 * SIZE
+ MADD y1, a1, x1, y1
+ LD a6, AO2, 2 * SIZE
+ MADD y2, a2, x1, y2
+ LD a7, AO1, 3 * SIZE
+ MADD y3, a3, x2, y3
+ LD x4, XX, 3 * SIZE
+ MADD y4, a4, x2, y4
+ LD a8, AO2, 3 * SIZE
+ MADD y1, a5, x3, y1
+ MADD y2, a6, x3, y2
+ addi.d XX, XX, 4 * SIZE
+ MADD y3, a7, x4, y3
+ addi.d AO1, AO1, 4 * SIZE
+ MADD y4, a8, x4, y4
+ addi.d AO2, AO2, 4 * SIZE
+ .align 3
+
+.L17:
+ andi I, M, 3
+ ADD y1, y1, y3
+ ADD y2, y2, y4
+ bge $r0, I, .L19
+ .align 3
+.L18:
+ LD x1, XX, 0 * SIZE
+ LD a1, AO1, 0 * SIZE
+ LD a2, AO2, 0 * SIZE
+ addi.d I, I, -1
+ addi.d XX, XX, 1 * SIZE
+ addi.d AO1, AO1, 1 * SIZE
+ addi.d AO2, AO2, 1 * SIZE
+ MADD y1, a1, x1, y1
+ MADD y2, a2, x1, y2
+ blt $r0, I, .L18
+ .align 3
+
+.L19:
+ LD a1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ LD a2, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ MADD a1, y1, ALPHA, a1
+ addi.d J, J, -1
+ MADD a2, y2, ALPHA, a2
+ MTC y1, $r0
+ ST a1, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ ST a2, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ blt $r0, J, .L11
+ .align 3
+
+.L20:
+ andi J, N, 1
+ MOV y3, y1
+ move AO1, A
+ bge $r0, J, .L999
+ srai.d I, M, 3
+ move XX, XORIG
+ bge $r0, I, .L25
+ LD a1, AO1, 0 * SIZE
+ LD x1, XX, 0 * SIZE
+ LD a3, AO1, 1 * SIZE
+ LD x2, XX, 1 * SIZE
+ LD a5, AO1, 2 * SIZE
+ LD x3, XX, 2 * SIZE
+ LD a7, AO1, 3 * SIZE
+ LD x4, XX, 3 * SIZE
+ LD x5, XX, 4 * SIZE
+ LD x6, XX, 5 * SIZE
+ LD x7, XX, 6 * SIZE
+ addi.d I, I, -1
+ LD x8, XX, 7 * SIZE
+ bge $r0, I, .L23
+ .align 3
+.L22:
+ MADD y1, a1, x1, y1
+ LD a1, AO1, 4 * SIZE
+ MADD y3, a3, x2, y3
+ LD a3, AO1, 5 * SIZE
+ LD x1, XX, 8 * SIZE
+ LD x2, XX, 9 * SIZE
+ MADD y1, a5, x3, y1
+ LD a5, AO1, 6 * SIZE
+ MADD y3, a7, x4, y3
+ LD a7, AO1, 7 * SIZE
+ LD x3, XX, 10 * SIZE
+ LD x4, XX, 11 * SIZE
+ MADD y1, a1, x5, y1
+ LD a1, AO1, 8 * SIZE
+ MADD y3, a3, x6, y3
+ LD a3, AO1, 9 * SIZE
+ LD x5, XX, 12 * SIZE
+ LD x6, XX, 13 * SIZE
+ MADD y1, a5, x7, y1
+ LD a5, AO1, 10 * SIZE
+ MADD y3, a7, x8, y3
+ LD a7, AO1, 11 * SIZE
+ LD x7, XX, 14 * SIZE
+ LD x8, XX, 15 * SIZE
+ addi.d I, I, -1
+ addi.d XX, XX, 8 * SIZE
+ addi.d AO1, AO1, 8 * SIZE
+ blt $r0, I, .L22
+ .align 3
+
+.L23:
+ MADD y1, a1, x1, y1
+ LD a1, AO1, 4 * SIZE
+ MADD y3, a3, x2, y3
+ LD a3, AO1, 5 * SIZE
+ MADD y1, a5, x3, y1
+ LD a5, AO1, 6 * SIZE
+ MADD y3, a7, x4, y3
+ LD a7, AO1, 7 * SIZE
+ MADD y1, a1, x5, y1
+ MADD y3, a3, x6, y3
+ MADD y1, a5, x7, y1
+ MADD y3, a7, x8, y3
+ addi.d XX, XX, 8 * SIZE
+ addi.d AO1, AO1, 8 * SIZE
+ .align 3
+
+.L25:
+ andi I, M, 4
+ bge $r0, I, .L27
+ LD a1, AO1, 0 * SIZE
+ LD x1, XX, 0 * SIZE
+ LD a3, AO1, 1 * SIZE
+ LD x2, XX, 1 * SIZE
+ LD a5, AO1, 2 * SIZE
+ LD x3, XX, 2 * SIZE
+ MADD y1, a1, x1, y1
+ LD a7, AO1, 3 * SIZE
+ MADD y3, a3, x2, y3
+ LD x4, XX, 3 * SIZE
+ MADD y1, a5, x3, y1
+ addi.d XX, XX, 4 * SIZE
+ MADD y3, a7, x4, y3
+ addi.d AO1, AO1, 4 * SIZE
+ .align 3
+
+.L27:
+ andi I, M, 3
+ ADD y1, y1, y3
+ bge $r0, I, .L29
+ .align 3
+.L28:
+ LD x1, XX, 0 * SIZE
+ LD a1, AO1, 0 * SIZE
+ addi.d I, I, -1
+ addi.d XX, XX, 1 * SIZE
+ addi.d AO1, AO1, 1 * SIZE
+ MADD y1, a1, x1, y1
+ blt $r0, I, .L28
+ .align 3
+
+.L29:
+ LD a1, Y, 0 * SIZE
+ add.d Y, Y, INCY
+ MADD a1, y1, ALPHA, a1
+ ST a1, YY, 0 * SIZE
+ add.d YY, YY, INCY
+ .align 3
+
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+#ifndef __64BIT__
+ fld.d $f18, $sp, 16
+#endif
+#ifdef __64BIT__
+ addi.d $sp, $sp, 16
+#else
+ addi.d $sp, $sp, 32
+#endif
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/iamax.S b/kernel/loongarch64/iamax.S
new file mode 100644
index 000000000..0f9e1bc59
--- /dev/null
+++ b/kernel/loongarch64/iamax.S
@@ -0,0 +1,233 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r18
+#define TEMP $r7
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+#define t1 $f0
+#define t2 $f1
+#define t3 $f2
+#define t4 $f3
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+#define x1 $r17
+#define x2 $r8
+#define x3 $r9
+#define x4 $r10
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ li x1, 0
+ bge $r0, N, .L999
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, INCX, .L999
+ LD a1, X, 0 * SIZE
+ addi.d N, N, -1
+ li x1, 1
+ bge $r0, N, .L999
+ FABS s1, a1
+ add.d X, X, INCX
+ FABS s2, a1
+ li x2, 1
+ FABS s3, a1
+ srai.d I, N, 3
+ FABS s4, a1
+ li x3, 1
+ li TEMP, 2
+ li x4, 1
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a6, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a8, X, 0 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ FABS t1, a1
+ LD a1, X, 0 * SIZE
+ FABS t2, a2
+ add.d X, X, INCX
+ FABS t3, a3
+ LD a2, X, 0 * SIZE
+ FABS t4, a4
+ add.d X, X, INCX
+ CMPLT $fcc0, s1, t1
+ LD a3, X, 0 * SIZE
+ CMPLT $fcc1, s2, t2
+ add.d X, X, INCX
+ CMPLT $fcc2, s3, t3
+ LD a4, X, 0 * SIZE
+ CMPLT $fcc3, s4, t4
+ add.d X, X, INCX
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t2, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t3, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t4, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ addi.d I, I, -1
+ FABS t1, a5
+ LD a5, X, 0 * SIZE
+ FABS t2, a6
+ add.d X, X, INCX
+ FABS t3, a7
+ LD a6, X, 0 * SIZE
+ FABS t4, a8
+ add.d X, X, INCX
+ CMPLT $fcc0, s1, t1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, s2, t2
+ add.d X, X, INCX
+ CMPLT $fcc2, s3, t3
+ LD a8, X, 0 * SIZE
+ CMPLT $fcc3, s4, t4
+ add.d X, X, INCX
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t2, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t3, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t4, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ FABS t1, a1
+ FABS t2, a2
+ FABS t3, a3
+ FABS t4, a4
+ CMPLT $fcc0, s1, t1
+ CMPLT $fcc1, s2, t2
+ CMPLT $fcc2, s3, t3
+ CMPLT $fcc3, s4, t4
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t2, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t3, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t4, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ FABS t1, a5
+ addi.d TEMP, TEMP, 4
+ FABS t2, a6
+ FABS t3, a7
+ FABS t4, a8
+ CMPLT $fcc0, s1, t1
+ CMPLT $fcc1, s2, t2
+ CMPLT $fcc2, s3, t3
+ CMPLT $fcc3, s4, t4
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t2, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t3, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t4, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ addi.d x2, x2, 1
+ addi.d x3, x3, 2
+ addi.d x4, x4, 3
+ .align 3
+
+.L15:
+ andi I, N, 7
+ bge $r0, I, .L998
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ FABS t1, a1
+ addi.d I, I, -1
+ CMPLT $fcc0, s1, t1
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ addi.d TEMP, TEMP, 1
+ blt $r0, I, .L16
+ .align 3
+
+.L998:
+ CMPLT $fcc0, s1, s2
+ CMPLT $fcc1, s3, s4
+ CMOVT s1, s1, s2, $fcc0
+ MOVT(x1, x2, $fcc0)
+ CMOVT s3, s3, s4, $fcc1
+ MOVT(x3, x4, $fcc1)
+ CMPLT $fcc0, s1, s3
+ CMOVT s1, s1, s3, $fcc0
+ MOVT(x1, x3, $fcc0)
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/iamin.S b/kernel/loongarch64/iamin.S
new file mode 100644
index 000000000..7751a9d03
--- /dev/null
+++ b/kernel/loongarch64/iamin.S
@@ -0,0 +1,233 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r18
+#define TEMP $r7
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+#define t1 $f0
+#define t2 $f1
+#define t3 $f2
+#define t4 $f3
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+#define x1 $r17
+#define x2 $r8
+#define x3 $r9
+#define x4 $r10
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ li x1, 0
+ bge $r0, N, .L999
+ slli.d INCX, INCX, BASE_SHIFT
+ bge $r0, INCX, .L999
+ LD a1, X, 0 * SIZE
+ addi.d N, N, -1
+ li x1, 1
+ bge $r0, N, .L999
+ FABS s1, a1
+ add.d X, X, INCX
+ FABS s2, a1
+ li x2, 1
+ FABS s3, a1
+ srai.d I, N, 3
+ FABS s4, a1
+ li x3, 1
+ li TEMP, 2
+ li x4, 1
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a2, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a4, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a6, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ add.d X, X, INCX
+ LD a8, X, 0 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ FABS t1, a1
+ LD a1, X, 0 * SIZE
+ FABS t2, a2
+ add.d X, X, INCX
+ FABS t3, a3
+ LD a2, X, 0 * SIZE
+ FABS t4, a4
+ add.d X, X, INCX
+ CMPLT $fcc0, t1, s1
+ LD a3, X, 0 * SIZE
+ CMPLT $fcc1, t2, s2
+ add.d X, X, INCX
+ CMPLT $fcc2, t3, s3
+ LD a4, X, 0 * SIZE
+ CMPLT $fcc3, t4, s4
+ add.d X, X, INCX
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t2, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t3, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t4, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ addi.d I, I, -1
+ FABS t1, a5
+ LD a5, X, 0 * SIZE
+ FABS t2, a6
+ add.d X, X, INCX
+ FABS t3, a7
+ LD a6, X, 0 * SIZE
+ FABS t4, a8
+ add.d X, X, INCX
+ CMPLT $fcc0, t1, s1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, t2, s2
+ add.d X, X, INCX
+ CMPLT $fcc2, t3, s3
+ LD a8, X, 0 * SIZE
+ CMPLT $fcc3, t4, s4
+ add.d X, X, INCX
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t2, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t3, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t4, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ FABS t1, a1
+ FABS t2, a2
+ FABS t3, a3
+ FABS t4, a4
+ CMPLT $fcc0, t1, s1
+ CMPLT $fcc1, t2, s2
+ CMPLT $fcc2, t3, s3
+ CMPLT $fcc3, t4, s4
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t2, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t3, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t4, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ FABS t1, a5
+ addi.d TEMP, TEMP, 4
+ FABS t2, a6
+ FABS t3, a7
+ FABS t4, a8
+ CMPLT $fcc0, t1, s1
+ CMPLT $fcc1, t2, s2
+ CMPLT $fcc2, t3, s3
+ CMPLT $fcc3, t4, s4
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t2, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t3, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t4, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ addi.d x2, x2, 1
+ addi.d x3, x3, 2
+ addi.d x4, x4, 3
+ .align 3
+
+.L15:
+ andi I, N, 7
+ bge $r0, I, .L998
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ add.d X, X, INCX
+ FABS t1, a1
+ addi.d I, I, -1
+ CMPLT $fcc0, t1, s1
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ addi.d TEMP, TEMP, 1
+ blt $r0, I, .L16
+ .align 3
+
+.L998:
+ CMPLT $fcc0, s2, s1
+ CMPLT $fcc1, s4, s3
+ CMOVT s1, s1, s2, $fcc0
+ MOVT(x1, x2, $fcc0)
+ CMOVT s3, s3, s4, $fcc1
+ MOVT(x3, x4, $fcc1)
+ CMPLT $fcc0, s3, s1
+ CMOVT s1, s1, s3, $fcc0
+ MOVT(x1, x3, $fcc0)
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/izamax.S b/kernel/loongarch64/izamax.S
new file mode 100644
index 000000000..6d7cb9e30
--- /dev/null
+++ b/kernel/loongarch64/izamax.S
@@ -0,0 +1,217 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r18
+#define TEMP $r7
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+#define t1 $f0
+#define t2 $f1
+#define t3 $f2
+#define t4 $f3
+#define t5 $f4
+#define t6 $f5
+#define t7 $f6
+#define t8 $f7
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+#define x1 $r17
+#define x2 $r8
+#define x3 $r9
+#define x4 $r10
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ li x1, 0
+ bge $r0, N, .L999
+ slli.d INCX, INCX, ZBASE_SHIFT
+ bge $r0, INCX, .L999
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ FABS t1, a1
+ FABS t2, a2
+ ADD s1, t1, t2
+ ADD s2, t1, t2
+ ADD s3, t1, t2
+ ADD s4, t1, t2
+ addi.d N, N, -1
+ li x1, 1
+ bge $r0, N, .L999
+ add.d X, X, INCX
+ li x2, 1
+ srai.d I, N, 2
+ li x3, 1
+ li TEMP, 2
+ li x4, 1
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ LD a8, X, 1 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ FABS t1, a1
+ LD a1, X, 0 * SIZE
+ FABS t2, a2
+ LD a2, X, 1 * SIZE
+ FABS t3, a3
+ add.d X, X, INCX
+ FABS t4, a4
+ FABS t5, a5
+ LD a3, X, 0 * SIZE
+ FABS t6, a6
+ LD a4, X, 1 * SIZE
+ FABS t7, a7
+ add.d X, X, INCX
+ FABS t8, a8
+ ADD t1, t1, t2
+ LD a5, X, 0 * SIZE
+ ADD t3, t3, t4
+ LD a6, X, 1 * SIZE
+ ADD t5, t5, t6
+ add.d X, X, INCX
+ ADD t7, t7, t8
+ CMPLT $fcc0, s1, t1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, s2, t3
+ LD a8, X, 1 * SIZE
+ CMPLT $fcc2, s3, t5
+ add.d X, X, INCX
+ CMPLT $fcc3, s4, t7
+ addi.d I, I, -1
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t3, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t5, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t7, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ FABS t1, a1
+ FABS t2, a2
+ FABS t3, a3
+ FABS t4, a4
+ FABS t5, a5
+ FABS t6, a6
+ FABS t7, a7
+ FABS t8, a8
+ ADD t1, t1, t2
+ ADD t3, t3, t4
+ ADD t5, t5, t6
+ ADD t7, t7, t8
+ CMPLT $fcc0, s1, t1
+ CMPLT $fcc1, s2, t3
+ CMPLT $fcc2, s3, t5
+ CMPLT $fcc3, s4, t7
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ CMOVT s2, s2, t3, $fcc1
+ MOVT(x2, TEMP, $fcc1)
+ CMOVT s3, s3, t5, $fcc2
+ MOVT(x3, TEMP, $fcc2)
+ CMOVT s4, s4, t7, $fcc3
+ MOVT(x4, TEMP, $fcc3)
+ addi.d TEMP, TEMP, 4
+ addi.d x2, x2, 1
+ addi.d x3, x3, 2
+ addi.d x4, x4, 3
+ .align 3
+
+.L15:
+ andi I, N, 3
+ bge $r0, I, .L998
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ FABS t1, a1
+ FABS t2, a2
+ ADD t1, t1, t2
+ addi.d I, I, -1
+ CMPLT $fcc0, s1, t1
+ CMOVT s1, s1, t1, $fcc0
+ MOVT(x1, TEMP, $fcc0)
+ addi.d TEMP, TEMP, 1
+ blt $r0, I, .L16
+ .align 3
+
+.L998:
+ CMPLT $fcc0, s1, s2
+ CMPLT $fcc1, s3, s4
+ CMOVT s1, s1, s2, $fcc0
+ MOVT(x1, x2, $fcc0)
+ CMOVT s3, s3, s4, $fcc1
+ MOVT(x3, x4, $fcc1)
+ CMPLT $fcc0, s1, s3
+ CMOVT s1, s1, s3, $fcc0
+ MOVT(x1, x3, $fcc0)
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/izamin.S b/kernel/loongarch64/izamin.S
new file mode 100644
index 000000000..998927985
--- /dev/null
+++ b/kernel/loongarch64/izamin.S
@@ -0,0 +1,217 @@
+/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + ADD s2, t1, t2 + ADD s3, t1, t2 + ADD s4, t1, t2 + addi.d N, N, -1 + li x1, 1 + bge $r0, N, .L999 + add.d X, X, INCX + li x2, 1 + srai.d I, N, 2 + li x3, 1 + li TEMP, 2 + li x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t3, s2 + LD a8, X, 1 * SIZE + CMPLT $fcc2, t5, s3 + add.d X, X, INCX + CMPLT $fcc3, t7, s4 + addi.d I, I, -1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, 
$fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t3, s2 + CMPLT $fcc2, t5, s3 + CMPLT $fcc3, t7, s4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + addi.d I, I, -1 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/max.S b/kernel/loongarch64/max.S new file mode 100644 index 000000000..56c3f99a1 --- /dev/null +++ b/kernel/loongarch64/max.S @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
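A note on semantics before the body of max.S: unlike the amax kernels there is no FABS here, so this kernel returns the largest signed element, not the largest absolute value, and it returns 0.0 when N <= 0 or INCX <= 0. The min.S kernel later in this patch is the same code with every CMPLT operand pair swapped. A sketch of the intended result, assuming n >= 1 and incx > 0; names are hypothetical:

    #include <stddef.h>

    /* Hypothetical reference model for max.S: the largest (signed)
       element of x; ties resolve to the value seen first, matching
       the strict CMPLT/CMOVT pairs in the kernel. */
    static double max_ref(size_t n, const double *x, size_t incx)
    {
        double m = x[0];
        for (size_t i = 1; i < n; i++)
            if (x[i * incx] > m) m = x[i * incx];
        return m;
    }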
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD s1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + MOV s2, s1 + bge $r0, N, .L999 + MOV s3, s1 + srai.d I, N, 3 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + CMPLT $fcc0, s1, a1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, a2 + add.d X, X, INCX + CMPLT $fcc2, s3, a3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, a4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + LD a1, X, 0 * SIZE + CMOVT s2, s2, a2, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a3, $fcc2 + LD a2, X, 0 * SIZE + CMOVT s4, s4, a4, $fcc3 + add.d X, X, INCX + CMPLT $fcc0, s1, a5 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, a6 + add.d X, X, INCX + CMPLT $fcc2, s3, a7 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, a8 + add.d X, X, INCX + CMOVT s1, s1, a5, $fcc0 + LD a5, X, 0 * SIZE + CMOVT s2, s2, a6, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a7, $fcc2 + LD a6, X, 0 * SIZE + CMOVT s4, s4, a8, $fcc3 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L12 + .align 3 + +.L13: + CMPLT $fcc0, s1, a1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, a2 + add.d X, X, INCX + CMPLT $fcc2, s3, a3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, a4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + CMOVT s2, s2, a2, $fcc1 + CMOVT s3, s3, a3, $fcc2 + CMOVT s4, s4, a4, $fcc3 + CMPLT $fcc0, s1, a5 + CMPLT $fcc1, s2, a6 + CMPLT $fcc2, s3, a7 + CMPLT $fcc3, s4, a8 + CMOVT s1, s1, a5, $fcc0 + CMOVT s2, s2, a6, $fcc1 + CMOVT s3, s3, a7, $fcc2 + CMOVT s4, s4, a8, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + CMPLT $fcc0, s1, a1 + CMOVT s1, s1, a1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/min.S b/kernel/loongarch64/min.S new file mode 100644 index 000000000..bb2fcfb01 --- /dev/null +++ b/kernel/loongarch64/min.S @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD s1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + MOV s2, s1 + bge $r0, N, .L999 + MOV s3, s1 + srai.d I, N, 3 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + CMPLT $fcc0, a1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, a2, s2 + add.d X, X, INCX + CMPLT $fcc2, a3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, a4, s4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + LD a1, X, 0 * SIZE + CMOVT s2, s2, a2, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a3, $fcc2 + LD a2, X, 0 * SIZE + CMOVT s4, s4, a4, $fcc3 + add.d X, X, INCX + CMPLT $fcc0, a5, s1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, a6, s2 + add.d X, X, INCX + CMPLT $fcc2, a7, s3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, a8, s4 + add.d X, X, INCX + CMOVT s1, s1, a5, $fcc0 + LD a5, X, 0 * SIZE + CMOVT s2, s2, a6, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a7, $fcc2 + LD a6, X, 0 * SIZE + CMOVT s4, s4, a8, $fcc3 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L12 + .align 3 + +.L13: + CMPLT $fcc0, a1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, a2, s2 + add.d X, X, INCX + CMPLT $fcc2, a3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, a4, s4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + CMOVT s2, s2, a2, $fcc1 + CMOVT s3, s3, a3, $fcc2 + CMOVT s4, s4, a4, $fcc3 + CMPLT $fcc0, a5, s1 + CMPLT $fcc1, a6, s2 + CMPLT $fcc2, a7, s3 + CMPLT $fcc3, a8, s4 + CMOVT s1, s1, a5, $fcc0 + CMOVT s2, s2, a6, $fcc1 + CMOVT s3, s3, a7, $fcc2 + CMOVT s4, s4, a8, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + CMPLT $fcc0, a1, s1 + CMOVT s1, s1, a1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s3, s1 + 
CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/scal.S b/kernel/loongarch64/scal.S new file mode 100644 index 000000000..7399e57b3 --- /dev/null +++ b/kernel/loongarch64/scal.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
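Before the body of scal.S, the control flow is worth spelling out: the kernel tests alpha against zero up front (the CMPEQ/bceqz pair) and, when alpha == 0, takes a store-only path that writes zeros without ever loading x; otherwise it multiplies in place, with separate unit-stride and strided loops, the strided loop writing through a trailing pointer XX one group behind the loads. A sketch of the net effect, assuming incx > 0; the name is hypothetical:

    #include <stddef.h>

    /* Hypothetical reference model for scal.S: x := alpha * x, with
       the same alpha == 0 shortcut the kernel takes (zeros are stored
       and the old contents, NaN included, are never read). */
    static void scal_ref(size_t n, double alpha, double *x, size_t incx)
    {
        if (alpha == 0.0)
            for (size_t i = 0; i < n; i++) x[i * incx] = 0.0;
        else
            for (size_t i = 0; i < n; i++) x[i * incx] *= alpha;
    }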
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define ALPHA $f0 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define t1 $f14 +#define t2 $f15 +#define t3 $f16 +#define t4 $f17 + + PROLOGUE + + li TEMP, SIZE + MTC a1, $r0 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, N, .L999 + CMPEQ $fcc0, ALPHA, a1 + bceqz $fcc0, .L50 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + .align 3 + +.L12: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + ST a1, X, 2 * SIZE + ST a1, X, 3 * SIZE + ST a1, X, 4 * SIZE + ST a1, X, 5 * SIZE + ST a1, X, 6 * SIZE + ST a1, X, 7 * SIZE + addi.w I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: + srai.d I, N, 3 + bge $r0, I, .L25 + .align 3 + +.L22: + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L26: + addi.d I, I, -1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L26 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L50: + srai.d I, N, 3 + bne INCX, TEMP, .L60 + addi.d I, I, -1 + blt I, $r0, .L55 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + LD a7, X, 6 * SIZE + LD a8, X, 7 * SIZE + bge $r0, I, .L53 + .align 3 + +.L52: + MUL t1, ALPHA, a1 + LD a1, X, 8 * SIZE + MUL t2, ALPHA, a2 + LD a2, X, 9 * SIZE + MUL t3, ALPHA, a3 + LD a3, X, 10 * SIZE + MUL t4, ALPHA, a4 + LD a4, X, 11 * SIZE + ST t1, X, 0 * SIZE + MUL t1, ALPHA, a5 + LD a5, X, 12 * SIZE + ST t2, X, 1 * SIZE + MUL t2, ALPHA, a6 + LD a6, X, 13 * SIZE + ST t3, X, 2 * SIZE + MUL t3, ALPHA, a7 + LD a7, X, 14 * SIZE + ST t4, X, 3 * SIZE + MUL t4, ALPHA, a8 + LD a8, X, 15 * SIZE + addi.d I, I, -1 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + blt $r0, I, .L52 + .align 3 + +.L53: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + ST t1, X, 0 * SIZE + MUL t1, ALPHA, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA, a6 + ST t3, X, 2 * SIZE + MUL t3, ALPHA, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA, a8 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + .align 3 + +.L55: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L56: + LD a1, X, 0 * SIZE + MUL t1, ALPHA, a1 + addi.d X, X, SIZE + addi.d I, I, -1 + ST t1, X, -1 * SIZE + blt $r0, I, .L56 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L60: + srai.d I, N, 3 + move XX, X + addi.d I, I, -1 + blt I, $r0, .L65 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, 
X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + add.d X, X, INCX + bge $r0, I, .L63 + .align 3 + +.L62: + MUL t1, ALPHA, a1 + LD a1, X, 0 * SIZE + add.d X, X, INCX + MUL t2, ALPHA, a2 + LD a2, X, 0 * SIZE + add.d X, X, INCX + MUL t3, ALPHA, a3 + LD a3, X, 0 * SIZE + add.d X, X, INCX + MUL t4, ALPHA, a4 + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, X, 0 * SIZE + add.d X, X, INCX + MUL t2, ALPHA, a6 + LD a6, X, 0 * SIZE + add.d X, X, INCX + MUL t3, ALPHA, a7 + LD a7, X, 0 * SIZE + add.d X, X, INCX + MUL t4, ALPHA, a8 + LD a8, X, 0 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L62 + .align 3 + +.L63: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + .align 3 + +.L65: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L66: + LD a1, X, 0 * SIZE + MUL t1, ALPHA, a1 + addi.d I, I, -1 + ST t1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L66 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S new file mode 100644 index 000000000..14b62cfe7 --- /dev/null +++ b/kernel/loongarch64/snrm2.S @@ -0,0 +1,249 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
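One property of snrm2.S worth noting before the code: the float inputs are widened with fcvt.d.s and the sum of squares is accumulated with fmadd.d in two double-precision partial sums, so squaring a float value can neither overflow nor flush to zero in the accumulator; only the final square root is rounded back to single precision. In outline, with a hypothetical function name and incx > 0 assumed:

    #include <math.h>
    #include <stddef.h>

    /* Hypothetical reference model for snrm2.S: Euclidean norm of a
       float vector, accumulated in double as the kernel does. */
    static float snrm2_ref(size_t n, const float *x, size_t incx)
    {
        double acc = 0.0;               /* kernel splits this into s1, s2 */
        for (size_t i = 0; i < n; i++) {
            double v = (double)x[i * incx];
            acc = fma(v, v, acc);       /* fmadd.d s1, t1, t1, s1 */
        }
        return (float)sqrt(acc);        /* fsqrt.d then fcvt.s.d */
    }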
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define a5 $f16 +#define a6 $f17 +#define a7 $f0 +#define a8 $f1 +#define s1 $f22 +#define s2 $f8 +#define t1 $f23 +#define t2 $f9 +#define t3 $f10 +#define t4 $f11 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + movgr2fr.d s1, $r0 + li TEMP, SIZE + fmov.d s2, s1 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + LD a6, X, 5 * SIZE + fcvt.d.s t2, a2 + LD a7, X, 6 * SIZE + fcvt.d.s t3, a3 + LD a8, X, 7 * SIZE + fcvt.d.s t4, a4 + bge $r0, I, .L13 + .align 3 + +.L12: + fmadd.d s1, t1, t1, s1 + LD a1, X, 8 * SIZE + fcvt.d.s t1, a5 + NOP + fmadd.d s2, t2, t2, s2 + LD a2, X, 9 * SIZE + fcvt.d.s t2, a6 + NOP + fmadd.d s1, t3, t3, s1 + LD a3, X, 10 * SIZE + fcvt.d.s t3, a7 + NOP + fmadd.d s2, t4, t4, s2 + LD a4, X, 11 * SIZE + fcvt.d.s t4, a8 + NOP + fmadd.d s1, t1, t1, s1 + LD a5, X, 12 * SIZE + fcvt.d.s t1, a1 + NOP + fmadd.d s2, t2, t2, s2 + LD a6, X, 13 * SIZE + fcvt.d.s t2, a2 + addi.d I, I, -1 + fmadd.d s1, t3, t3, s1 + LD a7, X, 14 * SIZE + fcvt.d.s t3, a3 + addi.d X, X, 8 * SIZE + fmadd.d s2, t4, t4, s2 + LD a8, X, 7 * SIZE + fcvt.d.s t4, a4 + blt $r0, I, .L12 + .align 3 + +.L13: + fmadd.d s1, t1, t1, s1 + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + fcvt.d.s t2, a6 + fmadd.d s1, t3, t3, s1 + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + fcvt.d.s t4, a8 + fmadd.d s1, t1, t1, s1 + fmadd.d s2, t2, t2, s2 + fmadd.d s1, t3, t3, s1 + fmadd.d s2, t4, t4, s2 + addi.d X, X, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + fmadd.d s1, t1, t1, s1 + addi.d X, X, SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + fcvt.d.s t2, a2 + fcvt.d.s t3, a3 + fcvt.d.s t4, a4 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 + +.L23: + fmadd.d s1, t1, t1, s1 + LD a1, X, 0 * SIZE + fcvt.d.s t1, a5 + add.d X, X, INCX + fmadd.d s2, t2, t2, s2 + LD a2, X, 0 * SIZE + fcvt.d.s t2, a6 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a3, X, 0 * SIZE + fcvt.d.s t3, a7 + add.d X, X, INCX + fmadd.d s2, t4, t4, s2 + LD a4, X, 0 * SIZE + fcvt.d.s t4, a8 + add.d X, X, INCX + fmadd.d s1, t1, t1, s1 + LD a5, X, 0 * SIZE + fcvt.d.s t1, a1 + add.d 
X, X, INCX + fmadd.d s2, t2, t2, s2 + LD a6, X, 0 * SIZE + fcvt.d.s t2, a2 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a7, X, 0 * SIZE + fcvt.d.s t3, a3 + add.d X, X, INCX + fmadd.d s2, t4, t4, s2 + LD a8, X, 0 * SIZE + fcvt.d.s t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + .align 3 + +.L24: + fmadd.d s1, t1, t1, s1 + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + fcvt.d.s t2, a6 + fmadd.d s1, t3, t3, s1 + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + fcvt.d.s t4, a8 + fmadd.d s1, t1, t1, s1 + fmadd.d s2, t2, t2, s2 + fmadd.d s1, t3, t3, s1 + fmadd.d s2, t4, t4, s2 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + add.d X, X, INCX + fmadd.d s1, t1, t1, s1 + blt $r0, I, .L26 + .align 3 + +.L999: + fadd.d s1, s1, s2 + fsqrt.d s1, s1 + move $r4, $r17 + fcvt.s.d $f0, s1 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/swap.S b/kernel/loongarch64/swap.S new file mode 100644 index 000000000..c9d8f7fc1 --- /dev/null +++ b/kernel/loongarch64/swap.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
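The swap.S kernel that follows is pure data movement: the unit-stride path keeps eight elements of x and y in flight per iteration, and the strided path reads ahead through X/Y while storing a full group behind through XX/YY, so within an iteration the writes lag the reads by eight strided elements. Its net effect is simply (names hypothetical, positive strides assumed):

    #include <stddef.h>

    /* Hypothetical reference model for swap.S: exchange x and y. */
    static void swap_ref(size_t n, double *x, size_t incx,
                         double *y, size_t incy)
    {
        for (size_t i = 0; i < n; i++) {
            double t = x[i * incx];
            x[i * incx] = y[i * incy];
            y[i * incy] = t;
        }
    }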
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define b1 $f14 +#define b2 $f15 +#define b3 $f16 +#define b4 $f17 +#define b5 $f0 +#define b6 $f1 +#define b7 $f2 +#define b8 $f3 + + PROLOGUE + + li TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + bge $r0, N, .L999 + slli.d INCY, INCY, BASE_SHIFT + bne INCX, TEMP, .L20 + srai.d I, N, 3 + bne INCY, TEMP, .L20 + addi.d I, I, -1 + blt I, $r0, .L15 + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + LD a2, X, 1 * SIZE + LD b2, Y, 1 * SIZE + LD a3, X, 2 * SIZE + LD b3, Y, 2 * SIZE + LD a4, X, 3 * SIZE + LD b4, Y, 3 * SIZE + LD a5, X, 4 * SIZE + LD b5, Y, 4 * SIZE + LD a6, X, 5 * SIZE + LD b6, Y, 5 * SIZE + LD a7, X, 6 * SIZE + LD b7, Y, 6 * SIZE + LD a8, X, 7 * SIZE + LD b8, Y, 7 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: + ST a1, Y, 0 * SIZE + LD a1, X, 8 * SIZE + ST b1, X, 0 * SIZE + LD b1, Y, 8 * SIZE + ST a2, Y, 1 * SIZE + LD a2, X, 9 * SIZE + ST b2, X, 1 * SIZE + LD b2, Y, 9 * SIZE + ST a3, Y, 2 * SIZE + LD a3, X, 10 * SIZE + ST b3, X, 2 * SIZE + LD b3, Y, 10 * SIZE + ST a4, Y, 3 * SIZE + LD a4, X, 11 * SIZE + ST b4, X, 3 * SIZE + LD b4, Y, 11 * SIZE + ST a5, Y, 4 * SIZE + LD a5, X, 12 * SIZE + ST b5, X, 4 * SIZE + LD b5, Y, 12 * SIZE + ST a6, Y, 5 * SIZE + LD a6, X, 13 * SIZE + ST b6, X, 5 * SIZE + LD b6, Y, 13 * SIZE + ST a7, Y, 6 * SIZE + LD a7, X, 14 * SIZE + ST b7, X, 6 * SIZE + LD b7, Y, 14 * SIZE + ST a8, Y, 7 * SIZE + LD a8, X, 15 * SIZE + ST b8, X, 7 * SIZE + LD b8, Y, 15 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST a1, Y, 0 * SIZE + ST b1, X, 0 * SIZE + ST a2, Y, 1 * SIZE + ST b2, X, 1 * SIZE + ST a3, Y, 2 * SIZE + ST b3, X, 2 * SIZE + ST a4, Y, 3 * SIZE + ST b4, X, 3 * SIZE + ST a5, Y, 4 * SIZE + ST b5, X, 4 * SIZE + ST a6, Y, 5 * SIZE + ST b6, X, 5 * SIZE + ST a7, Y, 6 * SIZE + ST b7, X, 6 * SIZE + ST a8, Y, 7 * SIZE + ST b8, X, 7 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d I, I, -1 + addi.d Y, Y, SIZE + ST b1, X, -1 * SIZE + ST a1, Y, -1 * SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: + srai.d I, N, 3 + move XX, X + move YY, Y + addi.d I, I, -1 + blt I, $r0, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD b2, Y, 0 * SIZE + add.d Y, Y, INCY + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD b3, Y, 0 * SIZE + add.d Y, Y, INCY + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD b4, Y, 0 * SIZE + add.d Y, Y, INCY + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD b5, Y, 0 * SIZE + add.d Y, Y, INCY + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD b6, Y, 0 * SIZE + add.d Y, Y, INCY + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD b7, Y, 0 * SIZE + add.d Y, Y, INCY + LD a8, X, 0 * SIZE + add.d X, X, INCX + LD b8, Y, 0 * SIZE + add.d Y, Y, INCY + bge $r0, I, .L23 + .align 3 + +.L22: + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + LD a1, X, 0 * SIZE + add.d X, X, INCX + ST b1, XX, 0 * SIZE + add.d XX, XX, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + LD a2, X, 0 
* SIZE + add.d X, X, INCX + ST b2, XX, 0 * SIZE + add.d XX, XX, INCX + LD b2, Y, 0 * SIZE + add.d Y, Y, INCY + ST a3, YY, 0 * SIZE + add.d YY, YY, INCY + LD a3, X, 0 * SIZE + add.d X, X, INCX + ST b3, XX, 0 * SIZE + add.d XX, XX, INCX + LD b3, Y, 0 * SIZE + add.d Y, Y, INCY + ST a4, YY, 0 * SIZE + add.d YY, YY, INCY + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST b4, XX, 0 * SIZE + add.d XX, XX, INCX + LD b4, Y, 0 * SIZE + add.d Y, Y, INCY + ST a5, YY, 0 * SIZE + add.d YY, YY, INCY + LD a5, X, 0 * SIZE + add.d X, X, INCX + ST b5, XX, 0 * SIZE + add.d XX, XX, INCX + LD b5, Y, 0 * SIZE + add.d Y, Y, INCY + ST a6, YY, 0 * SIZE + add.d YY, YY, INCY + LD a6, X, 0 * SIZE + add.d X, X, INCX + ST b6, XX, 0 * SIZE + add.d XX, XX, INCX + LD b6, Y, 0 * SIZE + add.d Y, Y, INCY + ST a7, YY, 0 * SIZE + add.d YY, YY, INCY + LD a7, X, 0 * SIZE + add.d X, X, INCX + ST b7, XX, 0 * SIZE + add.d XX, XX, INCX + LD b7, Y, 0 * SIZE + add.d Y, Y, INCY + ST a8, YY, 0 * SIZE + add.d YY, YY, INCY + LD a8, X, 0 * SIZE + add.d X, X, INCX + ST b8, XX, 0 * SIZE + add.d XX, XX, INCX + LD b8, Y, 0 * SIZE + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L22 + .align 3 + +.L23: + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + ST b1, XX, 0 * SIZE + add.d XX, XX, INCX + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + ST b2, XX, 0 * SIZE + add.d XX, XX, INCX + ST a3, YY, 0 * SIZE + add.d YY, YY, INCY + ST b3, XX, 0 * SIZE + add.d XX, XX, INCX + ST a4, YY, 0 * SIZE + add.d YY, YY, INCY + ST b4, XX, 0 * SIZE + add.d XX, XX, INCX + ST a5, YY, 0 * SIZE + add.d YY, YY, INCY + ST b5, XX, 0 * SIZE + add.d XX, XX, INCX + ST a6, YY, 0 * SIZE + add.d YY, YY, INCY + ST b6, XX, 0 * SIZE + add.d XX, XX, INCX + ST a7, YY, 0 * SIZE + add.d YY, YY, INCY + ST b7, XX, 0 * SIZE + add.d XX, XX, INCX + ST a8, YY, 0 * SIZE + add.d YY, YY, INCY + ST b8, XX, 0 * SIZE + add.d XX, XX, INCX + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L26: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST b1, X, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L26 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_LN.S b/kernel/loongarch64/trsm_kernel_LN.S new file mode 100644 index 000000000..a0bd29f3b --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_LN.S @@ -0,0 +1,2863 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
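A reading aid for the long solve chains in the body below: the LN/LT/RN/RT conditionals select which corner of the packed triangular panel the substitution starts from, and in every case the diagonal step is a MUL rather than a divide, which assumes the packing routines stored the reciprocals of the diagonal entries. The RN chain over an 8-wide block, for instance, is a plain forward substitution; in scalar form, under that packed-reciprocal assumption and with a hypothetical name:

    /* Hypothetical scalar form of the RN solve over one 8x8 packed
       upper-triangular block b (row-major, diagonal pre-inverted),
       updating one row of accumulators c; compare the MUL/NMSUB
       chain under "#ifdef RN" below, where the diagonal entries are
       loaded from BO at offsets 0, 9, 18, ..., 63. */
    static void trsm_rn_solve8(double c[8], const double b[64])
    {
        for (int i = 0; i < 8; i++) {
            c[i] *= b[9 * i];                 /* MUL  ci, b(i,i), ci     */
            for (int j = i + 1; j < 8; j++)
                c[j] -= c[i] * b[8 * i + j];  /* NMSUB cj, ci, b(i,j), cj */
        }
    }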
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r29 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define KK $r30 +#define TEMP $r20 +#define AORIG $r16 +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -144 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + SDARG $r29, $sp, 88 + SDARG $r30, $sp, 96 + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#ifndef __64BIT__ + fst.d $f18, $sp, 112 + fst.d $f19, $sp, 120 + fst.d $f20, $sp, 128 + fst.d $f21, $sp, 136 +#endif + slli.d LDC, LDC, BASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, BASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + neg KK, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L20 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d 
TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, 
c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 
+ NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +MTC c11, $r0 +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif + MOV c21, c11 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, c11 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif + MOV c41, c11 +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L20: + srai.d I, M, 1 + MOV c51, c11 +MOV c61, c11 + bge $r0, I, .L29 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + 
MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 
36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, AO, 4 * SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + 
NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE + NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD b7, BO, 30 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * 
SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE 
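+    /* solved 2x8 micro-tile: written back to the packed buffer above,
+       then to the eight C columns below */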
+#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + MOV c21, c11 + add.d CO4, CO3, LDC + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif + andi I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 
3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +MTC c11, $r0 +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif + MOV c21, c11 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif + MOV c31, c11 +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L40: + srai.d I, M, 1 + MOV c61, c11 +MOV c41, c11 + bge $r0, I, .L49 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge 
$r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, 
c32 + MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L50: + andi J, N, 2 +#ifdef RT + slli.d TEMP, K, 
1 + BASE_SHIFT +#else + move AO, A +#endif + bge $r0, J, .L70 +#ifdef RT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + andi I, M, 1 + bge $r0, I, .L60 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if 
defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L60: + srai.d I, M, 1 + bge $r0, I, .L69 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + 
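+    /* RN: column 1 of the 2x2 tile was scaled by the pre-inverted diagonal
+       just above; eliminate it from column 2, then scale column 2 */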
NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L70: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + andi I, M, 1 + bge $r0, I, .L80 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || 
defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L80: + srai.d I, M, 1 + bge $r0, I, .L89 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, 
BO, 0 * SIZE
+    ST c12, BO, 1 * SIZE
+#else
+    ST c11, AO, 0 * SIZE
+    ST c12, AO, 1 * SIZE
+#endif
+    ST c11, CO1, 0 * SIZE
+    ST c12, CO1, 1 * SIZE
+#ifndef LN
+    addi.d CO1, CO1, 2 * SIZE
+#endif
+#ifdef RT
+    slli.d TEMP, K, 1 + BASE_SHIFT
+    add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+    sub.d TEMP, K, KK
+    slli.d L, TEMP, 1 + BASE_SHIFT
+    slli.d TEMP, TEMP, 0 + BASE_SHIFT
+    add.d AO, AO, L
+    add.d BO, BO, TEMP
+#endif
+#ifdef LT
+    addi.d KK, KK, 2
+#endif
+#ifdef LN
+    addi.d KK, KK, -2
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L71
+    .align 3
+
+.L89:
+#ifdef LN
+    slli.d TEMP, K, BASE_SHIFT
+    add.d B, B, TEMP
+#endif
+#if defined(LT) || defined(RN)
+    move B, BO
+#endif
+#ifdef RN
+    addi.d KK, KK, 1
+#endif
+#ifdef RT
+    addi.d KK, KK, -1
+#endif
+    .align 3
+
+.L999:
+    LDARG $r23, $sp, 0
+    LDARG $r24, $sp, 8
+    LDARG $r25, $sp, 16
+    LDARG $r26, $sp, 24
+    LDARG $r27, $sp, 32
+    LDARG $r28, $sp, 40
+    fld.d $f24, $sp, 48
+    fld.d $f25, $sp, 56
+    fld.d $f26, $sp, 64
+    fld.d $f27, $sp, 72
+    fld.d $f28, $sp, 80
+    LDARG $r29, $sp, 88
+    LDARG $r30, $sp, 96
+    LDARG $r20, $sp, 104
+    LDARG $r16, $sp, 112
+#ifndef __64BIT__
+    fld.d $f18, $sp, 112
+    fld.d $f19, $sp, 120
+    fld.d $f20, $sp, 128
+    fld.d $f21, $sp, 136
+#endif
+    addi.d $sp, $sp, 144
+    move $r4, $r17
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE

diff --git a/kernel/loongarch64/trsm_kernel_LT.S b/kernel/loongarch64/trsm_kernel_LT.S
new file mode 100644
index 000000000..aa6822c32
--- /dev/null
+++ b/kernel/loongarch64/trsm_kernel_LT.S
@@ -0,0 +1,2854 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define M $r4
+#define N $r5
+#define K $r6
+#define A $r7
+#define B $r8
+#define C $r9
+#define LDC $r10
+#define OFFSET $r11
+#define AO $r12
+#define BO $r13
+#define I $r17
+#define J $r18
+#define L $r29
+#define CO1 $r14
+#define CO2 $r15
+#define CO3 $r23
+#define CO4 $r24
+#define CO5 $r25
+#define CO6 $r26
+#define CO7 $r27
+#define CO8 $r28
+#define KK $r30
+#define TEMP $r20
+#define AORIG $r16
+#define a1 $f22
+#define a2 $f8
+#define a3 $f27
+#define a4 $f28
+#define b1 $f23
+#define b2 $f9
+#define b3 $f10
+#define b4 $f11
+#define b5 $f12
+#define b6 $f13
+#define b7 $f14
+#define b8 $f15
+#define a5 b8
+#define c11 $f16
+#define c12 $f17
+#define c21 $f3
+#define c22 $f1
+#define c31 $f2
+#define c32 $f4
+#define c41 $f5
+#define c42 $f6
+#define c51 $f7
+#define c52 $f18
+#define c61 $f19
+#define c62 $f20
+#define c71 $f21
+#define c72 $f24
+#define c81 $f25
+#define c82 $f26
+#define ALPHA $f0
+
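+/*
+  Blocked triangular solve (TRSM) kernel.  The outer loop J walks N in
+  blocks of 8 columns (.L10), with remainder blocks of 4 (.L30), 2 (.L50)
+  and 1 (.L70) columns; the inner loop I walks M two rows at a time with
+  a one-row remainder.  The compile-time LN/LT/RN/RT macros select the
+  side/transpose variant, i.e. which corner of the triangular factor the
+  substitution starts from.  As in the other OpenBLAS trsm kernels, the
+  diagonal entries of the packed triangular factor are assumed to be
+  stored pre-inverted, so each division in the substitution is a MUL by
+  the stored reciprocal followed by NMSUB eliminations.
+*/
+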
+    PROLOGUE
+
+    addi.d $sp, $sp, -144
+    SDARG $r23, $sp, 0
+    SDARG $r24, $sp, 8
+    SDARG $r25, $sp, 16
+    SDARG $r26, $sp, 24
+    SDARG $r27, $sp, 32
+    SDARG $r28, $sp, 40
+    fst.d $f24, $sp, 48
+    fst.d $f25, $sp, 56
+    fst.d $f26, $sp, 64
+    fst.d $f27, $sp, 72
+    fst.d $f28, $sp, 80
+    SDARG $r29, $sp, 88
+    SDARG $r30, $sp, 96
+    SDARG $r20, $sp, 104
+    SDARG $r16, $sp, 112
+#ifndef __64BIT__
+    fst.d $f18, $sp, 112
+    fst.d $f19, $sp, 120
+    fst.d $f20, $sp, 128
+    fst.d $f21, $sp, 136
+#endif
+    slli.d LDC, LDC, BASE_SHIFT
+#ifdef LN
+    mul.w TEMP, M, K
+    slli.d TEMP, TEMP, BASE_SHIFT
+    add.d A, A, TEMP
+    slli.d TEMP, M, BASE_SHIFT
+    add.d C, C, TEMP
+#endif
+#ifdef RN
+    sub.d KK, $r0, OFFSET
+#endif
+#ifdef RT
+    mul.w TEMP, N, K
+    slli.d TEMP, TEMP, BASE_SHIFT
+    add.d B, B, TEMP
+    mul.w TEMP, N, LDC
+    add.d C, C, TEMP
+    sub.d KK, N, OFFSET
+#endif
+    srai.d J, N, 3
+nop
+    bge $r0, J, .L30
+.L10:
+#ifdef RT
+    slli.d TEMP, K, 3 + BASE_SHIFT
+    sub.d B, B, TEMP
+    slli.d TEMP, LDC, 3
+    sub.d C, C, TEMP
+#endif
+    move CO1, C
+MTC c11, $r0
+    add.d CO2, C, LDC
+    add.d CO3, CO2, LDC
+    addi.d J, J, -1
+    add.d CO4, CO3, LDC
+    MOV c21, c11
+    add.d CO5, CO4, LDC
+    MOV c31, c11
+    add.d CO6, CO5, LDC
+    MOV c41, c11
+    add.d CO7, CO6, LDC
+    MOV c51, c11
+    add.d CO8, CO7, LDC
+    srai.d I, M, 1
+#ifdef LN
+    add.d KK, M, OFFSET
+#endif
+#ifdef LT
+    move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+    move AORIG, A
+#else
+    move AO, A
+#endif
+#ifndef RT
+    add.d C, CO8, LDC
+#endif
+MOV c61, c11
+    bge $r0, I, .L20
+.L11:
+#if defined(LT) || defined(RN)
+    LD a1, AO, 0 * SIZE
+    MOV c71, c11
+    LD b1, B, 0 * SIZE
+    MOV c81, c11
+    LD a3, AO, 4 * SIZE
+    MOV c12, c11
+    LD b2, B, 1 * SIZE
+    MOV c22, c11
+    srai.d L, KK, 2
+    MOV c32, c11
+    LD b3, B, 2 * SIZE
+    MOV c42, c11
+    LD b4, B, 3 * SIZE
+    MOV c52, c11
+    LD b5, B, 4 * SIZE
+    MOV c62, c11
+    LD b6, B, 8 * SIZE
+    MOV c72, c11
+    LD b7, B, 12 * SIZE
+    MOV c82, c11
+move BO, B
+    bge $r0, L, .L15
+#else
+#ifdef LN
+    slli.d TEMP, K, 1 + BASE_SHIFT
+    sub.d AORIG, AORIG, TEMP
+#endif
+    slli.d L, KK, 1 + BASE_SHIFT
+    slli.d TEMP, KK, 3 + BASE_SHIFT
+    add.d AO, AORIG, L
+    add.d BO, B, TEMP
+    sub.d TEMP, K, KK
+    LD a1, AO, 0 * SIZE
+    MOV c71, c11
+    LD b1, BO, 0 * SIZE
+    MOV c81, c11
+    LD a3, AO, 4 * SIZE
+    MOV c12, c11
+    LD b2, BO, 1 * SIZE
+    MOV c22, c11
+    srai.d L, TEMP, 2
+    MOV c32, c11
+    LD b3, BO, 2 * SIZE
+    MOV c42, c11
+    LD b4, BO, 3 * SIZE
+    MOV c52, c11
+    LD b5, BO, 4 * SIZE
+    MOV c62, c11
+    LD b6, BO, 8 * SIZE
+    MOV c72, c11
+    LD b7, BO, 12
* SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD 
c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, AO, 4 * 
SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE + NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD b7, BO, 30 * SIZE 
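+    /* RN: scale c41/c42 by the pre-inverted diagonal of the 8x8 factor,
+       then eliminate them from c51..c81 / c52..c82 */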
+ MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL 
c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD 
c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 
* SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * 
SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + srai.d I, M, 1 + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, 
.L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 
+ MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE 
+ blt $r0, L, .L46 +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L50: + andi J, N, 2 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT +#else + move AO, A +#endif + bge $r0, J, .L70 +#ifdef RT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L60 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + 
LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, 
AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 
* SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L70: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L80 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT 
+ LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if 
defined(LT) || defined(RN)
+ move B, BO
+#endif
+#ifdef RN
+ addi.d KK, KK, 1
+#endif
+#ifdef RT
+ addi.d KK, KK, -1
+#endif
+ .align 3
+
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+ LDARG $r25, $sp, 16
+ LDARG $r26, $sp, 24
+ LDARG $r27, $sp, 32
+ LDARG $r28, $sp, 40
+ fld.d $f24, $sp, 48
+ fld.d $f25, $sp, 56
+ fld.d $f26, $sp, 64
+ fld.d $f27, $sp, 72
+ fld.d $f28, $sp, 80
+ LDARG $r29, $sp, 88
+ LDARG $r30, $sp, 96
+ LDARG $r20, $sp, 104
+ LDARG $r16, $sp, 112
+#ifndef __64BIT__
+ fld.d $f18, $sp, 112
+ fld.d $f19, $sp, 120
+ fld.d $f20, $sp, 128
+ fld.d $f21, $sp, 136
+#endif
+ addi.d $sp, $sp, 144
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
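The file added below is the right-side, transposed (RT) variant of the solver above. It seeds KK with N - OFFSET, steps the B panel and the C columns backwards (the sub.d B, B, TEMP / sub.d C, C, LDC adjustments under #ifdef RT), and therefore peels the N&1, N&2, and N&4 remainder columns before the eight-column blocks. For orientation, a minimal scalar sketch of the operation each packed panel goes through follows; the function name, row-major layout, and explicit divide are illustrative assumptions only, since the real kernel consumes panels packed with pre-inverted diagonal entries and multiplies (the MUL sequences in the solve blocks) rather than dividing:

    /* Minimal reference sketch (not part of the patch): solve X * A^T = B
       for X, where A is n x n upper triangular and B is m x n, row-major;
       B is overwritten with X.  The transpose makes column j depend on
       columns k > j, so the sweep runs from the last column backwards,
       matching the RT kernel's tail-first traversal. */
    static void trsm_rt_ref(int m, int n, const double *A, double *B)
    {
        for (int j = n - 1; j >= 0; j--) {          /* last column first */
            for (int i = 0; i < m; i++) {
                double x = B[i * n + j];
                for (int k = j + 1; k < n; k++)
                    x -= B[i * n + k] * A[j * n + k];   /* A^T(k,j) == A(j,k) */
                B[i * n + j] = x / A[j * n + j];  /* kernel: MUL by packed 1/A(j,j) */
            }
        }
    }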
diff --git a/kernel/loongarch64/trsm_kernel_RT.S b/kernel/loongarch64/trsm_kernel_RT.S
new file mode 100644
index 000000000..c86d9c1e5
--- /dev/null
+++ b/kernel/loongarch64/trsm_kernel_RT.S
@@ -0,0 +1,2850 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define M $r4
+#define N $r5
+#define K $r6
+#define A $r7
+#define B $r8
+#define C $r9
+#define LDC $r10
+#define OFFSET $r11
+#define AO $r12
+#define BO $r13
+#define I $r17
+#define J $r18
+#define L $r29
+#define CO1 $r14
+#define CO2 $r15
+#define CO3 $r23
+#define CO4 $r24
+#define CO5 $r25
+#define CO6 $r26
+#define CO7 $r27
+#define CO8 $r28
+#define KK $r30
+#define TEMP $r20
+#define AORIG $r16
+#define a1 $f22
+#define a2 $f8
+#define a3 $f27
+#define a4 $f28
+#define b1 $f23
+#define b2 $f9
+#define b3 $f10
+#define b4 $f11
+#define b5 $f12
+#define b6 $f13
+#define b7 $f14
+#define b8 $f15
+#define a5 b8
+#define c11 $f16
+#define c12 $f17
+#define c21 $f3
+#define c22 $f1
+#define c31 $f2
+#define c32 $f4
+#define c41 $f5
+#define c42 $f6
+#define c51 $f7
+#define c52 $f18
+#define c61 $f19
+#define c62 $f20
+#define c71 $f21
+#define c72 $f24
+#define c81 $f25
+#define c82 $f26
+#define ALPHA $f0
+
+ PROLOGUE
+
+ addi.d $sp, $sp, -144
+ SDARG $r23, $sp, 0
+ SDARG $r24, $sp, 8
+ SDARG $r25, $sp, 16
+ SDARG $r26, $sp, 24
+ SDARG $r27, $sp, 32
+ SDARG $r28, $sp, 40
+ fst.d $f24, $sp, 48
+ fst.d $f25, $sp, 56
+ fst.d $f26, $sp, 64
+ fst.d $f27, $sp, 72
+ fst.d $f28, $sp, 80
+ SDARG $r29, $sp, 88
+ SDARG $r30, $sp, 96
+ SDARG $r20, $sp, 104
+ SDARG $r16, $sp, 112
+#ifndef __64BIT__
+ fst.d $f18, $sp, 112
+ fst.d $f19, $sp, 120
+ fst.d $f20, $sp, 128
+ fst.d $f21, $sp, 136
+#endif
+ slli.d LDC, LDC, BASE_SHIFT
+#ifdef LN
+ mul.w TEMP, M, K
+ slli.d TEMP, TEMP, BASE_SHIFT
+ add.d A, A, TEMP
+ slli.d TEMP, M, BASE_SHIFT
+ add.d C, C, TEMP
+#endif
+#ifdef RN
+ sub.d KK, $r0, OFFSET
+#endif
+#ifdef RT
+ mul.w TEMP, N, K
+ slli.d TEMP, TEMP, BASE_SHIFT
+ add.d B, B, TEMP
+ mul.w TEMP, N, LDC
+ add.d C, C, TEMP
+ sub.d KK, N, OFFSET
+#endif
+ andi J, N, 1
+ bge $r0, J, .L30
+#ifdef RT
+ slli.d TEMP, K, BASE_SHIFT
+ sub.d B, B, TEMP
+ sub.d C, C, LDC
+#endif
+ move AO, A
+ move CO1, C
+#ifdef LN
+ add.d KK, M, OFFSET
+#endif
+#ifdef LT
+ move KK, OFFSET
+#endif
+#if defined(LN) || defined(RT)
+ move AORIG, A
+#else
+ move AO, A
+#endif
+#ifndef RT
+ add.d C, CO1, LDC
+#endif
+ srai.d I, M, 1
+ bge $r0, I, .L80
+.L71:
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, B, 0 * SIZE
+ MOV c12, c11
+ LD b2, B, 1 * SIZE
+ MOV c22, c11
+ LD b3, B, 2 * SIZE
+ LD b5, B, 4 * SIZE
+ srai.d L, KK, 2
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+move BO, B
+ bge $r0, L, .L75
+#else
+#ifdef LN
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d L, KK, 1 + BASE_SHIFT
+ slli.d TEMP, KK, 0 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ MOV c21, c11
+ LD a5, AO, 4 * SIZE
+ LD b1, BO, 0 * SIZE
+ MOV c12, c11
+ LD b2, BO, 1 * SIZE
+ MOV c22, c11
+ LD b3, BO, 2 * SIZE
+ LD b5, BO, 4 * SIZE
+ srai.d L, TEMP, 2
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ bge $r0, L, .L75
+#endif
+ .align 3
+.L72:
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 2 * SIZE
+ LD a2, AO, 3 * SIZE
+ LD b1, BO, 1 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 4 * SIZE
+ LD a2, AO, 5 * SIZE
+ LD b1, BO, 2 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ LD a1, AO, 6 * SIZE
+ LD a2, AO, 7 * SIZE
+ LD b1, BO, 3 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ addi.d L, L, -1
+ addi.d AO, AO, 8 * SIZE
+addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L72
+ .align 3
+
+.L75:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L78
+ .align 3
+.L76:
+ LD a1, AO, 0 * SIZE
+ LD a2, AO, 1 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ MADD c12, b1, a2, c12
+ addi.d L, L, -1
+ addi.d AO, AO, 2 * SIZE
+addi.d BO, BO, 1 * SIZE
+ blt $r0, L, .L76
+.L78:
+ ADD c11, c11, c21
+ ADD c12, c12, c22
+#if defined(LN) || defined(RT)
+#ifdef LN
+ addi.d TEMP, KK, -2
+#else
+ addi.d TEMP, KK, -1
+#endif
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 0 + BASE_SHIFT
+ add.d AO, AORIG, L
+ add.d BO, B, TEMP
+#endif
+#if defined(LN) || defined(LT)
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+#else
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ SUB c11, b1, c11
+ SUB c12, b2, c12
+#endif
+#ifdef LN
+ LD b1, AO, 3 * SIZE
+ LD b2, AO, 2 * SIZE
+ LD b3, AO, 0 * SIZE
+ MUL c12, b1, c12
+ NMSUB c11, c12, b2, c11
+ MUL c11, b3, c11
+#endif
+#ifdef LT
+ LD b1, AO, 0 * SIZE
+ LD b2, AO, 1 * SIZE
+ LD b3, AO, 3 * SIZE
+ MUL c11, b1, c11
+ NMSUB c12, c11, b2, c12
+ MUL c12, b3, c12
+#endif
+#if defined(RN) || defined(RT)
+ LD b1, BO, 0 * SIZE
+ MUL c11, b1, c11
+ MUL c12, b1, c12
+#endif
+#ifdef LN
+ addi.d CO1, CO1, -2 * SIZE
+#endif
+#if defined(LN) || defined(LT)
+ ST c11, BO, 0 * SIZE
+ ST c12, BO, 1 * SIZE
+#else
+ ST c11, AO, 0 * SIZE
+ ST c12, AO, 1 * SIZE
+#endif
+ ST c11, CO1, 0 * SIZE
+ ST c12, CO1, 1 * SIZE
+#ifndef LN
+ addi.d CO1, CO1, 2 * SIZE
+#endif
+#ifdef RT
+ slli.d TEMP, K, 1 + BASE_SHIFT
+ add.d AORIG, AORIG, TEMP
+#endif
+#if defined(LT) || defined(RN)
+ sub.d TEMP, K, KK
+ slli.d L, TEMP, 1 + BASE_SHIFT
+ slli.d TEMP, TEMP, 0 + BASE_SHIFT
+ add.d AO, AO, L
+ add.d BO, BO, TEMP
+#endif
+#ifdef LT
+ addi.d KK, KK, 2
+#endif
+#ifdef LN
+ addi.d KK, KK, -2
+#endif
+ addi.d I, I, -1
+ blt $r0, I, .L71
+ .align 3
+
+.L80:
+ andi I, M, 1
+ bge $r0, I, .L89
+#if defined(LT) || defined(RN)
+ LD a1, AO, 0 * SIZE
+MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, B, 0 * SIZE
+ LD b2, B, 1 * SIZE
+ MOV c21, c11
+ LD b3, B, 2 * SIZE
+ LD b4, B, 3 * SIZE
+ LD b5, B, 4 * SIZE
+ LD b6, B, 8 * SIZE
+ LD b7, B, 12 * SIZE
+ srai.d L, KK, 2
+move BO, B
+ bge $r0, L, .L85
+#else
+#ifdef LN
+ slli.d TEMP, K, BASE_SHIFT
+ sub.d AORIG, AORIG, TEMP
+#endif
+ slli.d TEMP, KK, BASE_SHIFT
+ add.d AO, AORIG, TEMP
+ add.d BO, B, TEMP
+ sub.d TEMP, K, KK
+ LD a1, AO, 0 * SIZE
+MTC c11, $r0
+ LD a2, AO, 1 * SIZE
+ LD a3, AO, 2 * SIZE
+ LD a4, AO, 3 * SIZE
+ LD b1, BO, 0 * SIZE
+ LD b2, BO, 1 * SIZE
+ LD b3, BO, 2 * SIZE
+ LD b4, BO, 3 * SIZE
+ MOV c21, c11
+ LD b5, BO, 4 * SIZE
+ LD b6, BO, 8 * SIZE
+ LD b7, BO, 12 * SIZE
+ srai.d L, TEMP, 2
+ bge $r0, L, .L85
+#endif
+ .align 3
+.L82:
+ LD a1, AO, 0 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ LD a1, AO, 1 * SIZE
+ LD b1, BO, 1 * SIZE
+ MADD c21, b1, a1, c21
+ LD a1, AO, 2 * SIZE
+ LD b1, BO, 2 * SIZE
+ MADD c11, b1, a1, c11
+ LD a1, AO, 3 * SIZE
+ LD b1, BO, 3 * SIZE
+ MADD c21, b1, a1, c21
+ addi.d L, L, -1
+ addi.d AO, AO, 4 * SIZE
+addi.d BO, BO, 4 * SIZE
+ blt $r0, L, .L82
+ .align 3
+
+.L85:
+#if defined(LT) || defined(RN)
+ andi L, KK, 3
+#else
+ andi L, TEMP, 3
+#endif
+ bge $r0, L, .L88
+ .align 3
+.L86:
+ LD a1, AO, 0 * SIZE
+ LD b1, BO, 0 * SIZE
+ MADD c11, b1, a1, c11
+ addi.d L, L, -1
+ addi.d AO, AO, 1 * SIZE
+addi.d BO, BO, 1 * SIZE
+
blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L30: + andi J, N, 2 + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L60 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + 
LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, 
.L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L50: + andi J, N, 4 +move AO, A + bge $r0, J, .L70 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + srai.d I, M, 1 + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, 
L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + 
MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 
8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL 
c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L70: + srai.d J, N, 3 +nop + bge $r0, J, .L999 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, 
c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD 
c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, AO, 4 * SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL 
c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE + NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD b7, BO, 30 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, 
c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * 
SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD 
c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD 
b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN 
+ addi.d KK, KK, 8
+#endif
+#ifdef RT
+ addi.d KK, KK, -8
+#endif
+ blt $r0, J, .L10
+ .align 3
+
+.L999:
+ LDARG $r23, $sp, 0
+ LDARG $r24, $sp, 8
+ LDARG $r25, $sp, 16
+ LDARG $r26, $sp, 24
+ LDARG $r27, $sp, 32
+ LDARG $r28, $sp, 40
+ fld.d $f24, $sp, 48
+ fld.d $f25, $sp, 56
+ fld.d $f26, $sp, 64
+ fld.d $f27, $sp, 72
+ fld.d $f28, $sp, 80
+ LDARG $r29, $sp, 88
+ LDARG $r30, $sp, 96
+ LDARG $r20, $sp, 104
+ LDARG $r16, $sp, 112
+#ifndef __64BIT__
+ fld.d $f18, $sp, 112
+ fld.d $f19, $sp, 120
+ fld.d $f20, $sp, 128
+ fld.d $f21, $sp, 136
+#endif
+ addi.d $sp, $sp, 144
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
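Editor's note (not part of the patch): the zamax.S kernel added below returns
max_i (|Re(x_i)| + |Im(x_i)|), the usual BLAS "cabs1" reduction, over a strided
complex vector; it carries four partial maxima (s1..s4) through a 4-way
unrolled loop and folds them at .L998. A minimal C sketch of those semantics,
assuming double precision and a positive stride; the name zamax_ref is
illustrative only:

    #include <math.h>

    /* Scalar reference for the reduction the kernel implements. */
    static double zamax_ref(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0) return 0.0;      /* kernel's early exits  */
        double s = fabs(x[0]) + fabs(x[1]);       /* cabs1 of first element */
        for (long i = 1; i < n; i++) {
            const double *p = x + 2 * i * incx;   /* 2 doubles per element  */
            double t = fabs(p[0]) + fabs(p[1]);
            if (t > s) s = t;                     /* CMPLT + CMOVT pair     */
        }
        return s;
    }

zamin.S, which follows it, is the same loop with every comparison reversed.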
diff --git a/kernel/loongarch64/zamax.S b/kernel/loongarch64/zamax.S
new file mode 100644
index 000000000..f998bdc23
--- /dev/null
+++ b/kernel/loongarch64/zamax.S
@@ -0,0 +1,190 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+#define t1 $f0
+#define t2 $f1
+#define t3 $f2
+#define t4 $f3
+#define t5 $f4
+#define t6 $f5
+#define t7 $f6
+#define t8 $f7
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ MTC s1, $r0
+ bge $r0, N, .L999
+ slli.d INCX, INCX, ZBASE_SHIFT
+ bge $r0, INCX, .L999
+ LD a1, X, 0 * SIZE
+ addi.d N, N, -1
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ FABS t1, a1
+ FABS t2, a2
+ ADD s1, t1, t2
+ bge $r0, N, .L999
+ ADD s2, t1, t2
+ srai.d I, N, 2
+ ADD s3, t1, t2
+ ADD s4, t1, t2
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ LD a8, X, 1 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ FABS t1, a1
+ LD a1, X, 0 * SIZE
+ FABS t2, a2
+ LD a2, X, 1 * SIZE
+ FABS t3, a3
+ add.d X, X, INCX
+ FABS t4, a4
+ FABS t5, a5
+ LD a3, X, 0 * SIZE
+ FABS t6, a6
+ LD a4, X, 1 * SIZE
+ FABS t7, a7
+ add.d X, X, INCX
+ FABS t8, a8
+ ADD t1, t1, t2
+ LD a5, X, 0 * SIZE
+ ADD t3, t3, t4
+ LD a6, X, 1 * SIZE
+ ADD t5, t5, t6
+ add.d X, X, INCX
+ ADD t7, t7, t8
+ CMPLT $fcc0, s1, t1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, s2, t3
+ LD a8, X, 1 * SIZE
+ CMPLT $fcc2, s3, t5
+ add.d X, X, INCX
+ CMPLT $fcc3, s4, t7
+ CMOVT s1, s1, t1, $fcc0
+ addi.d I, I, -1
+ CMOVT s2, s2, t3, $fcc1
+ CMOVT s3, s3, t5, $fcc2
+ CMOVT s4, s4, t7, $fcc3
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ FABS t1, a1
+ FABS t2, a2
+ FABS t3, a3
+ FABS t4, a4
+ FABS t5, a5
+ FABS t6, a6
+ FABS t7, a7
+ FABS t8, a8
+ ADD t1, t1, t2
+ ADD t3, t3, t4
+ ADD t5, t5, t6
+ ADD t7, t7, t8
+ CMPLT $fcc0, s1, t1
+ CMPLT $fcc1, s2, t3
+ CMPLT $fcc2, s3, t5
+ CMPLT $fcc3, s4, t7
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t3, $fcc1
+ CMOVT s3, s3, t5, $fcc2
+ CMOVT s4, s4, t7, $fcc3
+ .align 3
+
+.L15:
+ andi I, N, 3
+ bge $r0, I, .L998
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ addi.d I, I, -1
+ FABS t1, a1
+ FABS t2, a2
+ ADD t1, t1, t2
+ CMPLT $fcc0, s1, t1
+ CMOVT s1, s1, t1, $fcc0
+ add.d X, X, INCX
+ blt $r0, I, .L16
+ .align 3
+
+.L998:
+ CMPLT $fcc0, s1, s2
+ CMPLT $fcc1, s3, s4
+ CMOVT s1, s1, s2, $fcc0
+ CMOVT s3, s3, s4, $fcc1
+ CMPLT $fcc0, s1, s3
+ CMOVT s1, s1, s3, $fcc0
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/zamin.S b/kernel/loongarch64/zamin.S
new file mode 100644
index 000000000..bde9aebf8
--- /dev/null
+++ b/kernel/loongarch64/zamin.S
@@ -0,0 +1,198 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define a1 $f10
+#define a2 $f11
+#define a3 $f12
+#define a4 $f13
+#define a5 $f14
+#define a6 $f15
+#define a7 $f16
+#define a8 $f17
+#define t1 $f0
+#define t2 $f1
+#define t3 $f2
+#define t4 $f3
+#define t5 $f4
+#define t6 $f5
+#define t7 $f6
+#define t8 $f7
+#define s1 $f22
+#define s2 $f8
+#define s3 $f23
+#define s4 $f9
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ MTC s1, $r0
+ bge $r0, N, .L999
+ slli.d INCX, INCX, ZBASE_SHIFT
+ bge $r0, INCX, .L999
+ LD a1, X, 0 * SIZE
+ addi.d N, N, -1
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ FABS t1, a1
+ FABS t2, a2
+ ADD s1, t1, t2
+ bge $r0, N, .L999
+ NOP
+ ADD s2, t1, t2
+ srai.d I, N, 2
+ ADD s3, t1, t2
+ ADD s4, t1, t2
+ bge $r0, I, .L15
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ LD a8, X, 1 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ FABS t1, a1
+ LD a1, X, 0 * SIZE
+ FABS t2, a2
+ LD a2, X, 1 * SIZE
+ FABS t3, a3
+ add.d X, X, INCX
+ FABS t4, a4
+ NOP
+ FABS t5, a5
+ LD a3, X, 0 * SIZE
+ FABS t6, a6
+ LD a4, X, 1 * SIZE
+ FABS t7, a7
+ add.d X, X, INCX
+ FABS t8, a8
+ NOP
+ ADD t1, t1, t2
+ LD a5, X, 0 * SIZE
+ ADD t3, t3, t4
+ LD a6, X, 1 * SIZE
+ ADD t5, t5, t6
+ add.d X, X, INCX
+ ADD t7, t7, t8
+ NOP
+ CMPLT $fcc0, t1, s1
+ LD a7, X, 0 * SIZE
+ CMPLT $fcc1, t3, s2
+ LD a8, X, 1 * SIZE
+ CMPLT $fcc2, t5, s3
+ add.d X, X, INCX
+ CMPLT $fcc3, t7, s4
+ NOP
+ CMOVT s1, s1, t1, $fcc0
+ addi.d I, I, -1
+ CMOVT s2, s2, t3, $fcc1
+ NOP
+ CMOVT s3, s3, t5, $fcc2
+ CMOVT s4, s4, t7, $fcc3
+ blt $r0, I, .L12
+ NOP
+ .align 3
+
+.L13:
+ FABS t1, a1
+ FABS t2, a2
+ FABS t3, a3
+ FABS t4, a4
+ FABS t5, a5
+ FABS t6, a6
+ FABS t7, a7
+ FABS t8, a8
+ ADD t1, t1, t2
+ ADD t3, t3, t4
+ ADD t5, t5, t6
+ ADD t7, t7, t8
+ CMPLT $fcc0, t1, s1
+ CMPLT $fcc1, t3, s2
+ CMPLT $fcc2, t5, s3
+ CMPLT $fcc3, t7, s4
+ CMOVT s1, s1, t1, $fcc0
+ CMOVT s2, s2, t3, $fcc1
+ CMOVT s3, s3, t5, $fcc2
+ CMOVT s4, s4, t7, $fcc3
+ .align 3
+
+.L15:
+ andi I, N, 3
+ bge $r0, I, .L998
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ addi.d I, I, -1
+ FABS t1, a1
+ FABS t2, a2
+ ADD t1, t1, t2
+ CMPLT $fcc0, t1, s1
+ CMOVT s1, s1, t1, $fcc0
+ add.d X, X, INCX
+ blt $r0, I, .L16
+ .align 3
+
+.L998:
+ CMPLT $fcc0, s2, s1
+ CMPLT $fcc1, s4, s3
+ CMOVT s1, s1, s2, $fcc0
+ CMOVT s3, s3, s4, $fcc1
+ CMPLT $fcc0, s3, s1
+ CMOVT s1, s1, s3, $fcc0
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+ NOP
+
+ EPILOGUE
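Editor's note (not part of the patch): zasum.S below accumulates
sum_i (|Re(x_i)| + |Im(x_i)|) with two partial sums (s1, s2) in a 4-way
unrolled, software-pipelined loop, folding s2 into s1 at .L999. A short C
sketch of the same result, assuming double precision; zasum_ref is an
illustrative name, not part of the patch:

    #include <math.h>

    /* Scalar reference for the kernel's cabs1-sum reduction. */
    static double zasum_ref(long n, const double *x, long incx)
    {
        double s = 0.0;
        for (long i = 0; i < n; i++) {
            const double *p = x + 2 * i * incx;  /* one complex element */
            s += fabs(p[0]) + fabs(p[1]);
        }
        return s;
    }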
diff --git a/kernel/loongarch64/zasum.S b/kernel/loongarch64/zasum.S
new file mode 100644
index 000000000..d1a1a732c
--- /dev/null
+++ b/kernel/loongarch64/zasum.S
@@ -0,0 +1,158 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r17
+#define TEMP $r18
+#define a1 $f23
+#define a2 $f9
+#define a3 $f10
+#define a4 $f11
+#define a5 $f12
+#define a6 $f13
+#define a7 $f14
+#define a8 $f15
+#define t1 $f16
+#define t2 $f17
+#define t3 $f0
+#define t4 $f1
+#define s1 $f22
+#define s2 $f8
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+#endif
+
+ MTC s1, $r0
+ MTC s2, $r0
+ slli.d INCX, INCX, ZBASE_SHIFT
+ srai.d I, N, 2
+ bge $r0, N, .L999
+ bge $r0, I, .L25
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ FABS t1, a1
+ FABS t2, a2
+ LD a7, X, 0 * SIZE
+ LD a8, X, 1 * SIZE
+ FABS t3, a3
+ FABS t4, a4
+ addi.d I, I, -1
+ add.d X, X, INCX
+ bge $r0, I, .L24
+ .align 3
+
+.L23:
+ ADD s1, s1, t1
+ LD a1, X, 0 * SIZE
+ FABS t1, a5
+ addi.d I, I, -1
+ ADD s2, s2, t2
+ LD a2, X, 1 * SIZE
+ FABS t2, a6
+ add.d X, X, INCX
+ ADD s1, s1, t3
+ LD a3, X, 0 * SIZE
+ FABS t3, a7
+ NOP
+ ADD s2, s2, t4
+ LD a4, X, 1 * SIZE
+ FABS t4, a8
+ add.d X, X, INCX
+ ADD s1, s1, t1
+ LD a5, X, 0 * SIZE
+ FABS t1, a1
+ NOP
+ ADD s2, s2, t2
+ LD a6, X, 1 * SIZE
+ FABS t2, a2
+ add.d X, X, INCX
+ ADD s1, s1, t3
+ LD a7, X, 0 * SIZE
+ FABS t3, a3
+ LD a8, X, 1 * SIZE
+ ADD s2, s2, t4
+ add.d X, X, INCX
+ FABS t4, a4
+ blt $r0, I, .L23
+ .align 3
+
+.L24:
+ ADD s1, s1, t1
+ FABS t1, a5
+ ADD s2, s2, t2
+ FABS t2, a6
+ ADD s1, s1, t3
+ FABS t3, a7
+ ADD s2, s2, t4
+ FABS t4, a8
+ ADD s1, s1, t1
+ ADD s2, s2, t2
+ ADD s1, s1, t3
+ ADD s2, s2, t4
+ .align 3
+
+.L25:
+ andi I, N, 3
+ bge $r0, I, .L999
+ .align 3
+
+.L26:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ FABS t1, a1
+ addi.d I, I, -1
+ FABS t2, a2
+ add.d X, X, INCX
+ ADD s1, s1, t1
+ ADD s2, s2, t2
+ blt $r0, I, .L26
+ .align 3
+
+.L999:
+ ADD s1, s1, s2
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
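Editor's note (not part of the patch): zcopy.S below is a strided complex
copy, y_i = x_i. The li TEMP, 2 * SIZE / bne INCX, TEMP check routes
unit-stride vectors to a path that moves four complex elements per
iteration; .L20 handles general strides. An equivalent scalar C sketch,
with an illustrative name:

    /* Scalar reference: copy n complex (2-double) elements. */
    static void zcopy_ref(long n, const double *x, long incx,
                          double *y, long incy)
    {
        for (long i = 0; i < n; i++) {
            y[2 * i * incy]     = x[2 * i * incx];      /* real      */
            y[2 * i * incy + 1] = x[2 * i * incx + 1];  /* imaginary */
        }
    }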
diff --git a/kernel/loongarch64/zcopy.S b/kernel/loongarch64/zcopy.S
new file mode 100644
index 000000000..3fbe56074
--- /dev/null
+++ b/kernel/loongarch64/zcopy.S
@@ -0,0 +1,217 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define Y $r7
+#define INCY $r8
+#define I $r17
+#define TEMP $r18
+#define a1 $f22
+#define a2 $f8
+#define a3 $f23
+#define a4 $f9
+#define a5 $f10
+#define a6 $f11
+#define a7 $f12
+#define a8 $f13
+
+ PROLOGUE
+
+#ifdef F_INTERFACE
+ LDINT N, 0(N)
+ LDINT INCX, 0(INCX)
+ LDINT INCY, 0(INCY)
+#endif
+
+ li TEMP, 2 * SIZE
+ NOP
+ slli.d INCX, INCX, ZBASE_SHIFT
+ bge $r0, N, .L999
+ slli.d INCY, INCY, ZBASE_SHIFT
+ bne INCX, TEMP, .L20
+ srai.d I, N, 2
+ bne INCY, TEMP, .L20
+ addi.d I, I, -1
+ blt I, $r0, .L15
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ LD a3, X, 2 * SIZE
+ LD a4, X, 3 * SIZE
+ LD a5, X, 4 * SIZE
+ LD a6, X, 5 * SIZE
+ LD a7, X, 6 * SIZE
+ LD a8, X, 7 * SIZE
+ bge $r0, I, .L13
+ .align 3
+
+.L12:
+ ST a1, Y, 0 * SIZE
+ LD a1, X, 8 * SIZE
+ ST a2, Y, 1 * SIZE
+ LD a2, X, 9 * SIZE
+ ST a3, Y, 2 * SIZE
+ LD a3, X, 10 * SIZE
+ ST a4, Y, 3 * SIZE
+ LD a4, X, 11 * SIZE
+ ST a5, Y, 4 * SIZE
+ LD a5, X, 12 * SIZE
+ ST a6, Y, 5 * SIZE
+ LD a6, X, 13 * SIZE
+ ST a7, Y, 6 * SIZE
+ LD a7, X, 14 * SIZE
+ ST a8, Y, 7 * SIZE
+ LD a8, X, 15 * SIZE
+ addi.d I, I, -1
+ addi.d X, X, 8 * SIZE
+ addi.d Y, Y, 8 * SIZE
+ blt $r0, I, .L12
+ .align 3
+
+.L13:
+ ST a1, Y, 0 * SIZE
+ ST a2, Y, 1 * SIZE
+ ST a3, Y, 2 * SIZE
+ ST a4, Y, 3 * SIZE
+ ST a5, Y, 4 * SIZE
+ ST a6, Y, 5 * SIZE
+ ST a7, Y, 6 * SIZE
+ ST a8, Y, 7 * SIZE
+ addi.d X, X, 8 * SIZE
+ addi.d Y, Y, 8 * SIZE
+ .align 3
+
+.L15:
+ andi I, N, 3
+ bge $r0, I, .L999
+ .align 3
+
+.L16:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ addi.d X, X, 2 * SIZE
+ addi.d Y, Y, 2 * SIZE
+ ST a1, Y, -2 * SIZE
+ addi.d I, I, -1
+ ST a2, Y, -1 * SIZE
+ blt $r0, I, .L16
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+ NOP
+ .align 3
+
+.L20:
+ srai.d I, N, 2
+ addi.d I, I, -1
+ blt I, $r0, .L25
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a3, X, 0 * SIZE
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a5, X, 0 * SIZE
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ LD a7, X, 0 * SIZE
+ LD a8, X, 1 * SIZE
+ add.d X, X, INCX
+ bge $r0, I, .L23
+ .align 3
+
+.L22:
+ ST a1, Y, 0 * SIZE
+ LD a1, X, 0 * SIZE
+ ST a2, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ ST a3, Y, 0 * SIZE
+ LD a3, X, 0 * SIZE
+ ST a4, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ LD a4, X, 1 * SIZE
+ add.d X, X, INCX
+ ST a5, Y, 0 * SIZE
+ LD a5, X, 0 * SIZE
+ ST a6, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ LD a6, X, 1 * SIZE
+ add.d X, X, INCX
+ ST a7, Y, 0 * SIZE
+ LD a7, X, 0 * SIZE
+ ST a8, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ LD a8, X, 1 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ blt $r0, I, .L22
+ .align 3
+
+.L23:
+ ST a1, Y, 0 * SIZE
+ ST a2, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ ST a3, Y, 0 * SIZE
+ ST a4, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ ST a5, Y, 0 * SIZE
+ ST a6, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ ST a7, Y, 0 * SIZE
+ ST a8, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ .align 3
+
+.L25:
+ andi I, N, 3
+ bge $r0, I, .L999
+ .align 3
+
+.L26:
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ add.d X, X, INCX
+ addi.d I, I, -1
+ ST a1, Y, 0 * SIZE
+ ST a2, Y, 1 * SIZE
+ add.d Y, Y, INCY
+ blt $r0, I, .L26
+ .align 3
+
+.L999:
+ move $r4, $r17
+ fmov.d $f0, $f22
+ jirl $r0, $r1, 0x0
+
+ EPILOGUE
diff --git a/kernel/loongarch64/zdot.S b/kernel/loongarch64/zdot.S
new file mode 100644
index 000000000..087c3845f
--- /dev/null
+++ b/kernel/loongarch64/zdot.S
@@ -0,0 +1,330 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define b1 $f14 +#define b2 $f15 +#define b3 $f16 +#define b4 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + MTC s1, $r0 + MOV s2, s1 + MOV s3, s2 + MOV s4, s3 + slli.d INCX, INCX, ZBASE_SHIFT + li TEMP, 2 * SIZE + slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + LD b2, Y, 1 * SIZE + bge $r0, I, .L14 + .align 3 + +.L13: + MADD s1, b1, a1, s1 + LD a3, X, 2 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 3 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 2 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 3 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 4 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 5 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 4 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 5 * SIZE + MADD s1, b1, a1, s1 + LD a3, X, 6 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 7 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 6 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 7 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 8 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 9 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 8 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 9 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L13 + .align 3 + +.L14: + MADD s1, b1, a1, s1 + LD a3, X, 2 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 3 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 2 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 3 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 4 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 5 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 4 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 5 * SIZE + MADD s1, b1, a1, s1 + LD a3, X, 6 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 7 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 6 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 7 * SIZE + MADD s1, b3, a3, s1 + addi.d X, X, 8 * SIZE + MADD s2, b3, a4, s2 + addi.d Y, Y, 8 * SIZE + MADD s3, b4, a3, s3 + MADD s4, b4, a4, s4 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + LD b2, Y, 1 * SIZE + bge $r0, I, .L17 + .align 3 + +.L16: + MADD s1, b1, a1, s1 + addi.d I, I, -1 + MADD s2, b1, a2, s2 + LD b1, Y, 2 * SIZE + MADD s3, b2, a1, s3 + LD a1, X, 2 * SIZE + MADD s4, b2, a2, s4 + LD a2, X, 3 * SIZE + LD b2, Y, 3 * SIZE + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L16 + .align 3 + +.L17: + MADD s1, b1, a1, s1 + MADD s2, b1, a2, s2 + MADD s3, b2, a1, s3 + MADD s4, b2, a2, s4 + b .L999 + .align 3 + +.L20: +#ifdef F_INTERFACE + bgez INCX, .L21 + addi.d TEMP, N, -1 + mult TEMP, INCX + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + addi.d TEMP, N, -1 + mult TEMP, INCY + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + add.d Y, Y, INCY + bge $r0, I, .L24 + .align 3 + +.L23: + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, 
s4 + LD b4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L23 + .align 3 + +.L24: + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + MADD s1, b3, a3, s1 + add.d X, X, INCX + MADD s2, b3, a4, s2 + add.d Y, Y, INCY + MADD s3, b4, a3, s3 + MADD s4, b4, a4, s4 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + MADD s1, b1, a1, s1 + MADD s2, b1, a2, s2 + MADD s3, b2, a1, s3 + MADD s4, b2, a2, s4 + add.d X, X, INCX + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L26 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 +#else + ADD $f0, s1, s4 +#endif +#ifndef CONJ + ADD $f1, s3, s2 +#else + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zgemm3m_kernel.S b/kernel/loongarch64/zgemm3m_kernel.S new file mode 100644 index 000000000..f9acb6cfc --- /dev/null +++ b/kernel/loongarch64/zgemm3m_kernel.S @@ -0,0 +1,1359 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r11 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 + +#define a1 $f22 +#define a2 $f8 +#define a3 $f28 +#define a4 $f29 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f4 +#define c31 $f2 +#define c32 $f5 +#define c41 $f6 +#define c42 $f7 +#define c51 $f18 +#define c52 $f19 +#define c61 $f20 +#define c62 $f21 +#define c71 $f24 +#define c72 $f25 +#define c81 $f26 +#define c82 $f27 +#define ALPHA_R $f0 +#define ALPHA_I $f1 + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + fst.d $f29, $sp, 88 + slli.d LDC, LDC, ZBASE_SHIFT + srai.d J, N, 3 + bge $r0, J, .L30 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 + add.d C, CO8, LDC +MOV c61, c11 + bge $r0, I, .L20 +.L11: + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD a4, AO, 2 * SIZE + MADD c61, b2, a1, c61 + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, 
a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD a4, AO, 6 * SIZE + MADD c61, b2, a3, c61 + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + 
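+// .L13 is the drain of the software-pipelined K loop: it repeats the
+// 4x-unrolled multiply-accumulate pattern of .L12 one last time without
+// branching back, consuming the operands preloaded on the final pass.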
MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: + andi L, K, 3 + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO2, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO2, 2 * SIZE + MADD $f23, c12, ALPHA_R, $f23 + LD $f13, CO2, 3 * SIZE + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c21, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c22, ALPHA_R, $f12 + ST $f23, CO1, 2 * SIZE + MADD $f13, c22, ALPHA_I, $f13 + ST $f9, CO1, 3 * SIZE + LD $f22, CO3, 0 * SIZE + LD $f8, CO3, 1 * SIZE + LD $f23, CO3, 2 * SIZE + LD $f9, CO3, 3 * SIZE + ST $f10, CO2, 0 * SIZE + ST $f11, CO2, 1 * SIZE + ST $f12, CO2, 2 * SIZE + ST $f13, CO2, 3 * SIZE + LD $f10, CO4, 0 * SIZE + LD $f11, CO4, 1 * SIZE + LD $f12, CO4, 2 * SIZE + LD $f13, CO4, 3 * SIZE + MADD $f22, c31, ALPHA_R, $f22 + MADD $f8, c31, ALPHA_I, $f8 + MADD $f23, c32, ALPHA_R, $f23 + MADD $f9, c32, ALPHA_I, $f9 + MADD $f10, c41, ALPHA_R, $f10 + ST $f22, CO3, 0 * SIZE + MADD $f11, c41, ALPHA_I, $f11 + ST $f8, CO3, 1 * SIZE + MADD $f12, c42, ALPHA_R, $f12 + ST $f23, CO3, 2 * SIZE + MADD $f13, c42, ALPHA_I, $f13 + ST $f9, CO3, 3 * SIZE + LD $f22, CO5, 0 * SIZE + LD $f8, CO5, 1 * SIZE + LD $f23, CO5, 2 * SIZE + LD $f9, CO5, 3 * SIZE + ST $f10, CO4, 0 * SIZE + ST $f11, CO4, 1 * SIZE + ST $f12, CO4, 2 * SIZE + ST $f13, CO4, 3 * SIZE + LD $f10, CO6, 0 * SIZE + LD $f11, CO6, 1 * SIZE + LD $f12, CO6, 2 * SIZE + LD $f13, CO6, 3 * SIZE + MADD $f22, c51, ALPHA_R, $f22 + addi.d CO1,CO1, 4 * SIZE + MADD $f8, c51, ALPHA_I, $f8 + addi.d CO2,CO2, 4 * SIZE + MADD $f23, c52, ALPHA_R, $f23 + addi.d CO3,CO3, 4 * SIZE + MADD $f9, c52, ALPHA_I, $f9 + addi.d CO4,CO4, 4 * SIZE + MADD $f10, c61, ALPHA_R, $f10 + ST $f22, CO5, 0 * SIZE + MADD $f11, c61, ALPHA_I, $f11 + ST $f8, CO5, 1 * SIZE + MADD $f12, c62, ALPHA_R, $f12 + ST $f23, CO5, 2 * SIZE + MADD $f13, c62, ALPHA_I, $f13 + ST $f9, CO5, 3 * SIZE + LD $f22, CO7, 0 * SIZE + LD $f8, CO7, 1 * SIZE + LD $f23, CO7, 2 * SIZE + LD $f9, CO7, 3 * SIZE + ST $f10, CO6, 0 * SIZE + ST $f11, CO6, 1 * SIZE + ST $f12, CO6, 2 * SIZE + ST $f13, CO6, 3 * 
SIZE + LD $f10, CO8, 0 * SIZE + addi.d I, I, -1 + LD $f11, CO8, 1 * SIZE +MTC c11, $r0 + LD $f12, CO8, 2 * SIZE + LD $f13, CO8, 3 * SIZE + MADD $f22, c71, ALPHA_R, $f22 + addi.d CO5,CO5, 4 * SIZE + MADD $f8, c71, ALPHA_I, $f8 + addi.d CO6,CO6, 4 * SIZE + MADD $f23, c72, ALPHA_R, $f23 + addi.d CO7,CO7, 4 * SIZE + MADD $f9, c72, ALPHA_I, $f9 + addi.d CO8,CO8, 4 * SIZE + MADD $f10, c81, ALPHA_R, $f10 + ST $f22, CO7, -4 * SIZE + MADD $f11, c81, ALPHA_I, $f11 + ST $f8, CO7, -3 * SIZE + MADD $f12, c82, ALPHA_R, $f12 + ST $f23, CO7, -2 * SIZE + MADD $f13, c82, ALPHA_I, $f13 + ST $f9, CO7, -1 * SIZE + ST $f10, CO8, -4 * SIZE + MOV c21, c11 + ST $f11, CO8, -3 * SIZE + MOV c31, c11 + ST $f12, CO8, -2 * SIZE + MOV c41, c11 + ST $f13, CO8, -1 * SIZE + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: + andi L, K, 3 + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + LD $f10, CO3, 
0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO3, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO4, 0 * SIZE + MADD $f23, c21, ALPHA_R, $f23 + LD $f13, CO4, 1 * SIZE + MADD $f9, c21, ALPHA_I, $f9 + MADD $f10, c31, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c31, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c41, ALPHA_R, $f12 + ST $f23, CO2, 0 * SIZE + MADD $f13, c41, ALPHA_I, $f13 + ST $f9, CO2, 1 * SIZE + LD $f22, CO5, 0 * SIZE + LD $f8, CO5, 1 * SIZE + LD $f23, CO6, 0 * SIZE + LD $f9, CO6, 1 * SIZE + ST $f10, CO3, 0 * SIZE + ST $f11, CO3, 1 * SIZE + ST $f12, CO4, 0 * SIZE + ST $f13, CO4, 1 * SIZE + LD $f10, CO7, 0 * SIZE + MADD $f22, c51, ALPHA_R, $f22 + LD $f11, CO7, 1 * SIZE + MADD $f8, c51, ALPHA_I, $f8 + LD $f12, CO8, 0 * SIZE + MADD $f23, c61, ALPHA_R, $f23 + LD $f13, CO8, 1 * SIZE + MADD $f9, c61, ALPHA_I, $f9 + MADD $f10, c71, ALPHA_R, $f10 + ST $f22, CO5, 0 * SIZE + MADD $f11, c71, ALPHA_I, $f11 + ST $f8, CO5, 1 * SIZE + MADD $f12, c81, ALPHA_R, $f12 + ST $f23, CO6, 0 * SIZE + MADD $f13, c81, ALPHA_I, $f13 + ST $f9, CO6, 1 * SIZE + ST $f10, CO7, 0 * SIZE + ST $f11, CO7, 1 * SIZE + ST $f12, CO8, 0 * SIZE + ST $f13, CO8, 1 * SIZE + .align 3 + +.L29: +move B, BO + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + add.d C, CO4, LDC + MOV c31, c11 + srai.d I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +.L31: + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: + andi L, K, 3 + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * 
SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + LD $f11, CO2, 1 * SIZE + LD $f12, CO2, 2 * SIZE + LD $f13, CO2, 3 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c12, ALPHA_R, $f23 + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c21, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c22, ALPHA_R, $f12 + ST $f23, CO1, 2 * SIZE + MADD $f13, c22, ALPHA_I, $f13 + ST $f9, CO1, 3 * SIZE + LD $f22, CO3, 0 * SIZE + LD $f8, CO3, 1 * SIZE + LD $f23, CO3, 2 * SIZE + LD $f9, CO3, 3 * SIZE + ST $f10, CO2, 0 * SIZE + MADD $f22, c31, ALPHA_R, $f22 + ST $f11, CO2, 1 * SIZE + MADD $f8, c31, ALPHA_I, $f8 + ST $f12, CO2, 2 * SIZE + MADD $f23, c32, ALPHA_R, $f23 + ST $f13, CO2, 3 * SIZE + MADD $f9, c32, ALPHA_I, $f9 + LD $f10, CO4, 0 * SIZE + LD $f11, CO4, 1 * SIZE + LD $f12, CO4, 2 * SIZE + LD $f13, CO4, 3 * SIZE + MADD $f10, c41, ALPHA_R, $f10 + addi.d CO1,CO1, 4 * SIZE + MADD $f11, c41, ALPHA_I, $f11 + addi.d CO2,CO2, 4 * SIZE + MADD $f12, c42, ALPHA_R, $f12 + addi.d CO3,CO3, 4 * SIZE + MADD $f13, c42, ALPHA_I, $f13 + addi.d CO4,CO4, 4 * SIZE + ST $f22, CO3, -4 * SIZE + addi.d I, I, -1 + ST $f8, CO3, -3 * SIZE + ST $f23, CO3, -2 * SIZE + ST $f9, CO3, -1 * SIZE + ST $f10, CO4, -4 * SIZE +MTC c11, $r0 + ST $f11, CO4, -3 * SIZE + MOV c21, c11 + ST $f12, CO4, -2 * SIZE + MOV c31, c11 + ST $f13, CO4, -1 * SIZE +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L45 + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: + andi L, K, 3 + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + LD $f10, CO3, 0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO3, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO4, 0 * 
SIZE + MADD $f23, c21, ALPHA_R, $f23 + LD $f13, CO4, 1 * SIZE + MADD $f9, c21, ALPHA_I, $f9 + MADD $f10, c31, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c31, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c41, ALPHA_R, $f12 + ST $f23, CO2, 0 * SIZE + MADD $f13, c41, ALPHA_I, $f13 + ST $f9, CO2, 1 * SIZE + ST $f10, CO3, 0 * SIZE + ST $f11, CO3, 1 * SIZE + ST $f12, CO4, 0 * SIZE + ST $f13, CO4, 1 * SIZE + .align 3 + +.L49: + move B, BO + .align 3 + +.L50: + andi J, N, 2 +move AO, A + bge $r0, J, .L70 + move CO1, C + add.d CO2, C, LDC + srai.d I, M, 1 +add.d C, CO2, LDC + bge $r0, I, .L60 +.L51: + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: + andi L, K, 3 + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + LD $f11, CO2, 1 * SIZE + LD $f12, CO2, 2 * SIZE + LD $f13, CO2, 3 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + addi.d I, I, -1 + MADD $f8, c11, ALPHA_I, $f8 + addi.d CO1,CO1, 4 * SIZE + MADD $f23, c12, ALPHA_R, $f23 + addi.d CO2,CO2, 4 * SIZE + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + MADD $f11, c21, ALPHA_I, $f11 + MADD $f12, c22, ALPHA_R, $f12 + MADD $f13, c22, ALPHA_I, $f13 + ST $f22, CO1, -4 * SIZE + ST $f8, CO1, -3 * SIZE + ST $f23, CO1, -2 * SIZE + ST $f9, CO1, -1 * SIZE + ST $f10, CO2, -4 * SIZE + ST $f11, CO2, -3 * SIZE + ST $f12, CO2, -2 * SIZE + ST $f13, CO2, -1 * SIZE + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 + srai.d L, K, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD 
c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: + andi L, K, 3 + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + ADD c11, c11, c31 + ADD c21, c21, c41 + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c21, ALPHA_R, $f23 + MADD $f9, c21, ALPHA_I, $f9 + ST $f22, CO1, 0 * SIZE + ST $f8, CO1, 1 * SIZE + ST $f23, CO2, 0 * SIZE + ST $f9, CO2, 1 * SIZE + .align 3 + +.L69: + move B, BO + .align 3 + +.L70: + andi J, N, 1 +move AO, A + bge $r0, J, .L999 + move CO1, C + srai.d I, M, 1 +add.d C, CO1, LDC + bge $r0, I, .L80 +.L71: + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: + andi L, K, 3 + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + ADD c11, c11, c21 + addi.d I, I, -1 + ADD c12, c12, c22 + addi.d CO1,CO1, 4 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c12, ALPHA_R, $f23 + MADD $f9, c12, ALPHA_I, $f9 + ST $f22, CO1, -4 * SIZE + ST $f8, CO1, -3 * SIZE + ST $f23, CO1, -2 * SIZE + ST $f9, CO1, -1 * SIZE + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L85 + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: + andi L, K, 3 + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, 
-1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + ADD c11, c11, c21 + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + ST $f22, CO1, 0 * SIZE + ST $f8, CO1, 1 * SIZE + .align 3 + +.L89: + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + fld.d $f29, $sp, 88 + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zgemm_kernel.S b/kernel/loongarch64/zgemm_kernel.S new file mode 100644 index 000000000..2d50d41a5 --- /dev/null +++ b/kernel/loongarch64/zgemm_kernel.S @@ -0,0 +1,1047 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 + +#if defined(TRMMKERNEL) +#define OFFSET $r11 +#define KK $r26 +#define TEMP $r27 +#endif + +#define a1 $f22 +#define a2 $f8 +#define a3 $f28 +#define a4 $f29 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f4 +#define c31 $f2 +#define c32 $f5 +#define c41 $f6 +#define c42 $f7 +#define c51 $f18 +#define c52 $f19 +#define c61 $f20 +#define c62 $f21 +#define c71 $f24 +#define c72 $f25 +#define c81 $f26 +#define c82 $f27 +#define ALPHA_R $f0 +#define ALPHA_I $f1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 64 + fst.d $f24, $sp, 16 + fst.d $f25, $sp, 24 + fst.d $f26, $sp, 32 + fst.d $f27, $sp, 40 + fst.d $f28, $sp, 48 + fst.d $f29, $sp, 56 +#if defined(TRMMKERNEL) + SDARG $r26, $sp, 72 + SDARG $r27, $sp, 80 +#endif +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#if defined(TRMMKERNEL) && !defined(LEFT) + sub.d KK, $r0, OFFSET +#endif + srai.d J, N, 2 +nop + bge $r0, J, .L20 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + MOV c41, c11 + MOV c51, c11 + move I, M + add.d C, CO4, LDC + MOV c61, c11 + bge $r0, I, .L19 +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 4 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + 
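+// c11..c82 are the eight complex accumulators (real/imaginary partial
+// sums); the MADD1..MADD4 aliases defined above select the sign pattern
+// for the NN/NR/RN/RR conjugation variants of the complex product.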
MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 
* SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + LD b3, CO2, 0 * SIZE + ADD c31, c31, c42 + LD b4, CO2, 1 * SIZE + ADD c32, c32, c41 + LD b5, CO3, 0 * SIZE + ADD c51, c51, c62 + LD b6, CO3, 1 * SIZE + ADD c52, c52, c61 + LD b7, CO4, 0 * SIZE + ADD c71, c71, c82 + LD b8, CO4, 1 * SIZE + ADD c72, c72, c81 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d CO2,CO2, 2 * SIZE + MADD b3, c31, ALPHA_R, b3 + addi.d CO3,CO3, 2 * SIZE + MADD b4, c32, ALPHA_R, b4 + addi.d CO4,CO4, 2 * SIZE + MADD b5, c51, ALPHA_R, b5 + addi.d I, I, -1 + MADD b6, c52, ALPHA_R, b6 + MADD b7, c71, ALPHA_R, b7 + MADD b8, c72, ALPHA_R, b8 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, 
b4 + ST b1, CO1, -2 * SIZE + NMSUB b5, c52, ALPHA_I, b5 + ST b2, CO1, -1 * SIZE + MADD b6, c51, ALPHA_I, b6 + ST b3, CO2, -2 * SIZE + NMSUB b7, c72, ALPHA_I, b7 + ST b4, CO2, -1 * SIZE + MADD b8, c71, ALPHA_I, b8 + ST b5, CO3, -2 * SIZE + MOV c21, c11 + ST b6, CO3, -1 * SIZE + MOV c31, c11 + ST b7, CO4, -2 * SIZE + MOV c41, c11 + ST b8, CO4, -1 * SIZE + MOV c51, c11 +#else + ADD c11, c11, c22 + addi.d CO1,CO1, 2 * SIZE + ADD c12, c12, c21 + addi.d CO2,CO2, 2 * SIZE + ADD c31, c31, c42 + addi.d CO3,CO3, 2 * SIZE + ADD c32, c32, c41 + addi.d CO4,CO4, 2 * SIZE + ADD c51, c51, c62 + addi.d I, I, -1 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 + MUL b1, ALPHA_R, c11 + MUL b2, ALPHA_R, c12 + MUL b3, ALPHA_R, c31 + MUL b4, ALPHA_R, c32 + MUL b5, ALPHA_R, c51 + MUL b6, ALPHA_R, c52 + MUL b7, ALPHA_R, c71 + MUL b8, ALPHA_R, c72 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + NMSUB b5, c52, ALPHA_I, b5 + ST b2, CO1, -1 * SIZE + MADD b6, c51, ALPHA_I, b6 + ST b3, CO2, -2 * SIZE + NMSUB b7, c72, ALPHA_I, b7 + ST b4, CO2, -1 * SIZE + MADD b8, c71, ALPHA_I, b8 + ST b5, CO3, -2 * SIZE + MOV c21, c11 + ST b6, CO3, -1 * SIZE + MOV c31, c11 + ST b7, CO4, -2 * SIZE + MOV c41, c11 + ST b8, CO4, -1 * SIZE + MOV c51, c11 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -4 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 4 +#endif +move B, BO + blt $r0, J, .L10 + .align 3 + +.L20: + andi J, N, 2 + MTC c11, $r0 +move CO1, C + bge $r0, J, .L30 + add.d CO2, C, LDC + add.d C, CO2, LDC +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + move I, M +move AO, A + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 2 +#endif + srai.d L, TEMP, 2 +MOV c42, c11 + bge $r0, L, .L25 +#else + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, K, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, 
c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + LD b3, CO2, 0 * SIZE + ADD c31, c31, c42 + LD b4, CO2, 1 * SIZE + ADD c32, c32, c41 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d CO2,CO2, 2 * SIZE + MADD b3, c31, ALPHA_R, b3 + addi.d I, I, -1 + MADD b4, c32, ALPHA_R, b4 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + ST b3, CO2, -2 * SIZE +#else + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + MUL b1, ALPHA_R, c11 + addi.d CO1,CO1, 2 * SIZE + MUL b2, ALPHA_R, c12 + addi.d CO2,CO2, 2 * SIZE + MUL b3, ALPHA_R, c31 + addi.d I, I, -1 + MUL b4, ALPHA_R, c32 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + ST b3, CO2, -2 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -2 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + ST b4, CO2, -1 * SIZE + blt $r0, I, .L21 + .align 3 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 2 +#endif + move B, BO + .align 3 + +.L30: + andi J, N, 1 + MTC c11, $r0 +move CO1, C + bge $r0, J, .L999 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + move I, M + add.d C, CO1, LDC +move AO, A + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c21, 
c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 1 +#endif + srai.d L, TEMP, 2 +MOV c42, c11 + bge $r0, L, .L35 +#else + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, K, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d I, I, -1 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + blt $r0, I, .L31 +#else + ADD c11, c11, c22 + ADD c12, c12, c21 + MUL b1, ALPHA_R, c11 + addi.d CO1,CO1, 2 * SIZE + MUL b2, ALPHA_R, c12 + addi.d I, I, -1 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -1 +#endif + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + blt $r0, I, .L31 +#endif + .align 3 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 1 +#endif + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 64 + fld.d $f24, $sp, 16 + fld.d $f25, $sp, 24 + fld.d $f26, $sp, 32 + fld.d $f27, $sp, 40 + fld.d $f28, $sp, 48 + fld.d $f29, $sp, 56 +#if defined(TRMMKERNEL) + LDARG $r26, $sp, 72 + LDARG $r27, $sp, 80 +#endif +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + fmov.d $f1, $f23 + jirl $r0, $r1, 0x0 + + EPILOGUE diff 
--git a/kernel/loongarch64/zgemv_n.S b/kernel/loongarch64/zgemv_n.S new file mode 100644 index 000000000..0cc49c789 --- /dev/null +++ b/kernel/loongarch64/zgemv_n.S @@ -0,0 +1,648 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r17 + +#define YORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 + +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define x1 $f14 +#define x2 $f15 +#define x3 $f16 +#define x4 $f17 +#define y1 $f3 +#define y2 $f4 +#define y3 $f2 +#define y4 $f5 +#define t1 $f6 +#define t2 $f7 +#define t3 $f18 +#define t4 $f19 +#define t5 $f20 +#define t6 $f21 +#define t7 $f24 +#define t8 $f25 + +#if !defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#if defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif +#if !defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif +#if defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifndef __64BIT__ + addi.d $sp, $sp, -64 +#else + addi.d $sp, $sp, -32 +#endif + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + fst.d $f24, $sp, 16 + fst.d $f25, $sp, 24 +#ifndef __64BIT__ + fst.d $f18, $sp, 32 + fst.d $f19, $sp, 40 + fst.d $f20, $sp, 48 + fst.d $f21, $sp, 56 +#endif + slli.d LDA, LDA, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, M, .L999 + 
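+// LDA/INCX/INCY are scaled to byte strides via ZBASE_SHIFT (one complex
+// element = 2 * SIZE); M <= 0 or N <= 0 exits immediately through .L999.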
slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + li I, 2 * SIZE + move YORIG, Y + beq INCY, I, .L10 + srai.d I, M, 2 + move YORIG, BUFFER + move XX, Y + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCY + LD a3, XX, 0 * SIZE + LD a4, XX, 1 * SIZE + add.d XX, XX, INCY + LD a5, XX, 0 * SIZE + LD a6, XX, 1 * SIZE + add.d XX, XX, INCY + LD a7, XX, 0 * SIZE + LD a8, XX, 1 * SIZE + add.d XX, XX, INCY + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + ST a1, YY, -8 * SIZE + ST a2, YY, -7 * SIZE + ST a3, YY, -6 * SIZE + ST a4, YY, -5 * SIZE + ST a5, YY, -4 * SIZE + ST a6, YY, -3 * SIZE + ST a7, YY, -2 * SIZE + ST a8, YY, -1 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCY + addi.d I, I, -1 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + addi.d YY, YY, 2 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + bge $r0, J, .L20 + .align 3 + +.L11: + LD x1, X, 0 * SIZE + LD x2, X, 1 * SIZE + add.d X, X, INCX + LD x3, X, 0 * SIZE + LD x4, X, 1 * SIZE + add.d X, X, INCX + MUL a1, ALPHA_R, x1 + move AO1, A + MUL a2, ALPHA_I, x1 + add.d AO2, A, LDA + MUL a3, ALPHA_R, x3 + add.d A, AO2, LDA + MUL a4, ALPHA_I, x3 +#ifndef XCONJ + NMSUB x1, x2, ALPHA_I, a1 + MADD x2, x2, ALPHA_R, a2 + NMSUB x3, x4, ALPHA_I, a3 + MADD x4, x4, ALPHA_R, a4 +#else + MADD x1, x2, ALPHA_I, a1 + MSUB x2, x2, ALPHA_R, a2 + MADD x3, x4, ALPHA_I, a3 + MSUB x4, x4, ALPHA_R, a4 +#endif + srai.d I, M, 2 + move YY, YORIG + bge $r0, I, .L15 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a2, AO1, 1 * SIZE + LD y4, YY, 3 * SIZE + LD a4, AO1, 3 * SIZE + LD a5, AO2, 0 * SIZE + LD a6, AO2, 1 * SIZE + LD a7, AO2, 2 * SIZE + LD a8, AO2, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 4 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 5 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 6 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 6 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 5 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 7 * SIZE + MADD4 t4, a4, x1, t4 + LD a4, AO1, 7 * SIZE + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + LD a5, AO2, 4 * SIZE + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + LD a7, AO2, 6 * SIZE + MADD3 t1, a6, x4, t1 + MADD4 t2, a6, x3, t2 + LD a6, AO2, 5 * SIZE + MADD3 t3, a8, x4, t3 + addi.d I, I, -1 + MADD4 t4, a8, x3, t4 + LD a8, AO2, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + MADD1 t5, a1, x1, y1 + LD y1, YY, 8 * SIZE + MADD2 t6, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 t7, a3, x1, y3 + LD y2, YY, 9 * SIZE + MADD2 t8, a3, x2, y4 + LD a3, AO1, 10 * SIZE + MADD3 t5, a2, x2, t5 + LD y3, YY, 10 * SIZE + MADD4 t6, a2, x1, t6 + LD a2, AO1, 9 * SIZE + MADD3 t7, a4, x2, t7 + LD y4, YY, 11 * SIZE + MADD4 t8, a4, x1, t8 + LD a4, AO1, 11 * SIZE + MADD1 t5, a5, x3, t5 + ST t1, YY, 0 * SIZE + MADD2 t6, a5, x4, t6 + LD a5, AO2, 8 * SIZE + MADD1 t7, a7, x3, t7 + ST t2, YY, 1 * SIZE + MADD2 t8, a7, x4, t8 + LD a7, AO2, 10 * SIZE + MADD3 t5, a6, x4, t5 + ST t3, YY, 2 * SIZE + MADD4 t6, a6, x3, t6 + LD a6, AO2, 9 * SIZE + MADD3 t7, a8, x4, t7 + ST t4, YY, 3 * SIZE + MADD4 t8, a8, x3, t8 + LD a8, AO2, 11 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 12 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 12 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 13 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 14 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 14 * SIZE +
MADD4 t2, a2, x1, t2 + LD a2, AO1, 13 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 15 * SIZE + MADD4 t4, a4, x1, t4 + LD a4, AO1, 15 * SIZE + MADD1 t1, a5, x3, t1 + ST t5, YY, 4 * SIZE + MADD2 t2, a5, x4, t2 + LD a5, AO2, 12 * SIZE + MADD1 t3, a7, x3, t3 + ST t6, YY, 5 * SIZE + MADD2 t4, a7, x4, t4 + LD a7, AO2, 14 * SIZE + MADD3 t1, a6, x4, t1 + ST t7, YY, 6 * SIZE + MADD4 t2, a6, x3, t2 + LD a6, AO2, 13 * SIZE + MADD3 t3, a8, x4, t3 + ST t8, YY, 7 * SIZE + MADD4 t4, a8, x3, t4 + LD a8, AO2, 15 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d AO2, AO2, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST t1, YY, 0 * SIZE + MADD1 t1, a1, x1, y1 + ST t2, YY, 1 * SIZE + MADD2 t2, a1, x2, y2 + ST t3, YY, 2 * SIZE + MADD1 t3, a3, x1, y3 + ST t4, YY, 3 * SIZE + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + MADD3 t1, a6, x4, t1 + addi.d AO1, AO1, 8 * SIZE + MADD4 t2, a6, x3, t2 + addi.d AO2, AO2, 8 * SIZE + MADD3 t3, a8, x4, t3 + addi.d YY, YY, 8 * SIZE + MADD4 t4, a8, x3, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L15: + andi I, M, 2 + bge $r0, I, .L16 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD a5, AO2, 0 * SIZE + MADD2 t2, a1, x2, y2 + LD a6, AO2, 1 * SIZE + MADD1 t3, a3, x1, y3 + LD a7, AO2, 2 * SIZE + MADD2 t4, a3, x2, y4 + LD a8, AO2, 3 * SIZE + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + MADD3 t1, a6, x4, t1 + addi.d YY, YY, 4 * SIZE + MADD4 t2, a6, x3, t2 + addi.d AO1, AO1, 4 * SIZE + MADD3 t3, a8, x4, t3 + addi.d AO2, AO2, 4 * SIZE + MADD4 t4, a8, x3, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L16: + andi I, M, 1 + bge $r0, I, .L19 + LD y1, YY, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO1, 1 * SIZE + MADD1 t1, a1, x1, y1 + LD a5, AO2, 0 * SIZE + MADD2 t2, a1, x2, y2 + LD a6, AO2, 1 * SIZE + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD3 t1, a6, x4, t1 + MADD4 t2, a6, x3, t2 + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + .align 3 + +.L19: + addi.d J, J, -1 + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + bge $r0, J, .L900 + LD x1, X, 0 * SIZE + LD x2, X, 1 * SIZE + add.d X, X, INCX + MUL a1, ALPHA_R, x1 + move AO1, A + MUL a2, ALPHA_I, x1 +#ifndef XCONJ + NMSUB x1, x2, ALPHA_I, a1 + MADD x2, x2, ALPHA_R, a2 +#else + MADD x1, x2, ALPHA_I, a1 + MSUB x2, x2, ALPHA_R, a2 +#endif + srai.d I, M, 2 + move YY, YORIG + bge $r0, I, .L25 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a2, AO1, 1 * SIZE + LD y4, YY, 3 * SIZE + LD a4, AO1, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 4 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 5 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 6 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 6 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 5 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 7 * SIZE + MADD4 t4, a4, x1, t4 + addi.d I, I, -1 + LD a4, AO1, 7 * SIZE + bge $r0, I, .L23 +
.align 3 +.L22: + MADD1 t5, a1, x1, y1 + LD y1, YY, 8 * SIZE + MADD2 t6, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 t7, a3, x1, y3 + LD y2, YY, 9 * SIZE + MADD2 t8, a3, x2, y4 + LD a3, AO1, 10 * SIZE + MADD3 t5, a2, x2, t5 + LD y3, YY, 10 * SIZE + MADD4 t6, a2, x1, t6 + LD a2, AO1, 9 * SIZE + MADD3 t7, a4, x2, t7 + LD y4, YY, 11 * SIZE + MADD4 t8, a4, x1, t8 + LD a4, AO1, 11 * SIZE + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + ST t3, YY, 2 * SIZE + ST t4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 12 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 12 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 13 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 14 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 14 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 13 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 15 * SIZE + MADD4 t4, a4, x1, t4 + LD a4, AO1, 15 * SIZE + ST t5, YY, 4 * SIZE + ST t6, YY, 5 * SIZE + ST t7, YY, 6 * SIZE + ST t8, YY, 7 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + ST t1, YY, 0 * SIZE + MADD1 t1, a1, x1, y1 + ST t2, YY, 1 * SIZE + MADD2 t2, a1, x2, y2 + ST t3, YY, 2 * SIZE + MADD1 t3, a3, x1, y3 + ST t4, YY, 3 * SIZE + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + addi.d AO1, AO1, 8 * SIZE + MADD4 t2, a2, x1, t2 + addi.d YY, YY, 8 * SIZE + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L25: + andi I, M, 2 + bge $r0, I, .L26 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + MADD2 t2, a1, x2, y2 + MADD1 t3, a3, x1, y3 + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + addi.d YY, YY, 4 * SIZE + MADD4 t2, a2, x1, t2 + addi.d AO1, AO1, 4 * SIZE + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L26: + andi I, M, 1 + bge $r0, I, .L900 + LD y1, YY, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO1, 1 * SIZE + MADD1 t1, a1, x1, y1 + MADD2 t2, a1, x2, y2 + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + .align 3 + +.L900: + li YORIG, 2 * SIZE + srai.d I, M, 2 + beq INCY, YORIG, .L999 + move XX, BUFFER + bge $r0, I, .L905 + .align 3 + +.L902: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + LD a3, XX, 2 * SIZE + LD a4, XX, 3 * SIZE + LD a5, XX, 4 * SIZE + LD a6, XX, 5 * SIZE + LD a7, XX, 6 * SIZE + LD a8, XX, 7 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + ST a5, Y, 0 * SIZE + ST a6, Y, 1 * SIZE + add.d Y, Y, INCY + ST a7, Y, 0 * SIZE + ST a8, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d XX, XX, 8 * SIZE + blt $r0, I, .L902 + .align 3 + +.L905: + andi I, M, 3 + bge $r0, I, .L999 + .align 3 + +.L906: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + addi.d XX, XX, 2 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L906 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + fld.d $f24, $sp, 16 + fld.d $f25, $sp, 24 +#ifndef __64BIT__ + fld.d $f18, $sp, 32 + fld.d $f19, $sp, 40 + fld.d $f20, $sp, 48 + fld.d $f21, $sp, 56 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 32 +#else + addi.d $sp, $sp, 64 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff 
--git a/kernel/loongarch64/zgemv_t.S b/kernel/loongarch64/zgemv_t.S new file mode 100644 index 000000000..85a9a0c0d --- /dev/null +++ b/kernel/loongarch64/zgemv_t.S @@ -0,0 +1,556 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r17 + +#define XORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 + +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define y1 $f14 +#define y2 $f15 +#define y3 $f16 +#define y4 $f17 +#define x1 $f3 +#define x2 $f4 +#define x3 $f2 +#define x4 $f5 +#define x5 $f6 +#define x6 $f7 +#define x7 $f18 +#define x8 $f19 + +#if !defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#if defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif +#if !defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif +#if defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifdef __64BIT__ + addi.d $sp, $sp, -16 +#else + addi.d $sp, $sp, -32 +#endif + MTC y1, $r0 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + slli.d LDA, LDA, ZBASE_SHIFT +#ifndef __64BIT__ + fst.d $f18, $sp, 16 + fst.d $f19, $sp, 24 +#endif + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + li I, 2 * SIZE + move XORIG, X + beq INCX, I, .L10 + srai.d I, M, 2 + move XORIG, BUFFER + 
move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + ST a1, YY, -8 * SIZE + ST a2, YY, -7 * SIZE + ST a3, YY, -6 * SIZE + ST a4, YY, -5 * SIZE + ST a5, YY, -4 * SIZE + ST a6, YY, -3 * SIZE + ST a7, YY, -2 * SIZE + ST a8, YY, -1 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 2 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + move YY, Y + bge $r0, J, .L20 + .align 3 + +.L11: + move AO1, A + MOV y2, y1 + add.d AO2, A, LDA + MOV y3, y1 + add.d A, AO2, LDA + MOV y4, y1 + srai.d I, M, 2 + move XX, XORIG + bge $r0, I, .L15 + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD x4, XX, 3 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD a7, AO2, 2 * SIZE + LD a6, AO1, 3 * SIZE + LD a8, AO2, 3 * SIZE + addi.d I, I, -1 + bge $r0, I, .L13 + .align 3 +.L12: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 4 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + LD a7, AO2, 6 * SIZE + MADD3 y1, a6, x4, y1 + addi.d I, I, -1 + MADD4 y2, a6, x3, y2 + LD a6, AO1, 7 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 8 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 9 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 9 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 9 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 8 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 10 * SIZE + MADD1 y3, a7, x3, y3 + addi.d XX, XX, 8 * SIZE + MADD2 y4, a7, x4, y4 + LD a7, AO2, 10 * SIZE + MADD3 y1, a6, x4, y1 + addi.d AO2, AO2, 8 * SIZE + MADD4 y2, a6, x3, y2 + LD a6, AO1, 11 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 3 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 3 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 4 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + LD a7, AO2, 6 * SIZE + MADD3 y1, a6, x4, y1 + MADD4 y2, a6, x3, y2 + LD a6, AO1, 7 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + 
MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + MADD3 y1, a6, x4, y1 + addi.d XX, XX, 8 * SIZE + MADD4 y2, a6, x3, y2 + addi.d AO1, AO1, 8 * SIZE + MADD3 y3, a8, x4, y3 + addi.d AO2, AO2, 8 * SIZE + MADD4 y4, a8, x3, y4 + .align 3 + +.L15: + andi I, M, 2 + bge $r0, I, .L17 + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD x3, XX, 2 * SIZE + LD x4, XX, 3 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD a7, AO2, 2 * SIZE + LD a6, AO1, 3 * SIZE + LD a8, AO2, 3 * SIZE + MADD1 y1, a1, x1, y1 + MADD2 y2, a1, x2, y2 + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + MADD3 y1, a6, x4, y1 + addi.d XX, XX, 4 * SIZE + MADD4 y2, a6, x3, y2 + addi.d AO1, AO1, 4 * SIZE + MADD3 y3, a8, x4, y3 + addi.d AO2, AO2, 4 * SIZE + MADD4 y4, a8, x3, y4 + .align 3 + +.L17: + andi I, M, 1 +.align 3 + + bge $r0, I, .L19 +.L18: + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + MADD1 y1, a1, x1, y1 + LD a2, AO1, 1 * SIZE + MADD2 y2, a1, x2, y2 + LD a4, AO2, 1 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + .align 3 + +.L19: + LD a1, Y, 0 * SIZE + LD a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + add.d Y, Y, INCY + MADD a1, y1, ALPHA_R, a1 + MADD a2, y1, ALPHA_I, a2 + MADD a3, y3, ALPHA_R, a3 + MADD a4, y3, ALPHA_I, a4 + NMSUB a1, y2, ALPHA_I, a1 + MADD a2, y2, ALPHA_R, a2 + NMSUB a3, y4, ALPHA_I, a3 + MTC y1, $r0 + MADD a4, y4, ALPHA_R, a4 + addi.d J, J, -1 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + add.d YY, YY, INCY + ST a3, YY, 0 * SIZE + ST a4, YY, 1 * SIZE + add.d YY, YY, INCY + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + MOV y2, y1 + srai.d I, M, 2 + bge $r0, J, .L999 + MOV y3, y1 + move AO1, A + MOV y4, y1 + move XX, XORIG + bge $r0, I, .L25 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x4, XX, 3 * SIZE + addi.d I, I, -1 + LD a6, AO1, 3 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a6, x3, y4 + LD a6, AO1, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 9 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 9 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 8 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 10 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 11 * SIZE + MADD4 y4, a6, x3, y4 + LD a6, AO1, 11 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 7 * SIZE 
+ MADD4 y4, a6, x3, y4 + LD a6, AO1, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + MADD3 y3, a2, x2, y3 + MADD4 y4, a2, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD3 y3, a6, x4, y3 + addi.d XX, XX, 8 * SIZE + MADD4 y4, a6, x3, y4 + addi.d AO1, AO1, 8 * SIZE + .align 3 + +.L25: + andi I, M, 2 + bge $r0, I, .L27 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a6, AO1, 3 * SIZE + MADD3 y3, a2, x2, y3 + LD x4, XX, 3 * SIZE + MADD4 y4, a2, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD3 y3, a6, x4, y3 + addi.d XX, XX, 4 * SIZE + MADD4 y4, a6, x3, y4 + addi.d AO1, AO1, 4 * SIZE + .align 3 + +.L27: + andi I, M, 1 +.align 3 + + bge $r0, I, .L29 +.L28: + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + MADD1 y1, a1, x1, y1 + MADD2 y2, a1, x2, y2 + MADD3 y3, a2, x2, y3 + MADD4 y4, a2, x1, y4 + .align 3 + +.L29: + LD a1, Y, 0 * SIZE + LD a2, Y, 1 * SIZE + ADD y1, y1, y3 + ADD y2, y2, y4 + MADD a1, y1, ALPHA_R, a1 + MADD a2, y1, ALPHA_I, a2 + NMSUB a1, y2, ALPHA_I, a1 + MADD a2, y2, ALPHA_R, a2 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 +#ifndef __64BIT__ + fld.d $f18, $sp, 16 + fld.d $f19, $sp, 24 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 16 +#else + addi.d $sp, $sp, 32 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2.S b/kernel/loongarch64/znrm2.S new file mode 100644 index 000000000..49f640268 --- /dev/null +++ b/kernel/loongarch64/znrm2.S @@ -0,0 +1,304 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r7 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define ALPHA $f4 +#define max $f5 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + move XX, X + MOV s2, s1 + srai.d I, N, 2 + MOV s3, s1 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + NOP + FABS t3, a3 + LD a2, X, 1 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + NOP + CMPLT $fcc2, s3, t3 + LD a4, X, 1 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + NOP + FABS t3, a7 + LD a6, X, 1 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + NOP + CMPLT $fcc2, s3, t3 + LD a8, X, 1 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L100 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + FABS t1, a1 + FABS t2, a2 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L100: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + lu12i.w TEMP, 0x3f800 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, s1, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, s1 + MOV max, s1 + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 + MOV s4, a1 + srai.d I, N, 2 + bge $r0, I, .L105 + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, XX, 0 * SIZE + LD a4, XX, 1 * SIZE + add.d XX, XX, INCX + LD a5, XX, 0 * SIZE + LD a6, XX, 1 * SIZE + add.d XX, XX, INCX + LD a7, XX, 0 * SIZE + LD a8, XX, 1 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + bge $r0, I, .L104 + .align 3 + +.L103: + MUL t1, ALPHA, a1 + LD a1, XX, 0 * SIZE + MUL t2, ALPHA, a2 + addi.d I, I, -1 + MUL t3, ALPHA, a3 + 
LD a2, XX, 1 * SIZE + MUL t4, ALPHA, a4 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a3, XX, 0 * SIZE + MADD s2, t2, t2, s2 + NOP + MADD s3, t3, t3, s3 + LD a4, XX, 1 * SIZE + MADD s4, t4, t4, s4 + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, XX, 0 * SIZE + MUL t2, ALPHA, a6 + NOP + MUL t3, ALPHA, a7 + LD a6, XX, 1 * SIZE + MUL t4, ALPHA, a8 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a7, XX, 0 * SIZE + MADD s2, t2, t2, s2 + LD a8, XX, 1 * SIZE + MADD s3, t3, t3, s3 + add.d XX, XX, INCX + MADD s4, t4, t4, s4 + blt $r0, I, .L103 + .align 3 + +.L104: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + .align 3 + +.L105: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L106: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + addi.d I, I, -1 + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MADD s1, t1, t1, s1 + add.d XX, XX, INCX + MADD s2, t2, t2, s2 + blt $r0, I, .L106 + .align 3 + +.L998: + ADD s1, s1, s2 + ADD s3, s3, s4 + ADD s1, s1, s3 + fsqrt.d s1, s1 + move $r4, $r17 + MUL $f0, max, s1 + jirl $r0, $r1, 0x0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zscal.S b/kernel/loongarch64/zscal.S new file mode 100644 index 000000000..fe53ed713 --- /dev/null +++ b/kernel/loongarch64/zscal.S @@ -0,0 +1,356 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define t1 $f14 +#define t2 $f15 +#define t3 $f16 +#define t4 $f17 + + PROLOGUE + + li TEMP, 2 * SIZE + MTC a1, $r0 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, N, .L999 + CMPEQ $fcc0, ALPHA_R, a1 + CMPEQ $fcc1, ALPHA_I, a1 + bceqz $fcc0, .L50 + bceqz $fcc1, .L50 + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + .align 3 + +.L12: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + ST a1, X, 2 * SIZE + ST a1, X, 3 * SIZE + ST a1, X, 4 * SIZE + ST a1, X, 5 * SIZE + ST a1, X, 6 * SIZE + ST a1, X, 7 * SIZE + addi.w I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L16: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: + srai.d I, N, 2 + bge $r0, I, .L25 + .align 3 + +.L22: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L26: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + ST a1, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L26 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L50: + srai.d I, N, 2 + bne INCX, TEMP, .L60 + addi.d I, I, -1 + blt I, $r0, .L55 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + MUL t1, ALPHA_R, a1 + LD a7, X, 6 * SIZE + MUL t2, ALPHA_I, a1 + LD a8, X, 7 * SIZE + MUL t3, ALPHA_R, a3 + MUL t4, ALPHA_I, a3 + bge $r0, I, .L53 + .align 3 + +.L52: + NMSUB t1, a2, ALPHA_I, t1 + LD a1, X, 8 * SIZE + MADD t2, a2, ALPHA_R, t2 + LD a2, X, 9 * SIZE + NMSUB t3, a4, ALPHA_I, t3 + LD a3, X, 10 * SIZE + MADD t4, a4, ALPHA_R, t4 + LD a4, X, 11 * SIZE + ST t1, X, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA_I, a5 + ST t3, X, 2 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA_I, a7 + NMSUB t1, a6, ALPHA_I, t1 + LD a5, X, 12 * SIZE + MADD t2, a6, ALPHA_R, t2 + LD a6, X, 13 * SIZE + NMSUB t3, a8, ALPHA_I, t3 + LD a7, X, 14 * SIZE + MADD t4, a8, ALPHA_R, t4 + LD a8, X, 15 * SIZE + ST t1, X, 4 * SIZE + MUL t1, ALPHA_R, a1 + ST t2, X, 5 * SIZE + MUL t2, ALPHA_I, a1 + ST t3, X, 6 * SIZE + MUL t3, ALPHA_R, a3 + ST t4, X, 7 * SIZE + MUL t4, ALPHA_I, a3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L52 + .align 3 + +.L53: + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + NMSUB t3, a4, ALPHA_I, t3 + MADD t4, a4, ALPHA_R, t4 + ST t1, X, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA_I, a5 + ST t3, X, 2 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA_I, a7 + NMSUB t1, a6, ALPHA_I, t1 + MADD t2, a6, ALPHA_R, t2 + NMSUB t3, a8, ALPHA_I, t3 + MADD t4, a8, ALPHA_R, t4 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + .align 3 + +.L55: + andi I, N, 3 +
bge $r0, I, .L999 + .align 3 +.L56: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + MUL t1, ALPHA_R, a1 + MUL t2, ALPHA_I, a1 + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + addi.d X, X, 2 * SIZE + addi.d I, I, -1 + ST t1, X, -2 * SIZE + ST t2, X, -1 * SIZE + blt $r0, I, .L56 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L60: + srai.d I, N, 2 + move XX, X + addi.d I, I, -1 + blt I, $r0, .L65 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + MUL t1, ALPHA_R, a1 + LD a7, X, 0 * SIZE + MUL t2, ALPHA_I, a1 + LD a8, X, 1 * SIZE + MUL t3, ALPHA_R, a3 + add.d X, X, INCX + MUL t4, ALPHA_I, a3 + bge $r0, I, .L63 + .align 3 + +.L62: + NMSUB t1, a2, ALPHA_I, t1 + LD a1, X, 0 * SIZE + MADD t2, a2, ALPHA_R, t2 + LD a2, X, 1 * SIZE + add.d X, X, INCX + NMSUB t3, a4, ALPHA_I, t3 + LD a3, X, 0 * SIZE + MADD t4, a4, ALPHA_R, t4 + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a5 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a7 + add.d XX, XX, INCX + NMSUB t1, a6, ALPHA_I, t1 + LD a5, X, 0 * SIZE + MADD t2, a6, ALPHA_R, t2 + LD a6, X, 1 * SIZE + add.d X, X, INCX + NMSUB t3, a8, ALPHA_I, t3 + LD a7, X, 0 * SIZE + MADD t4, a8, ALPHA_R, t4 + LD a8, X, 1 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a1 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a1 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a3 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a3 + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L62 + .align 3 + +.L63: + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + NMSUB t3, a4, ALPHA_I, t3 + MADD t4, a4, ALPHA_R, t4 + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a5 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a7 + add.d XX, XX, INCX + NMSUB t1, a6, ALPHA_I, t1 + MADD t2, a6, ALPHA_R, t2 + NMSUB t3, a8, ALPHA_I, t3 + MADD t4, a8, ALPHA_R, t4 + ST t1, XX, 0 * SIZE + ST t2, XX, 1 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + ST t4, XX, 1 * SIZE + add.d XX, XX, INCX + .align 3 + +.L65: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L66: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + MUL t1, ALPHA_R, a1 + MUL t2, ALPHA_I, a1 + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + addi.d I, I, -1 + ST t1, X, 0 * SIZE + ST t2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L66 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/ztrsm_kernel_LT.S b/kernel/loongarch64/ztrsm_kernel_LT.S new file mode 100644 index 000000000..26b1230b8 --- /dev/null +++ b/kernel/loongarch64/ztrsm_kernel_LT.S @@ -0,0 +1,1344 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution.
+3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define KK $r26 +#define TEMP $r27 +#define AORIG $r28 +#define a1 $f22 +#define a2 $f8 +#define a3 $f26 +#define a4 $f27 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f0 +#define c22 $f1 +#define c31 $f2 +#define c32 $f3 +#define c41 $f4 +#define c42 $f5 +#define c51 $f6 +#define c52 $f7 +#define c61 $f18 +#define c62 $f19 +#define c71 $f20 +#define c72 $f21 +#define c81 $f24 +#define c82 $f25 + +#ifndef CONJ +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#define MADD5 MSUB +#define MADD6 MADD +#define MADD7 NMSUB +#define MADD8 MADD +#else +#if defined(LN) || defined(LT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#else +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#define MADD5 MADD +#define MADD6 MSUB +#define MADD7 MADD +#define MADD8 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, ZBASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 2 + nop + bge $r0, J, .L20 +.L10: +#ifdef RT + slli.d TEMP, K, 2 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + move I, M +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT +
move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif + MOV c61, c11 + bge $r0, I, .L19 + .align 3 + +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 + move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE +
MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + 
LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + ADD c51, c51, c62 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 + MUL a1, b2, c52 + MUL a2, b2, c51 + MUL a3, b2, c72 + MUL a4, b2, c71 + MADD5 c51, c51, b1, a1 + MADD6 c52, c52, b1, a2 + MADD5 c71, c71, b1, a3 + MADD6 c72, c72, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + NMSUB c51, c11, b5, c51 + MADD7 c52, c11, b6, c52 + NMSUB c71, c11, b7, c71 + MADD7 c72, c11, b8, c72 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + MADD8 c51, c12, b6, c51 + NMSUB c52, c12, b5, c52 + MADD8 c71, c12, b8, c71 + NMSUB c72, c12, b7, c72 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 + NMSUB c51, c31, b5, c51 + MADD7 c52, c31, b6, c52 + NMSUB c71, c31, b7, c71 + MADD7 c72, c31, b8, c72 + MADD8 c51, c32, b6, c51 + NMSUB c52, c32, b5, c52 + MADD8 c71, c32, b8, c71 + NMSUB c72, c32, b7, c72 + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL a1, b6, c52 + MUL a2, b6, c51 + MADD5 c51, c51, b5, a1 + MADD6 c52, c52, b5, a2 + NMSUB c71, c51, b7, c71 + MADD7 c72, c51, b8, c72 + MADD8 c71, c52, b8, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL a1, b8, c72 + MUL a2, b8, c71 + MADD5 c71, c71, b7, a1 + MADD6 c72, c72, b7, a2 +#endif +#ifdef RT + LD b1, BO, 30 * SIZE + LD b2, BO, 31 * SIZE + LD b3, BO, 28 * SIZE + LD b4, BO, 29 * SIZE + LD b5, BO, 26 * SIZE + LD b6, BO, 27 * SIZE + LD b7, BO, 24 * SIZE + LD b8, BO, 25 * SIZE + MUL a1, b2, c72 + MUL a2, b2, c71 + MADD5 c71, c71, b1, a1 + MADD6 c72, c72, b1, a2 + NMSUB c51, c71, b3, c51 + MADD7 c52, c71, b4, c52 + NMSUB c31, c71, b5, c31 + MADD7 c32, c71, b6, c32 + NMSUB c11, c71, b7, c11 +
MADD7 c12, c71, b8, c12 + MADD8 c51, c72, b4, c51 + NMSUB c52, c72, b3, c52 + MADD8 c31, c72, b6, c31 + NMSUB c32, c72, b5, c32 + MADD8 c11, c72, b8, c11 + NMSUB c12, c72, b7, c12 + LD b3, BO, 20 * SIZE + LD b4, BO, 21 * SIZE + LD b5, BO, 18 * SIZE + LD b6, BO, 19 * SIZE + LD b7, BO, 16 * SIZE + LD b8, BO, 17 * SIZE + MUL a1, b4, c52 + MUL a2, b4, c51 + MADD5 c51, c51, b3, a1 + MADD6 c52, c52, b3, a2 + NMSUB c31, c51, b5, c31 + MADD7 c32, c51, b6, c32 + NMSUB c11, c51, b7, c11 + MADD7 c12, c51, b8, c12 + MADD8 c31, c52, b6, c31 + NMSUB c32, c52, b5, c32 + MADD8 c11, c52, b8, c11 + NMSUB c12, c52, b7, c12 + LD b5, BO, 10 * SIZE + LD b6, BO, 11 * SIZE + LD b7, BO, 8 * SIZE + LD b8, BO, 9 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c52, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c72, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c52, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c72, AO, 7 * SIZE +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE + ST c51, CO3, 0 * SIZE + ST c52, CO3, 1 * SIZE + ST c71, CO4, 0 * SIZE + ST c72, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + MTC c11, $r0 + addi.d I, I, -1 + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#ifdef LN + slli.d TEMP, K, 2 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + blt $r0, J, .L10 + .align 3 + +.L20: + andi J, N, 2 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, 1 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + MTC c11, $r0 + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + move I, M + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, KK, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 + move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AORIG, L +
add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + srai.d L, TEMP, 2 + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 + MOV c42, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE + addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + MADD8 c31, c12, b4, c31 +
NMSUB c32, c12, b3, c32 + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 +#endif +#ifdef RT + LD b5, BO, 6 * SIZE + LD b6, BO, 7 * SIZE + LD b7, BO, 4 * SIZE + LD b8, BO, 5 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif + MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 1 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L30: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif + MTC c11, $r0 + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + move I, M + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, KK, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 + move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + srai.d L, TEMP, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE + MOV c42, c11 + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE +
MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: + ADD c11, c11, c22 + ADD c12, c12, c21 +#if defined(LN) || defined(RT) + addi.d TEMP, KK, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L31 + .align 3 + +.L39: +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/ztrsm_kernel_RT.S b/kernel/loongarch64/ztrsm_kernel_RT.S new file mode 100644 index 000000000..e9f04362d --- /dev/null +++ b/kernel/loongarch64/ztrsm_kernel_RT.S @@ -0,0 +1,1343 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define KK $r26 +#define TEMP $r27 +#define AORIG $r28 +#define a1 $f22 +#define a2 $f8 +#define a3 $f26 +#define a4 $f27 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f0 +#define c22 $f1 +#define c31 $f2 +#define c32 $f3 +#define c41 $f4 +#define c42 $f5 +#define c51 $f6 +#define c52 $f7 +#define c61 $f18 +#define c62 $f19 +#define c71 $f20 +#define c72 $f21 +#define c81 $f24 +#define c82 $f25 + +#ifndef CONJ +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#define MADD5 MSUB +#define MADD6 MADD +#define MADD7 NMSUB +#define MADD8 MADD +#else +#if defined(LN) || defined(LT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#else +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#define MADD5 MADD +#define MADD6 MSUB +#define MADD7 MADD +#define MADD8 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, ZBASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + andi J, N, 1 + bge $r0, J, .L20 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif +MTC c11, $r0 + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + move I, M + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(LT) || 
defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, KK, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + srai.d L, TEMP, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +MOV c42, c11 + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: + ADD c11, c11, c22 + ADD c12, c12, c21 +#if defined(LN) || defined(RT) + addi.d TEMP, KK, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L31 + .align 3 + +.L39: +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, 
KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L20: + andi J, N, 2 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, 1 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif +MTC c11, $r0 + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + move I, M + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, KK, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + srai.d L, TEMP, 2 + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +MOV c42, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + 
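+/* Annotation (hedged): the lines below rewind AO/BO to the current
+   diagonal block -- the offset is scaled by the complex element size
+   (ZBASE_SHIFT) for the packed A column and by twice that for the two
+   packed B columns of this N=2 panel. In the solve that follows, each
+   MUL/MADD5/MADD6 quartet is one complex multiply by a diagonal element,
+   which the TRSM pack routines are assumed to store pre-inverted, i.e.
+   c_r' = c_r*d_r - c_i*d_i and c_i' = c_i*d_r + c_r*d_i. */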
slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 +#endif +#ifdef RT + LD b5, BO, 6 * SIZE + LD b6, BO, 7 * SIZE + LD b7, BO, 4 * SIZE + LD b8, BO, 5 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 1 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L30: + srai.d J, N, 2 +nop + bge $r0, J, .L999 +.L10: +#ifdef RT + slli.d TEMP, K, 2 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + move I, M +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c61, c11 + bge $r0, I, .L19 + .align 3 + +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV 
c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 
c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + ADD c51, c51, c62 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d 
L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 + MUL a1, b2, c52 + MUL a2, b2, c51 + MUL a3, b2, c72 + MUL a4, b2, c71 + MADD5 c51, c51, b1, a1 + MADD6 c52, c52, b1, a2 + MADD5 c71, c71, b1, a3 + MADD6 c72, c72, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + NMSUB c51, c11, b5, c51 + MADD7 c52, c11, b6, c52 + NMSUB c71, c11, b7, c71 + MADD7 c72, c11, b8, c72 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + MADD8 c51, c12, b6, c51 + NMSUB c52, c12, b5, c52 + MADD8 c71, c12, b8, c71 + NMSUB c72, c12, b7, c72 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 + NMSUB c51, c31, b5, c51 + MADD7 c52, c31, b6, c52 + NMSUB c71, c31, b7, c71 + MADD7 c72, c31, b8, c72 + MADD8 c51, c32, b6, c51 + NMSUB c52, c32, b5, c52 + MADD8 c71, c32, b8, c71 + NMSUB c72, c32, b7, c72 + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL a1, b6, c52 + MUL a2, b6, c51 + MADD5 c51, c51, b5, a1 + MADD6 c52, c52, b5, a2 + NMSUB c71, c51, b7, c71 + MADD7 c72, c51, b8, c72 + MADD8 c71, c52, b8, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL a1, b8, c72 + MUL a2, b8, c71 + MADD5 c71, c71, b7, a1 + MADD6 c72, c72, b7, a2 +#endif +#ifdef RT + LD b1, BO, 30 * SIZE + LD b2, BO, 31 * SIZE + LD b3, BO, 28 * SIZE + LD b4, BO, 29 * SIZE + LD b5, BO, 26 * SIZE + LD b6, BO, 27 * SIZE + LD b7, BO, 24 * SIZE + LD b8, BO, 25 * SIZE + MUL a1, b2, c72 + MUL a2, b2, c71 + MADD5 c71, c71, b1, a1 + MADD6 c72, c72, b1, a2 + NMSUB c51, c71, b3, c51 + MADD7 c52, c71, b4, c52 + NMSUB c31, c71, b5, c31 + MADD7 c32, c71, b6, c32 + NMSUB c11, c71, b7, c11 + MADD7 c12, c71, b8, c12 + MADD8 c51, c72, b4, c51 + NMSUB c52, c72, b3, c52 + MADD8 c31, c72, b6, c31 + NMSUB c32, c72, b5, c32 + MADD8 c11, c72, b8, c11 + NMSUB c12, c72, b7, c12 + LD b3, BO, 20 * SIZE + LD b4, BO, 21 * SIZE + LD b5, BO, 18 * SIZE + LD b6, BO, 19 * SIZE + LD b7, BO, 16 * SIZE + LD b8, BO, 17 * SIZE + MUL a1, b4, c52 + MUL a2, b4, c51 + MADD5 c51, c51, b3, a1 + MADD6 c52, c52, b3, a2 + NMSUB c31, c51, b5, c31 + MADD7 c32, c51, b6, c32 + NMSUB 
c11, c51, b7, c11 + MADD7 c12, c51, b8, c12 + MADD8 c31, c52, b6, c31 + NMSUB c32, c52, b5, c32 + MADD8 c11, c52, b8, c11 + NMSUB c12, c52, b7, c12 + LD b5, BO, 10 * SIZE + LD b6, BO, 11 * SIZE + LD b7, BO, 8 * SIZE + LD b8, BO, 9 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c52, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c72, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c52, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c72, AO, 7 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE + addi.d CO3,CO3, -2 * SIZE + addi.d CO4,CO4, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE + ST c51, CO3, 0 * SIZE + ST c52, CO3, 1 * SIZE + ST c71, CO4, 0 * SIZE + ST c72, CO4, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE + addi.d CO3,CO3, 2 * SIZE + addi.d CO4,CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif +MTC c11, $r0 + addi.d I, I, -1 + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#ifdef LN + slli.d TEMP, K, 2 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + blt $r0, J, .L10 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile new file mode 100644 index 000000000..71e5a87cb --- /dev/null +++ b/lapack/laswp/loongarch64/Makefile @@ -0,0 +1,12 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile diff --git a/param.h b/param.h index 965b97466..634e0ef5d 100644 --- a/param.h +++ b/param.h @@ -2691,6 +2691,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
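A note on the param.h hunk that follows: the LOONGSON3R5 block added below wires the new target into OpenBLAS's cache-blocking framework. The *_UNROLL_M/N values have to agree with the register tiling of the kernels added above, while GEMM_DEFAULT_P/Q/R choose the panel sizes the level-3 drivers pack into cache. A hedged sketch of how a Goto-style driver consumes these knobs (illustrative C only, not OpenBLAS source; the function name is invented):

#include <stddef.h>

/* illustrative: tile the GEMM loop nest with the P/Q/R panel sizes */
void gemm_blocked(size_t m, size_t n, size_t k,
                  size_t P, size_t Q, size_t R) {
    for (size_t js = 0; js < n; js += R)            /* GEMM_DEFAULT_R */
        for (size_t ls = 0; ls < k; ls += Q)        /* GEMM_DEFAULT_Q */
            for (size_t is = 0; is < m; is += P) {  /* GEMM_DEFAULT_P */
                /* pack A(is:is+P, ls:ls+Q) and B(ls:ls+Q, js:js+R),
                   then sweep the UNROLL_M x UNROLL_N micro-kernel over
                   the packed panels */
            }
}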
#define SYMV_P 16 #endif +#if defined (LOONGSON3R5) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P dgemm_p +#define QGEMM_DEFAULT_P qgemm_p +#define CGEMM_DEFAULT_P cgemm_p +#define ZGEMM_DEFAULT_P zgemm_p +#define XGEMM_DEFAULT_P xgemm_p + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 128 +#define XGEMM_DEFAULT_Q 128 + +#define SYMV_P 16 +#endif + #if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) #define SNUMOPT 2 #define DNUMOPT 2 From 4d7dfe4845078dbe57afed8bb4181451d8cd3734 Mon Sep 17 00:00:00 2001 From: Craig Watson Date: Tue, 27 Jul 2021 09:00:30 +0000 Subject: [PATCH 345/681] Include Haiku in processor count checks --- driver/others/memory.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 6e654ccf2..39ed264e8 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -428,7 +428,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) int max_num; #endif int blas_goto_num = 0; @@ -436,7 +436,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) max_num = get_num_procs(); #endif @@ -460,7 +460,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -1979,7 +1979,7 @@ extern int 
openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) int max_num; #endif int blas_goto_num = 0; @@ -1987,7 +1987,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) max_num = get_num_procs(); #endif @@ -2011,7 +2011,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif From 02d4a49761f2ed74e0fe6943c3a3759ebed45ea3 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Thu, 15 Jul 2021 04:54:33 -0500 Subject: [PATCH 346/681] Also make sure the `1` is INTEGER*4 for OMP_SET_NUM_THREADS --- lapack-netlib/TESTING/EIG/cchkee.F | 8 +++++--- lapack-netlib/TESTING/EIG/dchkee.F | 5 +++-- lapack-netlib/TESTING/EIG/schkee.F | 5 +++-- lapack-netlib/TESTING/EIG/zchkee.F | 8 +++++--- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cchkee.F b/lapack-netlib/TESTING/EIG/cchkee.F index ab54078a3..ef9f71ec9 100644 --- a/lapack-netlib/TESTING/EIG/cchkee.F +++ b/lapack-netlib/TESTING/EIG/cchkee.F @@ -1076,7 +1076,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - INTEGER*4 N_THREADS + INTEGER*4 N_THREADS, ONE_THREAD REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1873,7 +1873,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL CERRST( 'CST', NOUT ) #if defined(_OPENMP) @@ -2340,7 +2341,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL CERRST( 'CHB', NOUT ) #if defined(_OPENMP) diff --git a/lapack-netlib/TESTING/EIG/dchkee.F b/lapack-netlib/TESTING/EIG/dchkee.F index 6399fecef..89b6958fe 100644 --- a/lapack-netlib/TESTING/EIG/dchkee.F +++ b/lapack-netlib/TESTING/EIG/dchkee.F @@ -1082,7 +1082,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - INTEGER*4 N_THREADS + INTEGER*4 N_THREADS, ONE_THREAD DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. 
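Why the explicitly-typed temporary matters in these hunks: when the LAPACK testers are built with 64-bit default INTEGERs (e.g. gfortran's -fdefault-integer-8 for INTERFACE64 builds), the literal 1 in CALL OMP_SET_NUM_THREADS(1) is passed by reference as an 8-byte integer, while the OpenMP runtime's Fortran-callable entry point is compiled against a 4-byte one. A hedged C sketch of the mismatch (the entry-point name and by-reference convention are assumptions):

#include <stdint.h>
#include <stdio.h>

/* hypothetical stand-in for the runtime's Fortran-callable entry point,
   which takes a pointer to a 32-bit integer */
static void omp_set_num_threads_(const int32_t *n) {
    printf("runtime sees: %d\n", (int)*n);
}

int main(void) {
    int64_t literal_one = 1; /* what `1` becomes under -fdefault-integer-8 */
    int32_t one_thread  = 1; /* the INTEGER*4 ONE_THREAD temporary */

    /* reading 4 of these 8 bytes happens to work on little-endian
       targets but yields 0 on big-endian ones */
    omp_set_num_threads_((const int32_t *)&literal_one);
    omp_set_num_threads_(&one_thread); /* well-defined everywhere */
    return 0;
}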
@@ -1878,7 +1878,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL DERRST( 'DST', NOUT ) #if defined(_OPENMP) diff --git a/lapack-netlib/TESTING/EIG/schkee.F b/lapack-netlib/TESTING/EIG/schkee.F index 5484a7c26..b58433959 100644 --- a/lapack-netlib/TESTING/EIG/schkee.F +++ b/lapack-netlib/TESTING/EIG/schkee.F @@ -1082,7 +1082,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - INTEGER*4 N_THREADS + INTEGER*4 N_THREADS, ONE_THREAD REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1879,7 +1879,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL SERRST( 'SST', NOUT ) #if defined(_OPENMP) diff --git a/lapack-netlib/TESTING/EIG/zchkee.F b/lapack-netlib/TESTING/EIG/zchkee.F index 7e9144d15..fb418a43b 100644 --- a/lapack-netlib/TESTING/EIG/zchkee.F +++ b/lapack-netlib/TESTING/EIG/zchkee.F @@ -1076,7 +1076,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH - INTEGER*4 N_THREADS + INTEGER*4 N_THREADS, ONE_THREAD DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1873,7 +1873,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL ZERRST( 'ZST', NOUT ) #if defined(_OPENMP) @@ -2338,7 +2339,8 @@ IF( TSTERR ) THEN #if defined(_OPENMP) N_THREADS = OMP_GET_MAX_THREADS() - CALL OMP_SET_NUM_THREADS(1) + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) #endif CALL ZERRST( 'ZHB', NOUT ) #if defined(_OPENMP) From 34207bdf5b91373c08fbebf038b43e5b8c9ed7cf Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 30 Jul 2021 18:11:12 +0800 Subject: [PATCH 347/681] Fixed typos about LOONGARCH64 --- Makefile.system | 2 +- common_loongarch64.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 4084390db..13c946ba1 100644 --- a/Makefile.system +++ b/Makefile.system @@ -856,7 +856,7 @@ BINARY_DEFINED = 1 endif ifeq ($(ARCH), loongarch64) -ifeq ($(CORE), LOONGSONG3R5) +ifeq ($(CORE), LOONGSON3R5) CCOMMON_OPT += -march=loongarch64 -mabi=lp64 FCOMMON_OPT += -march=loongarch64 -mabi=lp64 endif diff --git a/common_loongarch64.h b/common_loongarch64.h index 959e7e58a..e15539b5f 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -186,7 +186,7 @@ REALNAME: ;\ #define BUFFER_SIZE ( 32 << 20) -#define PAGESIZE (16UL << 1) +#define PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10) #define HUGE_PAGESIZE ( 2 << 20) From cbc41973fde6137bc42c34de64a41b5a82b597c0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Jul 2021 14:20:12 +0200 Subject: [PATCH 348/681] Disable gfortran tree vectorizer to avoid gcc11+ miscompilation at O3 --- ctest/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 17f29fe69..f785d3f90 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -4,6 +4,9 @@ include_directories(${PROJECT_BINARY_DIR}) enable_language(Fortran) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") +if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") +endif() 
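On the PAGESIZE correction in the LOONGARCH64 typo-fix commit above: 16UL << 1 is only 32 (bytes), whereas the intended 16UL << 10 equals 16 * 1024 = 16384 bytes, i.e. the 16 KB page that FIXED_PAGESIZE in the same header already used. A hypothetical compile-time guard (not part of the patch) that would have caught the slip:

/* hypothetical guard, not in common_loongarch64.h */
_Static_assert((16UL << 10) == 16 * 1024, "PAGESIZE should be 16 KB");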
if(WIN32) FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1 From b4f4ed378b2343b0af8b1235838feef4f6c8c51c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Jul 2021 14:21:08 +0200 Subject: [PATCH 349/681] Disable gfortran tree vectorizer to avoid gcc11+ miscompilation at O3 --- test/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d338242ff..e4ee8b28b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,6 +2,10 @@ include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_BINARY_DIR}) enable_language(Fortran) +if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") +endif() + if (BUILD_SINGLE) list( APPEND OpenBLAS_Tests sblat1 sblat2 sblat3) From e78fbe46541dedcf39eb0362e69b1de6f7808642 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Jul 2021 14:44:54 +0200 Subject: [PATCH 350/681] Disable gfortran tree vectorizer to avoid gcc11+ miscompilation at O3 --- ctest/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ctest/Makefile b/ctest/Makefile index 15c83a907..c5e1094da 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -6,6 +6,9 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system override CFLAGS += -DADD$(BU) -DCBLAS +ifeq ($(F_COMPILER),GFORTRAN) + override FFLAGS += -fno-tree-vectorize +endif override TARGET_ARCH= override TARGET_MACH= From 5dc6aa74f05cc6c4405be195461fa5afc2c03888 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Jul 2021 14:46:19 +0200 Subject: [PATCH 351/681] Disable gfortran tree vectorizer to avoid gcc11+ miscompilation at O3 --- test/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/Makefile b/test/Makefile index 6c5f041c2..923f1537c 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,6 +1,8 @@ TOPDIR = .. 
include ../Makefile.system - +ifeq ($(F_COMPILER),GFORTRAN) + override FFLAGS += -fno-tree-vectorize +endif ifeq ($(NOFORTRAN),1) all :: From f2a7a67f5afa31e1e8839e5a386773e45bb5a687 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 31 Jul 2021 17:23:40 +0200 Subject: [PATCH 352/681] Improve the "tried to allocate too many buffers" error message --- driver/others/memory.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 39ed264e8..f0521ab2d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -222,11 +222,11 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; - -#if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; int ret; + +#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) @@ -428,7 +428,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -436,7 +436,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif @@ -460,7 +460,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -1241,7 +1241,7 @@ UNLOCK_COMMAND(&alloc_lock); func = &memoryalloc[0]; - while ((*func != NULL) && (map_address == (void *) -1)) { + while ((func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); @@ -1619,12 +1619,10 @@ static int on_process_term(void) #else #pragma data_seg(".CRT$XLB") #endif - +static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #ifdef _WIN64 -static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma const_seg() #else -static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma data_seg() #endif @@ -1633,12 +1631,10 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI #else #pragma data_seg(".CRT$XTU") #endif - +static int(*p_process_term)(void) = on_process_term; #ifdef _WIN64 -static const 
int(*p_process_term)(void) = on_process_term; #pragma const_seg() #else -static int(*p_process_term)(void) = on_process_term; #pragma data_seg() #endif #endif @@ -1672,23 +1668,16 @@ void gotoblas_dummy_for_PGI(void) { #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 #endif -#elif !defined(OS_EMBEDDED) -#define ALLOC_MMAP -#define ALLOC_MALLOC #else +#define ALLOC_MMAP #define ALLOC_MALLOC - -inline int puts(const char *str) { return 0; } -inline int printf(const char *format, ...) { return 0; } -inline char *getenv(const char *name) { return ""; } -inline int atoi(const char *str) { return 0; } #endif #include #include #include -#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED) +#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) #include #ifndef NO_SYSV_IPC #include @@ -1702,6 +1691,7 @@ inline int atoi(const char *str) { return 0; } #include #include #include +#include #include #include #include @@ -1979,7 +1969,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -1987,7 +1977,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif @@ -2011,7 +2001,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif From 0b8f7c8c10957aa1d7836cb8ae55337d180d5a75 Mon Sep 17 00:00:00 2001 From: gxw Date: Mon, 2 Aug 2021 10:00:41 +0800 Subject: [PATCH 353/681] Add cmake support for LOONGARCH64 --- cmake/arch.cmake | 4 ++++ cmake/cc.cmake | 9 +++++++++ cmake/fc.cmake | 7 +++++++ cmake/system_check.cmake | 4 +++- kernel/loongarch64/KERNEL | 2 ++ 5 files changed, 25 insertions(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 154e59db6..57ee5a4fb 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -113,6 +113,10 @@ if (MIPS64) set(NO_BINARY_MODE 1) endif () +if (LOONGARCH64) + set(NO_BINARY_MODE 1) +endif () + if (${ARCH} STREQUAL "alpha") set(NO_BINARY_MODE 1) set(BINARY_DEFINED 1) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index ac5e455d5..1794b5e5b 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -29,6 +29,15 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR 
${CMAKE_C_COMPILER_ID} STREQUAL "LS set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") endif () + if (LOONGARCH64) + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") + endif () + set(BINARY_DEFINED 1) + endif () + if (CMAKE_SYSTEM_NAME STREQUAL "AIX") set(BINARY_DEFINED 1) endif () diff --git a/cmake/fc.cmake b/cmake/fc.cmake index fc1f9bb22..631664569 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -61,6 +61,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") endif () endif () + if (LOONGARCH64) + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") + endif () + endif () else () if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m64") diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index fdc79c8ce..8d0558c0e 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -38,6 +38,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") set(PPC 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") + set(LOONGARCH64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") if (NOT BINARY) if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") @@ -95,7 +97,7 @@ else() endif () if (NOT BINARY) - if (X86_64 OR ARM64 OR PPC OR MIPS64) + if (X86_64 OR ARM64 OR PPC OR MIPS64 OR LOONGARCH64) set(BINARY 64) else () set(BINARY 32) diff --git a/kernel/loongarch64/KERNEL b/kernel/loongarch64/KERNEL index e96a90e72..1c11df9b6 100644 --- a/kernel/loongarch64/KERNEL +++ b/kernel/loongarch64/KERNEL @@ -234,3 +234,5 @@ endif ifndef ZGEMM3MKERNEL ZGEMM3MKERNEL = zgemm3m_kernel.S endif + +DSDOTKERNEL = dot.S From 0a2077901cf94877f6173f6b580762b68b2fd2e0 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Tue, 28 Apr 2020 19:01:36 +0800 Subject: [PATCH 354/681] Add small matrix optimization kernel interface. make SMALL_MATRIX_OPT=1 --- Makefile.system | 5 ++ common_d.h | 6 ++ common_level3.h | 12 ++++ common_macro.h | 16 +++++ common_s.h | 5 ++ interface/gemm.c | 28 +++++++- kernel/Makefile.L3 | 73 ++++++++++++++++++++ kernel/generic/gemm_small_matrix_kernel_nn.c | 49 +++++++++++++ kernel/generic/gemm_small_matrix_kernel_nt.c | 49 +++++++++++++ kernel/generic/gemm_small_matrix_kernel_tn.c | 49 +++++++++++++ kernel/generic/gemm_small_matrix_kernel_tt.c | 49 +++++++++++++ 11 files changed, 340 insertions(+), 1 deletion(-) create mode 100644 kernel/generic/gemm_small_matrix_kernel_nn.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_nt.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_tn.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_tt.c diff --git a/Makefile.system b/Makefile.system index 13c946ba1..20d8d2f2a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -244,6 +244,11 @@ else ONLY_CBLAS = 0 endif +#For small matrix optimization +ifeq ($(SMALL_MATRIX_OPT), 1) +CCOMMON_OPT += -DSMALL_MATRIX_OPT +endif + # This operation is expensive, so execution should be once.
ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 diff --git a/common_d.h b/common_d.h index 94dc3eea8..dad304a5f 100644 --- a/common_d.h +++ b/common_d.h @@ -157,6 +157,12 @@ #define DIMATCOPY_K_RT dimatcopy_k_rt #define DGEADD_K dgeadd_k + +#define DGEMM_SMALL_KERNEL_NN dgemm_small_kernel_nn +#define DGEMM_SMALL_KERNEL_NT dgemm_small_kernel_nt +#define DGEMM_SMALL_KERNEL_TN dgemm_small_kernel_tn +#define DGEMM_SMALL_KERNEL_TT dgemm_small_kernel_tt + #else #define DAMAX_K gotoblas -> damax_k diff --git a/common_level3.h b/common_level3.h index c4f9435a9..751592b67 100644 --- a/common_level3.h +++ b/common_level3.h @@ -515,6 +515,18 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); #endif +#ifdef SMALL_MATRIX_OPT +int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + +int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +#endif + int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 0136f18ab..eb2abcdc0 100644 --- a/common_macro.h +++ b/common_macro.h @@ -644,6 +644,11 @@ #define GEADD_K DGEADD_K +#define GEMM_SMALL_KERNEL_NN DGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT + #elif defined(BFLOAT16) #define D_TO_BF16_K SBDTOBF16_K @@ -931,6 +936,11 @@ #define GEADD_K SGEADD_K +#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT + #endif #else @@ -1236,6 +1246,12 @@ #define IMATCOPY_K_RT SIMATCOPY_K_RT #define GEADD_K SGEADD_K + +#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT + #endif #else #ifdef XDOUBLE diff --git 
a/common_s.h b/common_s.h index 34903ec49..6ad98ba8b 100644 --- a/common_s.h +++ b/common_s.h @@ -164,6 +164,11 @@ #define SGEADD_K sgeadd_k + +#define SGEMM_SMALL_KERNEL_NN sgemm_small_kernel_nn +#define SGEMM_SMALL_KERNEL_NT sgemm_small_kernel_nt +#define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn +#define SGEMM_SMALL_KERNEL_TT sgemm_small_kernel_tt + #else #define SAMAX_K gotoblas -> samax_k diff --git a/interface/gemm.c b/interface/gemm.c index 10426fd8f..d2fb42ff7 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -105,6 +105,18 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #endif }; +#ifdef SMALL_MATRIX_OPT +//Only support s/dgemm small matrix optimization so far. +static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { +#ifndef GEMM3M +#ifndef COMPLEX + GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, NULL, NULL, + GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, NULL, NULL, +#endif +#endif +}; +#endif + #ifndef CBLAS void NAME(char *TRANSA, char *TRANSB, @@ -417,6 +429,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); + MNK = (double) args.m * (double) args.n * (double) args.k; + +#ifdef SMALL_MATRIX_OPT +#if !defined(COMPLEX) + //need to tune small matrices cases. + if(MNK <= 100.0*100.0*100.0){ + (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, + args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); + return; + } +#endif +#endif + + buffer = (XFLOAT *)blas_memory_alloc(0); sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); @@ -428,7 +454,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS mode |= (transb << BLAS_TRANSB_SHIFT); #endif - MNK = (double) args.m * (double) args.n * (double) args.k; + if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) args.nthreads = 1; else diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 2d9e3ec36..88e5eb2d6 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -447,6 +447,19 @@ XBLASOBJS += \ endif +###### BLAS small matrix optimization ##### +ifeq ($(SMALL_MATRIX_OPT), 1) + +SBLASOBJS += \ + sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) + +DBLASOBJS += \ + dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) + +endif + ###### BLAS extensions ##### ifeq ($(BUILD_SINGLE),1) @@ -4237,3 +4250,63 @@ endif $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@ + + +###### BLAS small matrix optimization ##### + +ifndef DGEMM_SAMLL_K_NN +DGEMM_SAMLL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef DGEMM_SAMLL_K_NT +DGEMM_SAMLL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef DGEMM_SAMLL_K_TN +DGEMM_SAMLL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef DGEMM_SAMLL_K_TT +DGEMM_SAMLL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o 
$@ + +$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + + +ifndef SGEMM_SAMLL_K_NN +SGEMM_SAMLL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SGEMM_SAMLL_K_NT +SGEMM_SAMLL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SGEMM_SAMLL_K_TN +SGEMM_SAMLL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SGEMM_SAMLL_K_TT +SGEMM_SAMLL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ diff --git a/kernel/generic/gemm_small_matrix_kernel_nn.c b/kernel/generic/gemm_small_matrix_kernel_nn.c new file mode 100644 index 000000000..efcc27cba --- /dev/null +++ b/kernel/generic/gemm_small_matrix_kernel_nn.c @@ -0,0 +1,49 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +{ + //naive implementation + //Column major + + BLASLONG i,j,k; + FLOAT result=0.0; + + for(i=0; i<M; i++) + { + for(j=0; j<N; j++) + { + result=0.0; + for(k=0; k<K; k++) + { + result += A[i + k * lda] * B[k + j * ldb]; + } + C[i + j * ldc] = alpha * result + beta * C[i + j * ldc]; + } + } + + return 0; +} From: Xianyi Zhang Date: Tue, 28 Apr 2020 22:35:36 +0800 Subject: [PATCH 355/681] Add alpha=1.0 beta=0.0 for small gemm.
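When beta is zero, C must be treated as write-only: BLAS semantics allow it to hold uninitialized data, so multiplying it by beta could drag garbage (or NaNs) into the result, and at these sizes the extra load, multiply and add per element are also measurable. Hence this second kernel family, specialized for alpha == 1.0 and beta == 0.0 and checked in interface/gemm.c before the general small-matrix kernels. As a usage sketch (hypothetical sizes, assuming a build with SMALL_MATRIX_OPT=1), a call like the following is served entirely by the a1b0 path and returns before any packing buffer is allocated:

    #include <cblas.h>

    int main(void)
    {
        /* 16*16*16 is far below the current MNK threshold of
         * 100.0*100.0*100.0, so the dispatch in interface/gemm.c
         * takes the small-matrix branch. */
        float A[16 * 16] = {0}, B[16 * 16] = {0}, C[16 * 16];
        cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                    16, 16, 16,
                    1.0f,          /* alpha == 1.0 ... */
                    A, 16, B, 16,
                    0.0f,          /* ... and beta == 0.0: a1b0 kernel */
                    C, 16);
        return 0;
    }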
--- common_d.h | 5 ++ common_level3.h | 11 ++++ common_macro.h | 14 ++++ common_s.h | 5 ++ interface/gemm.c | 18 +++++- kernel/Makefile.L3 | 64 ++++++++++++++++++- .../gemm_small_matrix_kernel_a1b0_nn.c | 49 ++++++++++++++ .../gemm_small_matrix_kernel_a1b0_nt.c | 49 ++++++++++++++ .../gemm_small_matrix_kernel_a1b0_tn.c | 49 ++++++++++++++ .../gemm_small_matrix_kernel_a1b0_tt.c | 49 ++++++++++++++ 10 files changed, 309 insertions(+), 4 deletions(-) create mode 100644 kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_a1b0_nt.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_a1b0_tn.c create mode 100644 kernel/generic/gemm_small_matrix_kernel_a1b0_tt.c diff --git a/common_d.h b/common_d.h index dad304a5f..f5d7935fa 100644 --- a/common_d.h +++ b/common_d.h @@ -163,6 +163,11 @@ #define DGEMM_SMALL_KERNEL_TN dgemm_small_kernel_tn #define DGEMM_SMALL_KERNEL_TT dgemm_small_kernel_tt +#define DGEMM_SMALL_KERNEL_A1B0_NN dgemm_small_kernel_a1b0_nn +#define DGEMM_SMALL_KERNEL_A1B0_NT dgemm_small_kernel_a1b0_nt +#define DGEMM_SMALL_KERNEL_A1B0_TN dgemm_small_kernel_a1b0_tn +#define DGEMM_SMALL_KERNEL_A1B0_TT dgemm_small_kernel_a1b0_tt + #else #define DAMAX_K gotoblas -> damax_k diff --git a/common_level3.h b/common_level3.h index 751592b67..31d514cd5 100644 --- a/common_level3.h +++ b/common_level3.h @@ -525,6 +525,17 @@ int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLO int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + +int sgemm_small_kernel_a1b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_a1b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_a1b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_a1b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int dgemm_small_kernel_a1b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_a1b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_a1b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_a1b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + #endif int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index eb2abcdc0..2f7263023 100644 --- a/common_macro.h +++ b/common_macro.h @@ -648,6 +648,10 @@ #define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT 
DGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_A1B0_NN DGEMM_SMALL_KERNEL_A1B0_NN +#define GEMM_SMALL_KERNEL_A1B0_NT DGEMM_SMALL_KERNEL_A1B0_NT +#define GEMM_SMALL_KERNEL_A1B0_TN DGEMM_SMALL_KERNEL_A1B0_TN +#define GEMM_SMALL_KERNEL_A1B0_TT DGEMM_SMALL_KERNEL_A1B0_TT #elif defined(BFLOAT16) @@ -941,6 +945,11 @@ #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_A1B0_NN SGEMM_SMALL_KERNEL_A1B0_NN +#define GEMM_SMALL_KERNEL_A1B0_NT SGEMM_SMALL_KERNEL_A1B0_NT +#define GEMM_SMALL_KERNEL_A1B0_TN SGEMM_SMALL_KERNEL_A1B0_TN +#define GEMM_SMALL_KERNEL_A1B0_TT SGEMM_SMALL_KERNEL_A1B0_TT + #endif #else @@ -1252,6 +1261,11 @@ #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_A1B0_NN SGEMM_SMALL_KERNEL_A1B0_NN +#define GEMM_SMALL_KERNEL_A1B0_NT SGEMM_SMALL_KERNEL_A1B0_NT +#define GEMM_SMALL_KERNEL_A1B0_TN SGEMM_SMALL_KERNEL_A1B0_TN +#define GEMM_SMALL_KERNEL_A1B0_TT SGEMM_SMALL_KERNEL_A1B0_TT + #endif #else #ifdef XDOUBLE diff --git a/common_s.h b/common_s.h index 6ad98ba8b..440b78723 100644 --- a/common_s.h +++ b/common_s.h @@ -169,6 +169,11 @@ #define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn #define SGEMM_SMALL_KERNEL_TT sgemm_small_kernel_tt +#define SGEMM_SMALL_KERNEL_A1B0_NN sgemm_small_kernel_a1b0_nn +#define SGEMM_SMALL_KERNEL_A1B0_NT sgemm_small_kernel_a1b0_nt +#define SGEMM_SMALL_KERNEL_A1B0_TN sgemm_small_kernel_a1b0_tn +#define SGEMM_SMALL_KERNEL_A1B0_TT sgemm_small_kernel_a1b0_tt + #else #define SAMAX_K gotoblas -> samax_k diff --git a/interface/gemm.c b/interface/gemm.c index d2fb42ff7..da602f7a9 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -115,6 +115,15 @@ static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLON #endif #endif }; + +static int (*gemm_small_kernel_a1b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { +#ifndef GEMM3M +#ifndef COMPLEX + GEMM_SMALL_KERNEL_A1B0_NN, GEMM_SMALL_KERNEL_A1B0_TN, NULL, NULL, + GEMM_SMALL_KERNEL_A1B0_NT, GEMM_SMALL_KERNEL_A1B0_TT, NULL, NULL, +#endif +#endif +}; #endif #ifndef CBLAS @@ -435,8 +444,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #if !defined(COMPLEX) //need to tune small matrices cases. 
if(MNK <= 100.0*100.0*100.0){ - (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, - args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); + + if(*(FLOAT *)(args.alpha) == 1.0 && *(FLOAT *)(args.beta) == 0.0){ + (gemm_small_kernel_a1b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda,args.b, args.ldb, args.c, args.ldc); + }else{ + (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); + } + return; } #endif diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 88e5eb2d6..448d22e4e 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -452,11 +452,15 @@ ifeq ($(SMALL_MATRIX_OPT), 1) SBLASOBJS += \ sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) + sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) + dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) endif @@ -4282,6 +4286,34 @@ $(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_ $(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +ifndef DGEMM_SAMLL_K_A1B0_NN +DGEMM_SAMLL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +endif + +ifndef DGEMM_SAMLL_K_A1B0_NT +DGEMM_SAMLL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +endif + +ifndef DGEMM_SAMLL_K_A1B0_TN +DGEMM_SAMLL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +endif + +ifndef DGEMM_SAMLL_K_A1B0_TT +DGEMM_SAMLL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +endif + +$(KDIR)dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + ifndef SGEMM_SAMLL_K_NN SGEMM_SAMLL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c @@ -4310,3 +4342,31 @@ $(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_ $(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifndef SGEMM_SAMLL_K_A1B0_NN +SGEMM_SAMLL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +endif + +ifndef SGEMM_SAMLL_K_A1B0_NT +SGEMM_SAMLL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +endif + +ifndef 
SGEMM_SAMLL_K_A1B0_TN +SGEMM_SAMLL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +endif + +ifndef SGEMM_SAMLL_K_A1B0_TT +SGEMM_SAMLL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +endif + +$(KDIR)sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ diff --git a/kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c b/kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c new file mode 100644 index 000000000..8e3417027 --- /dev/null +++ b/kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c @@ -0,0 +1,49 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) +{ + //naive implementation + //Column major + + BLASLONG i,j,k; + FLOAT result=0.0; + + for(i=0; i<M; i++) + { + for(j=0; j<N; j++) + { + result=0.0; + for(k=0; k<K; k++) + { + result += A[i + k * lda] * B[k + j * ldb]; + } + C[i + j * ldc] = result; + } + } + + return 0; +} From: Xianyi Zhang Date: Tue, 28 Apr 2020 23:15:20 +0800 Subject: [PATCH 356/681] Fix gemm interface bug for small matrix.
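The bug: MNK is declared only under #ifdef SMP, while the previous commits assign and test it unconditionally, so any build without SMP (in particular a serial build with SMALL_MATRIX_OPT=1) fails to compile. The fix widens the guards so that MNK exists whenever either feature needs it. Condensed shape of the change (a sketch of interface/gemm.c, not the full function; elisions marked with ...):

    #if defined(SMP) || defined(SMALL_MATRIX_OPT)
        double MNK;
    #endif
        ...
    #if defined(SMP) || defined(SMALL_MATRIX_OPT)
        MNK = (double) args.m * (double) args.n * (double) args.k;
    #endif

    #ifdef SMALL_MATRIX_OPT
        if (MNK <= 100.0 * 100.0 * 100.0) { ... }  /* small-matrix path */
    #endif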
--- interface/gemm.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index da602f7a9..4f1bbfd1c 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -145,7 +145,7 @@ void NAME(char *TRANSA, char *TRANSB, IFLOAT *buffer; IFLOAT *sa, *sb; -#ifdef SMP +#if defined (SMP) || defined(SMALL_MATRIX_OPT) double MNK; #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX @@ -269,8 +269,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS XFLOAT *buffer; XFLOAT *sa, *sb; -#ifdef SMP +#if defined (SMP) || defined(SMALL_MATRIX_OPT) double MNK; +#endif + +#ifdef SMP #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX #ifdef XDOUBLE @@ -438,7 +441,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); +#if defined(SMP) || defined(SMALL_MATRIX_OPT) MNK = (double) args.m * (double) args.n * (double) args.k; +#endif #ifdef SMALL_MATRIX_OPT #if !defined(COMPLEX) From 59cb5de46b89a080d1190e89bed543fd32f924c7 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 29 Apr 2020 00:19:19 +0800 Subject: [PATCH 357/681] Refs #2587 Fix typos. --- kernel/Makefile.L3 | 96 +++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 448d22e4e..6476334e9 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -4258,115 +4258,115 @@ $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) ###### BLAS small matrix optimization ##### -ifndef DGEMM_SAMLL_K_NN -DGEMM_SAMLL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +ifndef DGEMM_SMALL_K_NN +DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c endif -ifndef DGEMM_SAMLL_K_NT -DGEMM_SAMLL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +ifndef DGEMM_SMALL_K_NT +DGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c endif -ifndef DGEMM_SAMLL_K_TN -DGEMM_SAMLL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +ifndef DGEMM_SMALL_K_TN +DGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c endif -ifndef DGEMM_SAMLL_K_TT -DGEMM_SAMLL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +ifndef DGEMM_SMALL_K_TT +DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c endif -$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_NN) +$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_NT) +$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TN) +$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TT) +$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -ifndef DGEMM_SAMLL_K_A1B0_NN -DGEMM_SAMLL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +ifndef DGEMM_SMALL_K_A1B0_NN +DGEMM_SMALL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c endif -ifndef DGEMM_SAMLL_K_A1B0_NT -DGEMM_SAMLL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +ifndef 
DGEMM_SMALL_K_A1B0_NT +DGEMM_SMALL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c endif -ifndef DGEMM_SAMLL_K_A1B0_TN -DGEMM_SAMLL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +ifndef DGEMM_SMALL_K_A1B0_TN +DGEMM_SMALL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c endif -ifndef DGEMM_SAMLL_K_A1B0_TT -DGEMM_SAMLL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +ifndef DGEMM_SMALL_K_A1B0_TT +DGEMM_SMALL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c endif -$(KDIR)dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_NN) +$(KDIR)dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_NN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_NT) +$(KDIR)dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_NT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_TN) +$(KDIR)dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_TN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_A1B0_TT) +$(KDIR)dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -ifndef SGEMM_SAMLL_K_NN -SGEMM_SAMLL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +ifndef SGEMM_SMALL_K_NN +SGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c endif -ifndef SGEMM_SAMLL_K_NT -SGEMM_SAMLL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +ifndef SGEMM_SMALL_K_NT +SGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c endif -ifndef SGEMM_SAMLL_K_TN -SGEMM_SAMLL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +ifndef SGEMM_SMALL_K_TN +SGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c endif -ifndef SGEMM_SAMLL_K_TT -SGEMM_SAMLL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +ifndef SGEMM_SMALL_K_TT +SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c endif -$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_NN) +$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_NT) +$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TN) +$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TT) +$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -ifndef SGEMM_SAMLL_K_A1B0_NN -SGEMM_SAMLL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +ifndef SGEMM_SMALL_K_A1B0_NN +SGEMM_SMALL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c endif -ifndef SGEMM_SAMLL_K_A1B0_NT -SGEMM_SAMLL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +ifndef SGEMM_SMALL_K_A1B0_NT +SGEMM_SMALL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c endif -ifndef SGEMM_SAMLL_K_A1B0_TN -SGEMM_SAMLL_K_A1B0_TN = 
../generic/gemm_small_matrix_kernel_a1b0_tn.c +ifndef SGEMM_SMALL_K_A1B0_TN +SGEMM_SMALL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c endif -ifndef SGEMM_SAMLL_K_A1B0_TT -SGEMM_SAMLL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +ifndef SGEMM_SMALL_K_A1B0_TT +SGEMM_SMALL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c endif -$(KDIR)sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_NN) +$(KDIR)sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_NN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_NT) +$(KDIR)sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_NT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_TN) +$(KDIR)sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_TN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_A1B0_TT) +$(KDIR)sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ From 17d32a4a8271141be2fb96c8c767ac1ed2e60a36 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 28 Aug 2020 07:55:27 +0800 Subject: [PATCH 358/681] Change a1b0 gemm to b0 gemm. --- common_d.h | 8 +-- common_level3.h | 18 +++--- common_macro.h | 24 ++++---- common_s.h | 8 +-- interface/gemm.c | 10 ++-- kernel/Makefile.L3 | 56 +++++++++---------- ..._nn.c => gemm_small_matrix_kernel_b0_nn.c} | 4 +- ..._nt.c => gemm_small_matrix_kernel_b0_nt.c} | 4 +- ..._tn.c => gemm_small_matrix_kernel_b0_tn.c} | 4 +- ..._tt.c => gemm_small_matrix_kernel_b0_tt.c} | 4 +- 10 files changed, 70 insertions(+), 70 deletions(-) rename kernel/generic/{gemm_small_matrix_kernel_a1b0_nn.c => gemm_small_matrix_kernel_b0_nn.c} (95%) rename kernel/generic/{gemm_small_matrix_kernel_a1b0_nt.c => gemm_small_matrix_kernel_b0_nt.c} (95%) rename kernel/generic/{gemm_small_matrix_kernel_a1b0_tn.c => gemm_small_matrix_kernel_b0_tn.c} (95%) rename kernel/generic/{gemm_small_matrix_kernel_a1b0_tt.c => gemm_small_matrix_kernel_b0_tt.c} (95%) diff --git a/common_d.h b/common_d.h index f5d7935fa..42c14e828 100644 --- a/common_d.h +++ b/common_d.h @@ -163,10 +163,10 @@ #define DGEMM_SMALL_KERNEL_TN dgemm_small_kernel_tn #define DGEMM_SMALL_KERNEL_TT dgemm_small_kernel_tt -#define DGEMM_SMALL_KERNEL_A1B0_NN dgemm_small_kernel_a1b0_nn -#define DGEMM_SMALL_KERNEL_A1B0_NT dgemm_small_kernel_a1b0_nt -#define DGEMM_SMALL_KERNEL_A1B0_TN dgemm_small_kernel_a1b0_tn -#define DGEMM_SMALL_KERNEL_A1B0_TT dgemm_small_kernel_a1b0_tt +#define DGEMM_SMALL_KERNEL_B0_NN dgemm_small_kernel_b0_nn +#define DGEMM_SMALL_KERNEL_B0_NT dgemm_small_kernel_b0_nt +#define DGEMM_SMALL_KERNEL_B0_TN dgemm_small_kernel_b0_tn +#define DGEMM_SMALL_KERNEL_B0_TT dgemm_small_kernel_b0_tt #else diff --git a/common_level3.h b/common_level3.h index 31d514cd5..7be7ab06b 100644 --- a/common_level3.h +++ b/common_level3.h @@ -526,15 +526,15 @@ int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLO int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, 
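Of the two scalar special cases, only beta == 0 actually changes the kernel contract, because C becomes write-only and must not be read; alpha == 1 merely saves one multiply per output element. Passing alpha back into the specialized kernels therefore keeps the useful property while covering every beta == 0 call, whatever alpha is. After this change the interface selects on beta alone, as in this condensed excerpt of the interface/gemm.c hunk below:

    if (*(FLOAT *)(args.beta) == 0.0) {
        (gemm_small_kernel_b0[(transb << 2) | transa])
            (args.m, args.n, args.k, args.a, args.lda,
             *(FLOAT *)(args.alpha), args.b, args.ldb,
             args.c, args.ldc);
    } else {
        (gemm_small_kernel[(transb << 2) | transa])
            (args.m, args.n, args.k, args.a, args.lda,
             *(FLOAT *)(args.alpha), args.b, args.ldb,
             *(FLOAT *)(args.beta), args.c, args.ldc);
    }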
double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); -int sgemm_small_kernel_a1b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int sgemm_small_kernel_a1b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int sgemm_small_kernel_a1b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int sgemm_small_kernel_a1b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * B, BLASLONG ldb, float * C, BLASLONG ldc); - -int dgemm_small_kernel_a1b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int dgemm_small_kernel_a1b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int dgemm_small_kernel_a1b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int dgemm_small_kernel_a1b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int sgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int dgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); #endif diff --git a/common_macro.h b/common_macro.h index 2f7263023..fa7884180 100644 --- a/common_macro.h +++ b/common_macro.h @@ -648,10 +648,10 @@ #define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT -#define GEMM_SMALL_KERNEL_A1B0_NN DGEMM_SMALL_KERNEL_A1B0_NN -#define GEMM_SMALL_KERNEL_A1B0_NT DGEMM_SMALL_KERNEL_A1B0_NT -#define GEMM_SMALL_KERNEL_A1B0_TN DGEMM_SMALL_KERNEL_A1B0_TN -#define GEMM_SMALL_KERNEL_A1B0_TT DGEMM_SMALL_KERNEL_A1B0_TT +#define GEMM_SMALL_KERNEL_B0_NN DGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT DGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN DGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT DGEMM_SMALL_KERNEL_B0_TT #elif defined(BFLOAT16) @@ -945,10 +945,10 @@ #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT -#define GEMM_SMALL_KERNEL_A1B0_NN SGEMM_SMALL_KERNEL_A1B0_NN -#define GEMM_SMALL_KERNEL_A1B0_NT SGEMM_SMALL_KERNEL_A1B0_NT -#define 
GEMM_SMALL_KERNEL_A1B0_TN SGEMM_SMALL_KERNEL_A1B0_TN -#define GEMM_SMALL_KERNEL_A1B0_TT SGEMM_SMALL_KERNEL_A1B0_TT +#define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT #endif @@ -1261,10 +1261,10 @@ #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN #define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT -#define GEMM_SMALL_KERNEL_A1B0_NN SGEMM_SMALL_KERNEL_A1B0_NN -#define GEMM_SMALL_KERNEL_A1B0_NT SGEMM_SMALL_KERNEL_A1B0_NT -#define GEMM_SMALL_KERNEL_A1B0_TN SGEMM_SMALL_KERNEL_A1B0_TN -#define GEMM_SMALL_KERNEL_A1B0_TT SGEMM_SMALL_KERNEL_A1B0_TT +#define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT #endif #else diff --git a/common_s.h b/common_s.h index 440b78723..685d73062 100644 --- a/common_s.h +++ b/common_s.h @@ -169,10 +169,10 @@ #define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn #define SGEMM_SMALL_KERNEL_TT sgemm_small_kernel_tt -#define SGEMM_SMALL_KERNEL_A1B0_NN sgemm_small_kernel_a1b0_nn -#define SGEMM_SMALL_KERNEL_A1B0_NT sgemm_small_kernel_a1b0_nt -#define SGEMM_SMALL_KERNEL_A1B0_TN sgemm_small_kernel_a1b0_tn -#define SGEMM_SMALL_KERNEL_A1B0_TT sgemm_small_kernel_a1b0_tt +#define SGEMM_SMALL_KERNEL_B0_NN sgemm_small_kernel_b0_nn +#define SGEMM_SMALL_KERNEL_B0_NT sgemm_small_kernel_b0_nt +#define SGEMM_SMALL_KERNEL_B0_TN sgemm_small_kernel_b0_tn +#define SGEMM_SMALL_KERNEL_B0_TT sgemm_small_kernel_b0_tt #else diff --git a/interface/gemm.c b/interface/gemm.c index 4f1bbfd1c..3730f37fa 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -116,11 +116,11 @@ static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLON #endif }; -static int (*gemm_small_kernel_a1b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { +static int (*gemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { #ifndef GEMM3M #ifndef COMPLEX - GEMM_SMALL_KERNEL_A1B0_NN, GEMM_SMALL_KERNEL_A1B0_TN, NULL, NULL, - GEMM_SMALL_KERNEL_A1B0_NT, GEMM_SMALL_KERNEL_A1B0_TT, NULL, NULL, + GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, NULL, NULL, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, NULL, NULL, #endif #endif }; @@ -450,8 +450,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS //need to tune small matrices cases. 
if(MNK <= 100.0*100.0*100.0){ - if(*(FLOAT *)(args.alpha) == 1.0 && *(FLOAT *)(args.beta) == 0.0){ - (gemm_small_kernel_a1b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda,args.b, args.ldb, args.c, args.ldc); + if(*(FLOAT *)(args.beta) == 0.0){ + (gemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); }else{ (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); } diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 6476334e9..c9544086a 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -453,14 +453,14 @@ ifeq ($(SMALL_MATRIX_OPT), 1) SBLASOBJS += \ sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) \ - sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) + sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) \ - dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) + dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) endif @@ -4286,32 +4286,32 @@ $(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_ $(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -ifndef DGEMM_SMALL_K_A1B0_NN -DGEMM_SMALL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +ifndef DGEMM_SMALL_K_B0_NN +DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c endif -ifndef DGEMM_SMALL_K_A1B0_NT -DGEMM_SMALL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +ifndef DGEMM_SMALL_K_B0_NT +DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c endif -ifndef DGEMM_SMALL_K_A1B0_TN -DGEMM_SMALL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +ifndef DGEMM_SMALL_K_B0_TN +DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c endif -ifndef DGEMM_SMALL_K_A1B0_TT -DGEMM_SMALL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +ifndef DGEMM_SMALL_K_B0_TT +DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c endif -$(KDIR)dgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_NN) +$(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_NT) +$(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_TN) 
+$(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)dgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_A1B0_TT) +$(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -4343,30 +4343,30 @@ $(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_ $(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -ifndef SGEMM_SMALL_K_A1B0_NN -SGEMM_SMALL_K_A1B0_NN = ../generic/gemm_small_matrix_kernel_a1b0_nn.c +ifndef SGEMM_SMALL_K_B0_NN +SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c endif -ifndef SGEMM_SMALL_K_A1B0_NT -SGEMM_SMALL_K_A1B0_NT = ../generic/gemm_small_matrix_kernel_a1b0_nt.c +ifndef SGEMM_SMALL_K_B0_NT +SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c endif -ifndef SGEMM_SMALL_K_A1B0_TN -SGEMM_SMALL_K_A1B0_TN = ../generic/gemm_small_matrix_kernel_a1b0_tn.c +ifndef SGEMM_SMALL_K_B0_TN +SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c endif -ifndef SGEMM_SMALL_K_A1B0_TT -SGEMM_SMALL_K_A1B0_TT = ../generic/gemm_small_matrix_kernel_a1b0_tt.c +ifndef SGEMM_SMALL_K_B0_TT +SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c endif -$(KDIR)sgemm_small_kernel_a1b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_NN) +$(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_a1b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_NT) +$(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_a1b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_TN) +$(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)sgemm_small_kernel_a1b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_A1B0_TT) +$(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ diff --git a/kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c b/kernel/generic/gemm_small_matrix_kernel_b0_nn.c similarity index 95% rename from kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c rename to kernel/generic/gemm_small_matrix_kernel_b0_nn.c index 8e3417027..3be918017 100644 --- a/kernel/generic/gemm_small_matrix_kernel_a1b0_nn.c +++ b/kernel/generic/gemm_small_matrix_kernel_b0_nn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) { //naive implementation //Column major @@ -41,7 +41,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT * B for(k=0; k<K; k++) { result += A[i + k * lda] * B[k + j * ldb]; } - C[i + j * ldc] = result; + C[i + j * ldc] = alpha * result; } From: Xianyi Zhang Date: Fri, 28 Aug 2020 21:00:54 +0800 Subject: [PATCH 359/681] Refs #2587 Add small matrix optimization reference kernel for c/zgemm.
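Complex GEMM needs a 4x4 dispatch table per type: each operand can be used as-is (N), transposed (T), conjugated without transpose (R) or conjugate-transposed (C), and the interface reuses the (transb << 2) | transa index from the real case. The reference kernels added here follow the same naive triple loop as the real ones; the sketch below shows the shape of the NN case, assuming interleaved real/imaginary storage and column-major layout (the conjugating variants differ only in negating ai and/or bi):

    #include "common.h"

    int CNAME(BLASLONG M, BLASLONG N, BLASLONG K,
              FLOAT * A, BLASLONG lda, FLOAT * alpha,
              FLOAT * B, BLASLONG ldb, FLOAT * beta,
              FLOAT * C, BLASLONG ldc)
    {
        BLASLONG i, j, k;
        FLOAT real, imag, ar, ai, br, bi, cr;

        for (i = 0; i < M; i++) {
            for (j = 0; j < N; j++) {
                real = 0.0;
                imag = 0.0;
                for (k = 0; k < K; k++) {
                    ar = A[2 * (i + k * lda)];
                    ai = A[2 * (i + k * lda) + 1];
                    br = B[2 * (k + j * ldb)];
                    bi = B[2 * (k + j * ldb) + 1];
                    /* accumulate (ar + ai*I) * (br + bi*I) */
                    real += ar * br - ai * bi;
                    imag += ar * bi + ai * br;
                }
                /* C = alpha * acc + beta * C, with complex alpha and
                 * beta passed as two-element arrays */
                cr = C[2 * (i + j * ldc)];
                C[2 * (i + j * ldc)] =
                    alpha[0] * real - alpha[1] * imag +
                    beta[0] * cr - beta[1] * C[2 * (i + j * ldc) + 1];
                C[2 * (i + j * ldc) + 1] =
                    alpha[0] * imag + alpha[1] * real +
                    beta[0] * C[2 * (i + j * ldc) + 1] + beta[1] * cr;
            }
        }
        return 0;
    }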
--- common_c.h | 40 +++ common_level3.h | 80 +++++ common_macro.h | 80 +++++ common_z.h | 40 +++ interface/gemm.c | 35 ++- kernel/Makefile.L3 | 293 ++++++++++++++++++ .../generic/zgemm_small_matrix_kernel_b0_nn.c | 74 +++++ .../generic/zgemm_small_matrix_kernel_b0_nt.c | 77 +++++ .../generic/zgemm_small_matrix_kernel_b0_tn.c | 77 +++++ .../generic/zgemm_small_matrix_kernel_b0_tt.c | 77 +++++ kernel/generic/zgemm_small_matrix_kernel_nn.c | 78 +++++ kernel/generic/zgemm_small_matrix_kernel_nt.c | 82 +++++ kernel/generic/zgemm_small_matrix_kernel_tn.c | 82 +++++ kernel/generic/zgemm_small_matrix_kernel_tt.c | 82 +++++ 14 files changed, 1193 insertions(+), 4 deletions(-) create mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_nn.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_nt.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_tn.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_tt.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_nn.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_nt.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_tn.c create mode 100644 kernel/generic/zgemm_small_matrix_kernel_tt.c diff --git a/common_c.h b/common_c.h index 40ecf5b8b..9388ece93 100644 --- a/common_c.h +++ b/common_c.h @@ -232,6 +232,46 @@ #define CGEADD_K cgeadd_k +#define CGEMM_SMALL_KERNEL_NN cgemm_small_kernel_nn +#define CGEMM_SMALL_KERNEL_NT cgemm_small_kernel_nt +#define CGEMM_SMALL_KERNEL_NR cgemm_small_kernel_nr +#define CGEMM_SMALL_KERNEL_NC cgemm_small_kernel_nc + +#define CGEMM_SMALL_KERNEL_TN cgemm_small_kernel_tn +#define CGEMM_SMALL_KERNEL_TT cgemm_small_kernel_tt +#define CGEMM_SMALL_KERNEL_TR cgemm_small_kernel_tr +#define CGEMM_SMALL_KERNEL_TC cgemm_small_kernel_tc + +#define CGEMM_SMALL_KERNEL_RN cgemm_small_kernel_rn +#define CGEMM_SMALL_KERNEL_RT cgemm_small_kernel_rt +#define CGEMM_SMALL_KERNEL_RR cgemm_small_kernel_rr +#define CGEMM_SMALL_KERNEL_RC cgemm_small_kernel_rc + +#define CGEMM_SMALL_KERNEL_CN cgemm_small_kernel_cn +#define CGEMM_SMALL_KERNEL_CT cgemm_small_kernel_ct +#define CGEMM_SMALL_KERNEL_CR cgemm_small_kernel_cr +#define CGEMM_SMALL_KERNEL_CC cgemm_small_kernel_cc + +#define CGEMM_SMALL_KERNEL_B0_NN cgemm_small_kernel_b0_nn +#define CGEMM_SMALL_KERNEL_B0_NT cgemm_small_kernel_b0_nt +#define CGEMM_SMALL_KERNEL_B0_NR cgemm_small_kernel_b0_nr +#define CGEMM_SMALL_KERNEL_B0_NC cgemm_small_kernel_b0_nc + +#define CGEMM_SMALL_KERNEL_B0_TN cgemm_small_kernel_b0_tn +#define CGEMM_SMALL_KERNEL_B0_TT cgemm_small_kernel_b0_tt +#define CGEMM_SMALL_KERNEL_B0_TR cgemm_small_kernel_b0_tr +#define CGEMM_SMALL_KERNEL_B0_TC cgemm_small_kernel_b0_tc + +#define CGEMM_SMALL_KERNEL_B0_RN cgemm_small_kernel_b0_rn +#define CGEMM_SMALL_KERNEL_B0_RT cgemm_small_kernel_b0_rt +#define CGEMM_SMALL_KERNEL_B0_RR cgemm_small_kernel_b0_rr +#define CGEMM_SMALL_KERNEL_B0_RC cgemm_small_kernel_b0_rc + +#define CGEMM_SMALL_KERNEL_B0_CN cgemm_small_kernel_b0_cn +#define CGEMM_SMALL_KERNEL_B0_CT cgemm_small_kernel_b0_ct +#define CGEMM_SMALL_KERNEL_B0_CR cgemm_small_kernel_b0_cr +#define CGEMM_SMALL_KERNEL_B0_CC cgemm_small_kernel_b0_cc + #else #define CAMAX_K gotoblas -> camax_k diff --git a/common_level3.h b/common_level3.h index 7be7ab06b..5741f56d5 100644 --- a/common_level3.h +++ b/common_level3.h @@ -536,6 +536,86 @@ int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLA int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, 
BLASLONG ldb, double * C, BLASLONG ldc); int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); + +int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); + +int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); + +int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); + +int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * 
beta, double * C, BLASLONG ldc); + +int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); + +int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); + +int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int 
cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int 
zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + #endif int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index fa7884180..2cccf9b39 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2093,6 +2093,46 @@ #define GEADD_K ZGEADD_K +#define GEMM_SMALL_KERNEL_NN ZGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT ZGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_NR ZGEMM_SMALL_KERNEL_NR +#define GEMM_SMALL_KERNEL_NC ZGEMM_SMALL_KERNEL_NC + +#define GEMM_SMALL_KERNEL_TN ZGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT ZGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_TR ZGEMM_SMALL_KERNEL_TR +#define GEMM_SMALL_KERNEL_TC ZGEMM_SMALL_KERNEL_TC + +#define GEMM_SMALL_KERNEL_RN ZGEMM_SMALL_KERNEL_RN +#define GEMM_SMALL_KERNEL_RT ZGEMM_SMALL_KERNEL_RT +#define GEMM_SMALL_KERNEL_RR ZGEMM_SMALL_KERNEL_RR +#define GEMM_SMALL_KERNEL_RC ZGEMM_SMALL_KERNEL_RC + +#define GEMM_SMALL_KERNEL_CN ZGEMM_SMALL_KERNEL_CN +#define GEMM_SMALL_KERNEL_CT ZGEMM_SMALL_KERNEL_CT +#define GEMM_SMALL_KERNEL_CR ZGEMM_SMALL_KERNEL_CR +#define GEMM_SMALL_KERNEL_CC ZGEMM_SMALL_KERNEL_CC + +#define GEMM_SMALL_KERNEL_B0_NN ZGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT ZGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_NR ZGEMM_SMALL_KERNEL_B0_NR +#define GEMM_SMALL_KERNEL_B0_NC ZGEMM_SMALL_KERNEL_B0_NC + +#define GEMM_SMALL_KERNEL_B0_TN ZGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT ZGEMM_SMALL_KERNEL_B0_TT +#define GEMM_SMALL_KERNEL_B0_TR ZGEMM_SMALL_KERNEL_B0_TR +#define GEMM_SMALL_KERNEL_B0_TC ZGEMM_SMALL_KERNEL_B0_TC + +#define GEMM_SMALL_KERNEL_B0_RN ZGEMM_SMALL_KERNEL_B0_RN +#define GEMM_SMALL_KERNEL_B0_RT ZGEMM_SMALL_KERNEL_B0_RT +#define GEMM_SMALL_KERNEL_B0_RR ZGEMM_SMALL_KERNEL_B0_RR +#define GEMM_SMALL_KERNEL_B0_RC ZGEMM_SMALL_KERNEL_B0_RC + +#define GEMM_SMALL_KERNEL_B0_CN ZGEMM_SMALL_KERNEL_B0_CN +#define GEMM_SMALL_KERNEL_B0_CT ZGEMM_SMALL_KERNEL_B0_CT +#define GEMM_SMALL_KERNEL_B0_CR ZGEMM_SMALL_KERNEL_B0_CR +#define GEMM_SMALL_KERNEL_B0_CC ZGEMM_SMALL_KERNEL_B0_CC + #else #define AMAX_K CAMAX_K @@ -2516,6 +2556,46 @@ #define GEADD_K CGEADD_K +#define GEMM_SMALL_KERNEL_NN CGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT CGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_NR CGEMM_SMALL_KERNEL_NR +#define GEMM_SMALL_KERNEL_NC CGEMM_SMALL_KERNEL_NC + +#define GEMM_SMALL_KERNEL_TN CGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT CGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_TR CGEMM_SMALL_KERNEL_TR +#define GEMM_SMALL_KERNEL_TC CGEMM_SMALL_KERNEL_TC + +#define GEMM_SMALL_KERNEL_RN CGEMM_SMALL_KERNEL_RN +#define GEMM_SMALL_KERNEL_RT CGEMM_SMALL_KERNEL_RT +#define GEMM_SMALL_KERNEL_RR CGEMM_SMALL_KERNEL_RR +#define GEMM_SMALL_KERNEL_RC CGEMM_SMALL_KERNEL_RC + +#define GEMM_SMALL_KERNEL_CN CGEMM_SMALL_KERNEL_CN +#define GEMM_SMALL_KERNEL_CT CGEMM_SMALL_KERNEL_CT +#define GEMM_SMALL_KERNEL_CR CGEMM_SMALL_KERNEL_CR +#define GEMM_SMALL_KERNEL_CC CGEMM_SMALL_KERNEL_CC + +#define GEMM_SMALL_KERNEL_B0_NN CGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT CGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_NR CGEMM_SMALL_KERNEL_B0_NR +#define GEMM_SMALL_KERNEL_B0_NC CGEMM_SMALL_KERNEL_B0_NC + +#define GEMM_SMALL_KERNEL_B0_TN CGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT CGEMM_SMALL_KERNEL_B0_TT +#define GEMM_SMALL_KERNEL_B0_TR CGEMM_SMALL_KERNEL_B0_TR +#define 
GEMM_SMALL_KERNEL_B0_TC CGEMM_SMALL_KERNEL_B0_TC + +#define GEMM_SMALL_KERNEL_B0_RN CGEMM_SMALL_KERNEL_B0_RN +#define GEMM_SMALL_KERNEL_B0_RT CGEMM_SMALL_KERNEL_B0_RT +#define GEMM_SMALL_KERNEL_B0_RR CGEMM_SMALL_KERNEL_B0_RR +#define GEMM_SMALL_KERNEL_B0_RC CGEMM_SMALL_KERNEL_B0_RC + +#define GEMM_SMALL_KERNEL_B0_CN CGEMM_SMALL_KERNEL_B0_CN +#define GEMM_SMALL_KERNEL_B0_CT CGEMM_SMALL_KERNEL_B0_CT +#define GEMM_SMALL_KERNEL_B0_CR CGEMM_SMALL_KERNEL_B0_CR +#define GEMM_SMALL_KERNEL_B0_CC CGEMM_SMALL_KERNEL_B0_CC + #endif #endif diff --git a/common_z.h b/common_z.h index f1e78dd08..8594ec74d 100644 --- a/common_z.h +++ b/common_z.h @@ -232,6 +232,46 @@ #define ZGEADD_K zgeadd_k +#define ZGEMM_SMALL_KERNEL_NN zgemm_small_kernel_nn +#define ZGEMM_SMALL_KERNEL_NT zgemm_small_kernel_nt +#define ZGEMM_SMALL_KERNEL_NR zgemm_small_kernel_nr +#define ZGEMM_SMALL_KERNEL_NC zgemm_small_kernel_nc + +#define ZGEMM_SMALL_KERNEL_TN zgemm_small_kernel_tn +#define ZGEMM_SMALL_KERNEL_TT zgemm_small_kernel_tt +#define ZGEMM_SMALL_KERNEL_TR zgemm_small_kernel_tr +#define ZGEMM_SMALL_KERNEL_TC zgemm_small_kernel_tc + +#define ZGEMM_SMALL_KERNEL_RN zgemm_small_kernel_rn +#define ZGEMM_SMALL_KERNEL_RT zgemm_small_kernel_rt +#define ZGEMM_SMALL_KERNEL_RR zgemm_small_kernel_rr +#define ZGEMM_SMALL_KERNEL_RC zgemm_small_kernel_rc + +#define ZGEMM_SMALL_KERNEL_CN zgemm_small_kernel_cn +#define ZGEMM_SMALL_KERNEL_CT zgemm_small_kernel_ct +#define ZGEMM_SMALL_KERNEL_CR zgemm_small_kernel_cr +#define ZGEMM_SMALL_KERNEL_CC zgemm_small_kernel_cc + +#define ZGEMM_SMALL_KERNEL_B0_NN zgemm_small_kernel_b0_nn +#define ZGEMM_SMALL_KERNEL_B0_NT zgemm_small_kernel_b0_nt +#define ZGEMM_SMALL_KERNEL_B0_NR zgemm_small_kernel_b0_nr +#define ZGEMM_SMALL_KERNEL_B0_NC zgemm_small_kernel_b0_nc + +#define ZGEMM_SMALL_KERNEL_B0_TN zgemm_small_kernel_b0_tn +#define ZGEMM_SMALL_KERNEL_B0_TT zgemm_small_kernel_b0_tt +#define ZGEMM_SMALL_KERNEL_B0_TR zgemm_small_kernel_b0_tr +#define ZGEMM_SMALL_KERNEL_B0_TC zgemm_small_kernel_b0_tc + +#define ZGEMM_SMALL_KERNEL_B0_RN zgemm_small_kernel_b0_rn +#define ZGEMM_SMALL_KERNEL_B0_RT zgemm_small_kernel_b0_rt +#define ZGEMM_SMALL_KERNEL_B0_RR zgemm_small_kernel_b0_rr +#define ZGEMM_SMALL_KERNEL_B0_RC zgemm_small_kernel_b0_rc + +#define ZGEMM_SMALL_KERNEL_B0_CN zgemm_small_kernel_b0_cn +#define ZGEMM_SMALL_KERNEL_B0_CT zgemm_small_kernel_b0_ct +#define ZGEMM_SMALL_KERNEL_B0_CR zgemm_small_kernel_b0_cr +#define ZGEMM_SMALL_KERNEL_B0_CC zgemm_small_kernel_b0_cc + #else #define ZAMAX_K gotoblas -> zamax_k diff --git a/interface/gemm.c b/interface/gemm.c index 3730f37fa..b73baa9bd 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -124,6 +124,28 @@ static int (*gemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLAS #endif #endif }; + +static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *,FLOAT *, BLASLONG, FLOAT *, FLOAT *, BLASLONG) = { +#ifndef GEMM3M +#ifdef COMPLEX + GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, + GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, + GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR, + GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC, +#endif +#endif +}; + +static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { +#ifndef GEMM3M +#ifdef COMPLEX + GEMM_SMALL_KERNEL_B0_NN, 
GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, + GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR, + GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, +#endif +#endif +}; #endif #ifndef CBLAS @@ -446,20 +468,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif #ifdef SMALL_MATRIX_OPT -#if !defined(COMPLEX) //need to tune small matrices cases. if(MNK <= 100.0*100.0*100.0){ - + +#if !defined(COMPLEX) if(*(FLOAT *)(args.beta) == 0.0){ (gemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); }else{ (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); } - +#else + if(beta[0] == 0.0 && beta[1] == 0.0){ + (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, (FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); + }else{ + (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, (FLOAT *)(args.alpha), args.b, args.ldb, (FLOAT *)(args.beta), args.c, args.ldc); + } +#endif return; } #endif -#endif buffer = (XFLOAT *)blas_memory_alloc(0); diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index c9544086a..1c4a00158 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -462,6 +462,42 @@ DBLASOBJS += \ dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) +CBLASOBJS += \ + cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) + +ZBLASOBJS += \ + zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ + 
zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) + endif ###### BLAS extensions ##### @@ -4370,3 +4406,260 @@ $(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL $(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + + +ifndef CGEMM_SMALL_K_NN +CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef CGEMM_SMALL_K_NT +CGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef CGEMM_SMALL_K_TN +CGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef CGEMM_SMALL_K_TT +CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@ + +$(KDIR)cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@ + +$(KDIR)cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $@ + +$(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@ + +$(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $@ + +$(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + +ifndef CGEMM_SMALL_K_B0_NN +CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_b0_nn.c +endif + +ifndef CGEMM_SMALL_K_B0_NT +CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_b0_nt.c +endif + +ifndef CGEMM_SMALL_K_B0_TN +CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_b0_tn.c +endif + +ifndef CGEMM_SMALL_K_B0_TT +CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_b0_tt.c +endif + +$(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + +ifndef ZGEMM_SMALL_K_NN +ZGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef ZGEMM_SMALL_K_NT 
+ZGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef ZGEMM_SMALL_K_TN +ZGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef ZGEMM_SMALL_K_TT +ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@ + +$(KDIR)zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@ + +$(KDIR)zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $@ + +$(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@ + +$(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $@ + +$(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + +ifndef ZGEMM_SMALL_K_B0_NN +ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_b0_nn.c +endif + +ifndef ZGEMM_SMALL_K_B0_NT +ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_b0_nt.c +endif + +ifndef ZGEMM_SMALL_K_B0_TN +ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_b0_tn.c +endif + +ifndef ZGEMM_SMALL_K_B0_TT +ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_b0_tt.c +endif + +$(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@ + 
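+# Note: the b0 rules above and below compile the same four generic sources
+# as the beta!=0 kernels; only the -D<VARIANT> macro differs, so the
+# transpose/conjugation combination is resolved entirely at compile time.
+# A port can substitute an optimized kernel for any variant by defining the
+# corresponding variable before the ifndef defaults above are reached, e.g.
+# (hypothetical file name, for illustration only):
+#   ZGEMM_SMALL_K_B0_NN = zgemm_small_kernel_b0_nn_skylakex.c
+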
+$(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c b/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c new file mode 100644 index 000000000..11e746e52 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(NN) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + -A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + + A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(NR) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + +A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + + A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(RN) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + +A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + - A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(RR) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + -A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + - A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#endif + } + + C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c b/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c new file mode 100644 index 000000000..1ef743017 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(NT) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + -A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + + A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(NC) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + +A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + + A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(RT) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + +A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + - A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(RC) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + -A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + - A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#endif + } + + C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c b/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c new file mode 100644 index 000000000..2cd3ebcf2 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(TN) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + -A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + + A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(TR) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + +A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + + A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(CN) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + +A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + - A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(CR) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + -A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + - A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#endif + } + + C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c b/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c new file mode 100644 index 000000000..25b05b4aa --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(TT) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + -A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + + A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(TC) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + +A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + + A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(CT) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + +A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + - A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(CC) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + -A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + - A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#endif + } + + C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_nn.c b/kernel/generic/zgemm_small_matrix_kernel_nn.c new file mode 100644 index 000000000..6ef1b9655 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_nn.c @@ -0,0 +1,78 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + FLOAT tmp0, tmp1; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(NN) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + -A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + + A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(NR) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + +A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + + A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(RN) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + +A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + - A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#elif defined(RR) + real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] + -A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] + - A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); +#endif + } + + tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + + + C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_nt.c b/kernel/generic/zgemm_small_matrix_kernel_nt.c new file mode 100644 index 000000000..3c81ad79e --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_nt.c @@ -0,0 +1,82 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + FLOAT tmp0, tmp1; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(NT) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + -A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + + A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(NC) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + +A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + + A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(RT) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + +A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + - A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#elif defined(RC) + real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] + -A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] + - A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); + +#endif + } + + tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + + + C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_tn.c b/kernel/generic/zgemm_small_matrix_kernel_tn.c new file mode 100644 index 000000000..143190bb1 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_tn.c @@ -0,0 +1,82 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + FLOAT tmp0, tmp1; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(TN) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + -A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + + A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(TR) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + +A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + + A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(CN) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + +A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + - A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#elif defined(CR) + real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] + -A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] + - A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); + +#endif + } + + tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + + + C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_kernel_tt.c b/kernel/generic/zgemm_small_matrix_kernel_tt.c new file mode 100644 index 000000000..246e26e84 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_kernel_tt.c @@ -0,0 +1,82 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +{ + FLOAT real, imag; + FLOAT tmp0, tmp1; + int i, j, l; + for(i = 0; i < M; i++){ + for(j = 0; j < N; j++){ + real=0; + imag=0; + + for(l = 0; l < K; l++){ +#if defined(TT) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + -A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + + A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(TC) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + +A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + + A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(CT) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + +A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + - A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#elif defined(CC) + real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j] + -A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]); + + imag+=(-A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1] + - A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]); + +#endif + } + + tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + + + C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + } + } + + return 0; +} From 6022e5629c7708b114a3c2387e652ebd32122300 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 28 Aug 2020 22:36:36 +0800 Subject: [PATCH 360/681] Refs #2587 fix small matrix c/zgemm bug. 
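This patch changes the complex small-matrix kernel interface: alpha and beta
are now passed as separate real/imaginary scalars (alpha0/alpha1,
beta0/beta1) instead of FLOAT pointers, matching how the real-valued small
kernels already receive a scalar alpha. Below is a minimal reference sketch
of the arithmetic the generic kernels implement, assuming column-major
interleaved complex storage; zgemm_small_ref and the conja/conjb flags are
illustrative names for this sketch, not part of the OpenBLAS API:

    #include <stdio.h>

    /* Reference sketch only (not OpenBLAS code): one complex GEMM update
     * C = beta*C + alpha*A*B for the non-transposed case, with conja/conjb
     * modelling the R (conjugate, no transpose) variants by negating the
     * imaginary parts, exactly as the NN/NR/RN/RR kernels do with explicit
     * sign changes. */
    static void zgemm_small_ref(int M, int N, int K,
                                const double *A, int lda, int conja,
                                const double *B, int ldb, int conjb,
                                double alpha0, double alpha1,
                                double beta0, double beta1,
                                double *C, int ldc)
    {
        for (int i = 0; i < M; i++) {
            for (int j = 0; j < N; j++) {
                double real = 0.0, imag = 0.0;
                for (int l = 0; l < K; l++) {
                    double ar = A[l*2*lda + 2*i];        /* Re A(i,l) */
                    double ai = A[l*2*lda + 2*i + 1];    /* Im A(i,l) */
                    double br = B[j*2*ldb + 2*l];        /* Re B(l,j) */
                    double bi = B[j*2*ldb + 2*l + 1];    /* Im B(l,j) */
                    if (conja) ai = -ai;
                    if (conjb) bi = -bi;
                    real += ar*br - ai*bi;
                    imag += ar*bi + ai*br;
                }
                /* complex beta*C, then add complex alpha*(A*B) */
                double t0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc + 2*i + 1];
                double t1 = beta0*C[j*2*ldc + 2*i + 1] + beta1*C[j*2*ldc + 2*i];
                C[j*2*ldc + 2*i]     = t0 + alpha0*real - alpha1*imag;
                C[j*2*ldc + 2*i + 1] = t1 + alpha0*imag + alpha1*real;
            }
        }
    }

    int main(void)
    {
        /* (1+2i)*(3+4i) = -5+10i with alpha = 1, beta = 0 */
        double A[2] = {1.0, 2.0}, B[2] = {3.0, 4.0}, C[2] = {0.0, 0.0};
        zgemm_small_ref(1, 1, 1, A, 1, 0, B, 1, 0, 1.0, 0.0, 0.0, 0.0, C, 1);
        printf("C = %g%+gi\n", C[0], C[1]);   /* prints C = -5+10i */
        return 0;
    }

For the transposed variants (_tn/_tt) A is indexed as A[i*2*lda + 2*l]
instead, and the b0 kernels omit the beta terms entirely so that C is
written without ever being read.
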
--- common_level3.h | 150 +++++++++--------- interface/gemm.c | 22 ++- .../generic/zgemm_small_matrix_kernel_b0_nn.c | 6 +- .../generic/zgemm_small_matrix_kernel_b0_nt.c | 6 +- .../generic/zgemm_small_matrix_kernel_b0_tn.c | 6 +- .../generic/zgemm_small_matrix_kernel_b0_tt.c | 6 +- kernel/generic/zgemm_small_matrix_kernel_nn.c | 10 +- kernel/generic/zgemm_small_matrix_kernel_nt.c | 10 +- kernel/generic/zgemm_small_matrix_kernel_tn.c | 10 +- kernel/generic/zgemm_small_matrix_kernel_tt.c | 10 +- 10 files changed, 116 insertions(+), 120 deletions(-) diff --git a/common_level3.h b/common_level3.h index 5741f56d5..a3a487dab 100644 --- a/common_level3.h +++ b/common_level3.h @@ -536,85 +536,85 @@ int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLA int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); +int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); -int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); - -int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); 
-int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); - -int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); -int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * beta, float * C, BLASLONG ldc); - -int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); +int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float 
alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); -int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); - -int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); - -int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); -int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * beta, double * C, BLASLONG ldc); - -int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_nt(BLASLONG m, 
BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_tn(BLASLONG 
m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); - -int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); - -int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); -int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float * alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); - -int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int 
cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); - -int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); - -int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); -int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, 
BLASLONG k, double * A, BLASLONG lda, double * alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); #endif diff --git a/interface/gemm.c b/interface/gemm.c index b73baa9bd..7251993ee 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -106,47 +106,43 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B }; #ifdef SMALL_MATRIX_OPT -//Only support s/dgemm small matrix optimiztion so far. 
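Note on the prototype change above: the complex small-matrix kernels now take alpha and beta as pairs of real scalars (alpha0/alpha1, beta0/beta1) instead of FLOAT pointers, so the values can be passed in registers and the kernels no longer dereference anything per call. A minimal scalar sketch of the update each of these kernels performs on one element of C; the names update_c, c_re, c_im, re, im are illustrative, not part of the patch:

/* C(i,j) = beta*C(i,j) + alpha*(A*B)(i,j), complex, split into real parts.
 * re/im are the accumulated real and imaginary parts of (A*B)(i,j);
 * storage is interleaved re/im, so element idx lives at C[2*idx]. */
static void update_c(double *C, long idx, double re, double im,
                     double alpha0, double alpha1, double beta0, double beta1)
{
    double c_re = C[2*idx];
    double c_im = C[2*idx + 1];
    C[2*idx]     = beta0*c_re - beta1*c_im + alpha0*re - alpha1*im;
    C[2*idx + 1] = beta0*c_im + beta1*c_re + alpha0*im + alpha1*re;
}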
+ +#ifndef COMPLEX static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { #ifndef GEMM3M -#ifndef COMPLEX GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, NULL, NULL, GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, NULL, NULL, #endif -#endif }; static int (*gemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { #ifndef GEMM3M -#ifndef COMPLEX GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, NULL, NULL, GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, NULL, NULL, #endif -#endif }; -static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *,FLOAT *, BLASLONG, FLOAT *, FLOAT *, BLASLONG) = { +#else + +static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG) = { #ifndef GEMM3M -#ifdef COMPLEX GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR, GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC, #endif -#endif }; -static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { +static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { #ifndef GEMM3M -#ifdef COMPLEX GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR, GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, #endif -#endif }; #endif +#endif #ifndef CBLAS @@ -479,9 +475,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS } #else if(beta[0] == 0.0 && beta[1] == 0.0){ - (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, (FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); + (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc); }else{ - (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, (FLOAT *)(args.alpha), args.b, args.ldb, (FLOAT *)(args.beta), args.c, args.ldc); + (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc); } #endif return; diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c b/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c index 11e746e52..3ab057fef 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_nn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
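The interface/gemm.c change above hoists the real/complex choice out of each table into a single #ifndef COMPLEX, and the call sites now unpack alpha[0]/alpha[1] (and beta[0]/beta[1]) before dispatch. A small sketch of the dispatch arithmetic, assuming the encoding implied by the table layout (N=0, T=1, R=2, C=3); the helper names are illustrative:

/* Sketch only: index = (transb << 2) | transa walks the 16-entry tables
 * above row by row, e.g. transa = T (1), transb = R (2) -> index 9,
 * the GEMM_SMALL_KERNEL_TR slot.  beta == 0+0i routes to the _b0 table,
 * whose kernels never read the old contents of C. */
static inline int small_kernel_index(int transa, int transb)
{
    return (transb << 2) | transa;
}

static inline int beta_is_zero(const double *beta)
{
    return beta[0] == 0.0 && beta[1] == 0.0;
}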
#include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; @@ -65,8 +65,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c b/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c index 1ef743017..dc35f4a6d 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_nt.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; int i, j, l; @@ -68,8 +68,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c b/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c index 2cd3ebcf2..479a56e8f 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_tn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; int i, j, l; @@ -68,8 +68,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c b/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c index 25b05b4aa..b698973dd 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c +++ b/kernel/generic/zgemm_small_matrix_kernel_b0_tt.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; int i, j, l; @@ -68,8 +68,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - C[j*2*ldc + 2*i] = alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_nn.c b/kernel/generic/zgemm_small_matrix_kernel_nn.c index 6ef1b9655..4bf6bf7ee 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_nn.c +++ b/kernel/generic/zgemm_small_matrix_kernel_nn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; FLOAT tmp0, tmp1; @@ -65,12 +65,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; - tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; - C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_nt.c b/kernel/generic/zgemm_small_matrix_kernel_nt.c index 3c81ad79e..288e49c13 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_nt.c +++ b/kernel/generic/zgemm_small_matrix_kernel_nt.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; FLOAT tmp0, tmp1; @@ -69,12 +69,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; - tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; - C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_tn.c b/kernel/generic/zgemm_small_matrix_kernel_tn.c index 143190bb1..1e2a5aed4 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_tn.c +++ b/kernel/generic/zgemm_small_matrix_kernel_tn.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; FLOAT tmp0, tmp1; @@ -69,12 +69,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; - tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; - C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; } } diff --git a/kernel/generic/zgemm_small_matrix_kernel_tt.c b/kernel/generic/zgemm_small_matrix_kernel_tt.c index 246e26e84..180043539 100644 --- a/kernel/generic/zgemm_small_matrix_kernel_tt.c +++ b/kernel/generic/zgemm_small_matrix_kernel_tt.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* alpha, FLOAT * B, BLASLONG ldb, FLOAT* beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) { FLOAT real, imag; FLOAT tmp0, tmp1; @@ -69,12 +69,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT* al #endif } - tmp0 = beta[0]*C[j*2*ldc + 2*i] - beta[1]*C[j*2*ldc+ 2*i + 1]; - tmp1 = beta[0]*C[j*2*ldc+ 2*i + 1] + beta[1]*C[j*2*ldc + 2*i]; + tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; + tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; - C[j*2*ldc + 2*i] =tmp0+ alpha[0]*real - alpha[1]*imag; - C[j*2*ldc+ 2*i + 1] = tmp1+ alpha[0]*imag + real*alpha[1]; + C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; + C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; } } From 9186456a1297f7ee97bae56370c404114933a5ee Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Sat, 8 May 2021 10:45:10 +0000 Subject: [PATCH 361/681] small matrix: SkylakeX: add SGEMM NN kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../sgemm_small_kernel_b0_nn_skylakex.c | 2 + .../x86_64/sgemm_small_kernel_nn_skylakex.c | 424 ++++++++++++++++++ 3 files changed, 428 insertions(+) create mode 100644 kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c create mode 100644 kernel/x86_64/sgemm_small_kernel_nn_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 3d71584fe..1a2e67b52 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -10,6 +10,8 @@ STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_skylakex.c +SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_b0_nn_skylakex.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c diff --git a/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c new file mode 100644 index 000000000..704e964b8 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sgemm_small_kernel_nn_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c new file mode 100644 index 000000000..f2c79873e --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -0,0 +1,424 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <immintrin.h>
+#include "common.h"
+#include <stdio.h>
+
+#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps()
+#define LOAD_A_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[lda * k + i + (M*16)])
+#define BROADCAST_LOAD_B_512(M, N) __m512 Bval##N = _mm512_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)]))
+#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N)
+#if defined(B0)
+#define STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \
+	_mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N)
+#else
+#define STORE_512(M, N) \
+	BLASLONG offset##M##N = (j+N)*ldc + i + (M*16); \
+	result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \
+	asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_512)); \
+	_mm512_storeu_ps(&C[offset##M##N], result##M##N)
+#endif
+
+#define DECLARE_RESULT_256(M, N) __m256 result##M##N = _mm256_setzero_ps()
+#define LOAD_A_256(M, N) __m256 Aval##M = _mm256_loadu_ps(&A[lda * k + i + (M*8)])
+#define BROADCAST_LOAD_B_256(M, N) __m256 Bval##N = _mm256_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)]))
+#define MATMUL_256(M, N) result##M##N = _mm256_fmadd_ps(Aval##M, Bval##N, result##M##N)
+#if defined(B0)
+#define STORE_256(M, N) result##M##N = _mm256_mul_ps(result##M##N, alpha_256); \
+	_mm256_storeu_ps(&C[(j+N)*ldc + i + (M*8)], result##M##N)
+#else
+#define STORE_256(M, N) \
+	BLASLONG offset##M##N = (j+N)*ldc + i + (M*8); \
+	result##M##N = _mm256_mul_ps(result##M##N, alpha_256); \
+	asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_256)); \
+	_mm256_storeu_ps(&C[offset##M##N], result##M##N)
+#endif
+
+#define DECLARE_RESULT_128(M, N) __m128 result##M##N; asm("vpxorq %0, %0, %0": "+v"(result##M##N):)
+#define LOAD_A_128(M, N) __m128 Aval##M = _mm_maskz_loadu_ps(mask, &A[lda * k + i + (M*4)])
+#define BROADCAST_LOAD_B_128(M, N) __m128 Bval##N = _mm_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)]))
+#define MATMUL_128(M, N) result##M##N = _mm_fmadd_ps(Aval##M, Bval##N, result##M##N)
+#if defined(B0)
+#define STORE_128(M, N) result##M##N = _mm_maskz_mul_ps(mask, result##M##N, alpha_128); \
+	_mm_mask_storeu_ps(&C[(j+N)*ldc + i + (M*4)], mask, result##M##N)
+#else
+#define STORE_128(M, N) \
+	BLASLONG offset##M##N = (j+N)*ldc + i + (M*4); \
+	result##M##N = _mm_maskz_mul_ps(mask, result##M##N, alpha_128); \
+	asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_128)); \
+	_mm_mask_storeu_ps(&C[offset##M##N], mask, result##M##N)
+#endif
+
+#define DECLARE_RESULT_S(M, N) float result##M##N = 0;
+#define LOAD_A_S(M, N) float Aval##M = A[lda * k + i + M]
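A plain-C sketch of what the non-B0 STORE macros above compute per lane; the inline asm apparently exists to keep C as a memory operand of vfmadd231ps instead of a separate named load. store_tile_ref and acc are illustrative names, not patch code:

/* STORE_512(M, N) with beta != 0, one 16-float tile:
 *   result = alpha * result          (vmulps)
 *   result += beta * C[offset..]     (vfmadd231ps, C read straight from memory)
 *   C[offset..] = result             (vmovups)                               */
static void store_tile_ref(float *C, long offset, const float *acc,
                           float alpha, float beta)
{
    for (int lane = 0; lane < 16; lane++)
        C[offset + lane] = beta * C[offset + lane] + alpha * acc[lane];
}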
+#define BROADCAST_LOAD_B_S(M, N) float Bval##N = B[k + ldb * (j+N)] +#define MATMUL_S(M, N) result##M##N += Aval##M * Bval##N +#if defined(B0) +#define STORE_S(M, N) C[(j+N)*ldc + i + M] = result##M##N * alpha +#else +#define STORE_S(M, N) C[(j+N)*ldc + i + M] = result##M##N * alpha + C[(j+N)*ldc + i + M] * beta +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m64 = M & ~63; + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + __mmask8 mask = 0xff; // just use to avoid SSE instruction + + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); +#endif + + for (i = 0; i < m64; i += 64) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m32; i += 32) { + for (j = 
0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + __m256 alpha_256 = _mm256_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); +#endif + for (; i < m8; i += 8) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_256(0, 0); + DECLARE_RESULT_256(0, 1); + DECLARE_RESULT_256(0, 2); + DECLARE_RESULT_256(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_256(0, x); + BROADCAST_LOAD_B_256(x, 0); BROADCAST_LOAD_B_256(x, 1); + BROADCAST_LOAD_B_256(x, 2); BROADCAST_LOAD_B_256(x, 3); + + MATMUL_256(0, 0); + MATMUL_256(0, 1); + MATMUL_256(0, 2); + MATMUL_256(0, 3); + } + STORE_256(0, 0); + STORE_256(0, 1); + STORE_256(0, 2); + STORE_256(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_256(0, 0); + DECLARE_RESULT_256(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_256(0, x); + BROADCAST_LOAD_B_256(x, 0); BROADCAST_LOAD_B_256(x, 1); + MATMUL_256(0, 0); + MATMUL_256(0, 1); + } + STORE_256(0, 0); + STORE_256(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_256(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_256(0, x); + BROADCAST_LOAD_B_256(x, 0); + MATMUL_256(0, 0); + } + STORE_256(0, 0); + } + } + __m128 alpha_128 = _mm_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + 
__m128 beta_128 = _mm_broadcastss_ps(_mm_load_ss(&beta)); +#endif + for (; i < m4; i += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_128(0, 0); + DECLARE_RESULT_128(0, 1); + DECLARE_RESULT_128(0, 2); + DECLARE_RESULT_128(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_128(0, x); + BROADCAST_LOAD_B_128(x, 0); BROADCAST_LOAD_B_128(x, 1); + BROADCAST_LOAD_B_128(x, 2); BROADCAST_LOAD_B_128(x, 3); + + MATMUL_128(0, 0); + MATMUL_128(0, 1); + MATMUL_128(0, 2); + MATMUL_128(0, 3); + } + STORE_128(0, 0); + STORE_128(0, 1); + STORE_128(0, 2); + STORE_128(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_128(0, 0); + DECLARE_RESULT_128(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_128(0, x); + BROADCAST_LOAD_B_128(x, 0); BROADCAST_LOAD_B_128(x, 1); + MATMUL_128(0, 0); + MATMUL_128(0, 1); + } + STORE_128(0, 0); + STORE_128(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_128(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_128(0, x); + BROADCAST_LOAD_B_128(x, 0); + MATMUL_128(0, 0); + } + STORE_128(0, 0); + } + } + for (; i < m2; i += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); + DECLARE_RESULT_S(0, 1); DECLARE_RESULT_S(1, 1); + DECLARE_RESULT_S(0, 2); DECLARE_RESULT_S(1, 2); + DECLARE_RESULT_S(0, 3); DECLARE_RESULT_S(1, 3); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); LOAD_A_S(1, x); + BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); + BROADCAST_LOAD_B_S(x, 2); BROADCAST_LOAD_B_S(x, 3); + + MATMUL_S(0, 0); MATMUL_S(1, 0); + MATMUL_S(0, 1); MATMUL_S(1, 1); + MATMUL_S(0, 2); MATMUL_S(1, 2); + MATMUL_S(0, 3); MATMUL_S(1, 3); + } + STORE_S(0, 0); STORE_S(1, 0); + STORE_S(0, 1); STORE_S(1, 1); + STORE_S(0, 2); STORE_S(1, 2); + STORE_S(0, 3); STORE_S(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); + DECLARE_RESULT_S(0, 1); DECLARE_RESULT_S(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); LOAD_A_S(1, x); + BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); + MATMUL_S(0, 0); MATMUL_S(1, 0); + MATMUL_S(0, 1); MATMUL_S(1, 1); + } + STORE_S(0, 0); STORE_S(1, 0); + STORE_S(0, 1); STORE_S(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); LOAD_A_S(1, x); + BROADCAST_LOAD_B_S(x, 0); + MATMUL_S(0, 0); MATMUL_S(1, 0); + } + STORE_S(0, 0); STORE_S(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_S(0, 0); + DECLARE_RESULT_S(0, 1); + DECLARE_RESULT_S(0, 2); + DECLARE_RESULT_S(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); + BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); + BROADCAST_LOAD_B_S(x, 2); BROADCAST_LOAD_B_S(x, 3); + + MATMUL_S(0, 0); + MATMUL_S(0, 1); + MATMUL_S(0, 2); + MATMUL_S(0, 3); + } + STORE_S(0, 0); + STORE_S(0, 1); + STORE_S(0, 2); + STORE_S(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_S(0, 0); + DECLARE_RESULT_S(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); + BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); + MATMUL_S(0, 0); + MATMUL_S(0, 1); + } + STORE_S(0, 0); + STORE_S(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_S(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_S(0, x); LOAD_A_S(1, x); + BROADCAST_LOAD_B_S(x, 0); + MATMUL_S(0, 0); + } + STORE_S(0, 0); + } + } +} From f88470323bdb72a1e3ac54717606810699319d3b Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Sat, 8 May 2021 15:59:14 +0000 Subject: [PATCH 362/681] Optimize M < 16 using AVX512 mask --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) 
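The patch below starts replacing the 256/128-bit fringe loops for M - i < 16 with AVX-512 masked loads and stores: an (M - i)-bit mask covers exactly the live rows, so one 512-bit code path handles every row remainder without a scalar cleanup loop. A minimal sketch of the mask idiom, assuming a remainder below 16; masked_copy_tail is an illustrative name:

#include <immintrin.h>

static void masked_copy_tail(const float *src, float *dst, int remainder)
{
    __mmask16 mask = (__mmask16)((1UL << remainder) - 1);
    __m512 v = _mm512_maskz_loadu_ps(mask, src); /* dead lanes read as 0.0f */
    _mm512_mask_storeu_ps(dst, mask, v);         /* dead lanes left untouched */
}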
diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index f2c79873e..f0b6d63a6 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -31,17 +31,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() #define LOAD_A_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[lda * k + i + (M*16)]) +#define MASK_LOAD_A_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[lda * k + i + (M*16)]) #define BROADCAST_LOAD_B_512(M, N) __m512 Bval##N = _mm512_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) #define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) #if defined(B0) #define STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) #else #define STORE_512(M, N) \ BLASLONG offset##M##N = (j+N)*ldc + i + (M*16); \ result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_512)); \ _mm512_storeu_ps(&C[offset##M##N], result##M##N) +#define MASK_STORE_512(M, N) \ + BLASLONG offset##M##N = (j+N)*ldc + i + (M*16); \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1, %2, 4), %3, %0 %{%4%}": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_ps(&C[offset##M##N], mask, result##M##N) #endif #define DECLARE_RESULT_256(M, N) __m256 result##M##N = _mm256_setzero_ps() @@ -241,6 +249,51 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp STORE_512(0, 0); } } + if (M - i > 0) { + register __mmask16 mask asm("k1") = (1UL << (M - i)) - 1; + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + return; + } __m256 alpha_256 = _mm256_broadcastss_ps(_mm_load_ss(&alpha)); #if !defined(B0) __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); From 49b61a3f3027e24f19e78e573e50c86432aec574 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 11 May 2021 10:24:10 +0000 Subject: [PATCH 363/681] Small Matrix: skylakex: sgemm_nn: optimize for M <= 8 --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 302 +++++++++++++++++- 1 file changed, 301 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c 
b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
index f0b6d63a6..ae4a9daa3 100644
--- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
+++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <immintrin.h>
 #include "common.h"
 #include <stdio.h>
+#include <memory.h>
 
 #define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps()
 #define LOAD_A_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[lda * k + i + (M*16)])
@@ -52,6 +53,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	_mm512_mask_storeu_ps(&C[offset##M##N], mask, result##M##N)
 #endif
 
+#define LOAD_KA_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&mbuf[(mi + M)*K + k]);
+#define LOAD_KB_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[(j + N)*ldb + k])
+#define MASK_LOAD_KA_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &mbuf[(mi + M)*K + k])
+#define MASK_LOAD_KB_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[(j + N)*ldb + k])
+#if defined(B0)
+#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N);
+#else
+#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M];
+#endif
+
+
+
 #define DECLARE_RESULT_256(M, N) __m256 result##M##N = _mm256_setzero_ps()
 #define LOAD_A_256(M, N) __m256 Aval##M = _mm256_loadu_ps(&A[lda * k + i + (M*8)])
 #define BROADCAST_LOAD_B_256(M, N) __m256 Bval##N = _mm256_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)]))
@@ -249,7 +262,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
             STORE_512(0, 0);
         }
     }
-    if (M - i > 0) {
+    if (M - i > 8) {
         register __mmask16 mask asm("k1") = (1UL << (M - i)) - 1;
         for (j = 0; j < n4; j += 4) {
@@ -294,6 +307,293 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
         }
         return;
     }
+    int mm = M - i;
+    if (mm) {
+        FLOAT *mbuf = (FLOAT *) malloc(sizeof(FLOAT)*mm*K);
+        __mmask8 mask8 = (1UL << mm) - 1;
+        __mmask16 mask;
+        BLASLONG k16 = K & ~15;
+        BLASLONG k8 = K & ~7;
+        for (k = 0; k < k8; k += 8) {
+            __m256 r0, r1, r2, r3, r4, r5, r6, r7;
+            __m256 t0, t1, t2, t3, t4, t5, t6, t7;
+            r0 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(0 + k)]);
+            r1 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(1 + k)]);
+            r2 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(2 + k)]);
+            r3 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(3 + k)]);
+            r4 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(4 + k)]);
+            r5 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(5 + k)]);
+            r6 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(6 + k)]);
+            r7 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(7 + k)]);
+
+            t0 = _mm256_unpacklo_ps(r0, r1);
+            t1 = _mm256_unpackhi_ps(r0, r1);
+            t2 = _mm256_unpacklo_ps(r2, r3);
+            t3 = _mm256_unpackhi_ps(r2, r3);
+            t4 = _mm256_unpacklo_ps(r4, r5);
+            t5 = _mm256_unpackhi_ps(r4, r5);
+            t6 = _mm256_unpacklo_ps(r6, r7);
+            t7 = _mm256_unpackhi_ps(r6, r7);
+
+            r0 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(1,0,1,0));
+            r1 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(3,2,3,2));
+            r2 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0));
+            r3 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2));
+            r4 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(1,0,1,0));
+            r5 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(3,2,3,2));
+            r6 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0));
+            r7 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2));
+
+            t0 = _mm256_permute2f128_ps(r0, r4, 0x20);
+            t1 = _mm256_permute2f128_ps(r1, r5, 0x20);
+            t2 = _mm256_permute2f128_ps(r2, r6, 0x20);
+            t3 =
_mm256_permute2f128_ps(r3, r7, 0x20); + t4 = _mm256_permute2f128_ps(r0, r4, 0x31); + t5 = _mm256_permute2f128_ps(r1, r5, 0x31); + t6 = _mm256_permute2f128_ps(r2, r6, 0x31); + t7 = _mm256_permute2f128_ps(r3, r7, 0x31); + + switch (mm) { + case 8: _mm256_storeu_ps(&mbuf[k + 7*K], t7); + case 7: _mm256_storeu_ps(&mbuf[k + 6*K], t6); + case 6: _mm256_storeu_ps(&mbuf[k + 5*K], t5); + case 5: _mm256_storeu_ps(&mbuf[k + 4*K], t4); + case 4: _mm256_storeu_ps(&mbuf[k + 3*K], t3); + case 3: _mm256_storeu_ps(&mbuf[k + 2*K], t2); + case 2: _mm256_storeu_ps(&mbuf[k + 1*K], t1); + case 1: _mm256_storeu_ps(&mbuf[k + 0*K], t0); + } + } + for (; k < K; k++) { + for (int ii = 0; ii < mm; ii++) { + mbuf[k + ii*K] = A[i + lda*k + ii]; + } + } + int mi = 0; + for (; i < m4; i += 4, mi += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); STORE_REDUCE(2, 1); STORE_REDUCE(3, 1); + STORE_REDUCE(0, 2); STORE_REDUCE(1, 2); STORE_REDUCE(2, 2); STORE_REDUCE(3, 2); + STORE_REDUCE(0, 3); STORE_REDUCE(1, 3); STORE_REDUCE(2, 3); STORE_REDUCE(3, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + 
STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); STORE_REDUCE(2, 1); STORE_REDUCE(3, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0); + } + + } + for (; i < m2; i += 2, mi += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + STORE_REDUCE(0, 2); STORE_REDUCE(1, 2); + STORE_REDUCE(0, 3); STORE_REDUCE(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1, mi += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } 
+ int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + STORE_REDUCE(0, 2); + STORE_REDUCE(0, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + free(mbuf); + return; + } __m256 alpha_256 = _mm256_broadcastss_ps(_mm_load_ss(&alpha)); #if !defined(B0) __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); From 3d8c6d9607c82a999ad8661834d0d78605a5f321 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 11 May 2021 10:33:07 +0000 Subject: [PATCH 364/681] Small Matrix: skylakex: sgemm nn: clean up unused code --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 222 ------------------ 1 file changed, 222 deletions(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index ae4a9daa3..a5c530593 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -63,48 +63,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
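Background for the hunks below: for M <= 8 the previous patch repacks the leftover rows of A into a contiguous buffer (the AVX2 8x8 transpose above) and vectorizes along K instead of M, finishing each dot product with a horizontal reduction; that makes the old 256/128-bit and scalar column-wise fringe paths dead code, which this patch deletes. A self-contained sketch of the K-direction pattern, with illustrative names (dot_k, arow, bcol):

#include <immintrin.h>

/* One C element: dot product over K, 16 floats per step, masked remainder.
 * arow is a packed row of A (as produced by the mbuf transpose), bcol is a
 * column of B, both contiguous, so plain loads feed the FMA accumulator. */
static float dot_k(const float *arow, const float *bcol, long K)
{
    __m512 acc = _mm512_setzero_ps();
    long k = 0;
    for (; k + 16 <= K; k += 16)
        acc = _mm512_fmadd_ps(_mm512_loadu_ps(&arow[k]),
                              _mm512_loadu_ps(&bcol[k]), acc);
    long remains = K - k;
    if (remains) {  /* same (1UL << remains) - 1 mask trick as the M tails */
        __mmask16 mask = (__mmask16)((1UL << remains) - 1);
        acc = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(mask, &arow[k]),
                              _mm512_maskz_loadu_ps(mask, &bcol[k]), acc);
    }
    return _mm512_reduce_add_ps(acc);  /* horizontal sum of the 16 lanes */
}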
#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M]; #endif - - -#define DECLARE_RESULT_256(M, N) __m256 result##M##N = _mm256_setzero_ps() -#define LOAD_A_256(M, N) __m256 Aval##M = _mm256_loadu_ps(&A[lda * k + i + (M*8)]) -#define BROADCAST_LOAD_B_256(M, N) __m256 Bval##N = _mm256_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) -#define MATMUL_256(M, N) result##M##N = _mm256_fmadd_ps(Aval##M, Bval##N, result##M##N) -#if defined(B0) -#define STORE_256(M, N) result##M##N = _mm256_mul_ps(result##M##N, alpha_256); \ - _mm256_storeu_ps(&C[(j+N)*ldc + i + (M*8)], result##M##N) -#else -#define STORE_256(M, N) \ - BLASLONG offset##M##N = (j+N)*ldc + i + (M*8); \ - result##M##N = _mm256_mul_ps(result##M##N, alpha_256); \ - asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_256)); \ - _mm256_storeu_ps(&C[offset##M##N], result##M##N) -#endif - -#define DECLARE_RESULT_128(M, N) __m128 result##M##N; asm("vpxorq %0, %0, %0": "+v"(result##M##N):) -#define LOAD_A_128(M, N) __m128 Aval##M = _mm_maskz_loadu_ps(mask, &A[lda * k + i + (M*4)]) -#define BROADCAST_LOAD_B_128(M, N) __m128 Bval##N = _mm_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) -#define MATMUL_128(M, N) result##M##N = _mm_fmadd_ps(Aval##M, Bval##N, result##M##N) -#if defined(B0) -#define STORE_128(M, N) result##M##N = _mm_maskz_mul_ps(mask, result##M##N, alpha_128); \ - _mm_mask_storeu_ps(&C[(j+N)*ldc + i + (M*4)], mask, result##M##N) -#else -#define STORE_128(M, N) \ - BLASLONG offset##M##N = (j+N)*ldc + i + (M*4); \ - result##M##N = _mm_maskz_mul_ps(mask, result##M##N, alpha_128); \ - asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_128)); \ - _mm_mask_storeu_ps(&C[offset##M##N], mask, result##M##N) -#endif - -#define DECLARE_RESULT_S(M, N) float result##M##N = 0; -#define LOAD_A_S(M, N) float Aval##M = A[lda * k + i + M] -#define BROADCAST_LOAD_B_S(M, N) float Bval##N = B[k + ldb * (j+N)] -#define MATMUL_S(M, N) result##M##N += Aval##M * Bval##N -#if defined(B0) -#define STORE_S(M, N) C[(j+N)*ldc + i + M] = result##M##N * alpha -#else -#define STORE_S(M, N) C[(j+N)*ldc + i + M] = result##M##N * alpha + C[(j+N)*ldc + i + M] * beta -#endif - #if defined(B0) int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) #else @@ -594,184 +552,4 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp free(mbuf); return; } - __m256 alpha_256 = _mm256_broadcastss_ps(_mm_load_ss(&alpha)); -#if !defined(B0) - __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); -#endif - for (; i < m8; i += 8) { - for (j = 0; j < n4; j += 4) { - DECLARE_RESULT_256(0, 0); - DECLARE_RESULT_256(0, 1); - DECLARE_RESULT_256(0, 2); - DECLARE_RESULT_256(0, 3); - for (k = 0; k < K; k++) { - LOAD_A_256(0, x); - BROADCAST_LOAD_B_256(x, 0); BROADCAST_LOAD_B_256(x, 1); - BROADCAST_LOAD_B_256(x, 2); BROADCAST_LOAD_B_256(x, 3); - - MATMUL_256(0, 0); - MATMUL_256(0, 1); - MATMUL_256(0, 2); - MATMUL_256(0, 3); - } - STORE_256(0, 0); - STORE_256(0, 1); - STORE_256(0, 2); - STORE_256(0, 3); - } - for (; j < n2; j += 2) { - DECLARE_RESULT_256(0, 0); - DECLARE_RESULT_256(0, 1); - for (k = 0; k < K; k++) { - LOAD_A_256(0, x); - BROADCAST_LOAD_B_256(x, 0); BROADCAST_LOAD_B_256(x, 1); - MATMUL_256(0, 0); - MATMUL_256(0, 1); - } - STORE_256(0, 0); - STORE_256(0, 1); - } - for (; j < N; j++) { - DECLARE_RESULT_256(0, 0); 
- for (k = 0; k < K; k++) { - LOAD_A_256(0, x); - BROADCAST_LOAD_B_256(x, 0); - MATMUL_256(0, 0); - } - STORE_256(0, 0); - } - } - __m128 alpha_128 = _mm_broadcastss_ps(_mm_load_ss(&alpha)); -#if !defined(B0) - __m128 beta_128 = _mm_broadcastss_ps(_mm_load_ss(&beta)); -#endif - for (; i < m4; i += 4) { - for (j = 0; j < n4; j += 4) { - DECLARE_RESULT_128(0, 0); - DECLARE_RESULT_128(0, 1); - DECLARE_RESULT_128(0, 2); - DECLARE_RESULT_128(0, 3); - for (k = 0; k < K; k++) { - LOAD_A_128(0, x); - BROADCAST_LOAD_B_128(x, 0); BROADCAST_LOAD_B_128(x, 1); - BROADCAST_LOAD_B_128(x, 2); BROADCAST_LOAD_B_128(x, 3); - - MATMUL_128(0, 0); - MATMUL_128(0, 1); - MATMUL_128(0, 2); - MATMUL_128(0, 3); - } - STORE_128(0, 0); - STORE_128(0, 1); - STORE_128(0, 2); - STORE_128(0, 3); - } - for (; j < n2; j += 2) { - DECLARE_RESULT_128(0, 0); - DECLARE_RESULT_128(0, 1); - for (k = 0; k < K; k++) { - LOAD_A_128(0, x); - BROADCAST_LOAD_B_128(x, 0); BROADCAST_LOAD_B_128(x, 1); - MATMUL_128(0, 0); - MATMUL_128(0, 1); - } - STORE_128(0, 0); - STORE_128(0, 1); - } - for (; j < N; j++) { - DECLARE_RESULT_128(0, 0); - for (k = 0; k < K; k++) { - LOAD_A_128(0, x); - BROADCAST_LOAD_B_128(x, 0); - MATMUL_128(0, 0); - } - STORE_128(0, 0); - } - } - for (; i < m2; i += 2) { - for (j = 0; j < n4; j += 4) { - DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); - DECLARE_RESULT_S(0, 1); DECLARE_RESULT_S(1, 1); - DECLARE_RESULT_S(0, 2); DECLARE_RESULT_S(1, 2); - DECLARE_RESULT_S(0, 3); DECLARE_RESULT_S(1, 3); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); LOAD_A_S(1, x); - BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); - BROADCAST_LOAD_B_S(x, 2); BROADCAST_LOAD_B_S(x, 3); - - MATMUL_S(0, 0); MATMUL_S(1, 0); - MATMUL_S(0, 1); MATMUL_S(1, 1); - MATMUL_S(0, 2); MATMUL_S(1, 2); - MATMUL_S(0, 3); MATMUL_S(1, 3); - } - STORE_S(0, 0); STORE_S(1, 0); - STORE_S(0, 1); STORE_S(1, 1); - STORE_S(0, 2); STORE_S(1, 2); - STORE_S(0, 3); STORE_S(1, 3); - } - for (; j < n2; j += 2) { - DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); - DECLARE_RESULT_S(0, 1); DECLARE_RESULT_S(1, 1); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); LOAD_A_S(1, x); - BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); - MATMUL_S(0, 0); MATMUL_S(1, 0); - MATMUL_S(0, 1); MATMUL_S(1, 1); - } - STORE_S(0, 0); STORE_S(1, 0); - STORE_S(0, 1); STORE_S(1, 1); - } - for (; j < N; j++) { - DECLARE_RESULT_S(0, 0); DECLARE_RESULT_S(1, 0); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); LOAD_A_S(1, x); - BROADCAST_LOAD_B_S(x, 0); - MATMUL_S(0, 0); MATMUL_S(1, 0); - } - STORE_S(0, 0); STORE_S(1, 0); - } - } - for (; i < M; i += 1) { - for (j = 0; j < n4; j += 4) { - DECLARE_RESULT_S(0, 0); - DECLARE_RESULT_S(0, 1); - DECLARE_RESULT_S(0, 2); - DECLARE_RESULT_S(0, 3); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); - BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); - BROADCAST_LOAD_B_S(x, 2); BROADCAST_LOAD_B_S(x, 3); - - MATMUL_S(0, 0); - MATMUL_S(0, 1); - MATMUL_S(0, 2); - MATMUL_S(0, 3); - } - STORE_S(0, 0); - STORE_S(0, 1); - STORE_S(0, 2); - STORE_S(0, 3); - } - for (; j < n2; j += 2) { - DECLARE_RESULT_S(0, 0); - DECLARE_RESULT_S(0, 1); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); - BROADCAST_LOAD_B_S(x, 0); BROADCAST_LOAD_B_S(x, 1); - MATMUL_S(0, 0); - MATMUL_S(0, 1); - } - STORE_S(0, 0); - STORE_S(0, 1); - } - for (; j < N; j++) { - DECLARE_RESULT_S(0, 0); - for (k = 0; k < K; k++) { - LOAD_A_S(0, x); LOAD_A_S(1, x); - BROADCAST_LOAD_B_S(x, 0); - MATMUL_S(0, 0); - } - STORE_S(0, 0); - } - } } From 13b32f69b78b15e7d95978011ea6c2bb3d9e3642 Mon Sep 17 00:00:00 2001 From: 
Wangyang Guo
Date: Wed, 12 May 2021 17:08:18 +0000
Subject: [PATCH 365/681] Small Matrix: skylakex: sgemm nn: reduce store 4 M at a time

---
 .../x86_64/sgemm_small_kernel_nn_skylakex.c   | 64 ++++++++++++++-----
 1 file changed, 47 insertions(+), 17 deletions(-)

diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
index a5c530593..be9f085c0 100644
--- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
+++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
@@ -57,10 +57,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define LOAD_KB_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[(j + N)*ldb + k])
 #define MASK_LOAD_KA_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &mbuf[(mi + M)*K + k])
 #define MASK_LOAD_KB_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[(j + N)*ldb + k])
+#define REDUCE_M4(N) \
+	__m512 r0, r1, r2, r3, t0, t1, t2, t3;\
+	r0 = _mm512_unpacklo_ps(result0##N, result1##N); r1 = _mm512_unpackhi_ps(result0##N, result1##N); \
+	r2 = _mm512_unpacklo_ps(result2##N, result3##N); r3 = _mm512_unpackhi_ps(result2##N, result3##N); \
+	t0 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0)); t1 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2)); \
+	t2 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0)); t3 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2)); \
+	r0 = _mm512_add_ps(t0, t1); r1 = _mm512_add_ps(t2, t3); t0 = _mm512_add_ps(r0, r1); \
+	__m128 s0, s1, s2, s3; \
+	s0 = _mm512_extractf32x4_ps(t0, 0); s1 = _mm512_extractf32x4_ps(t0, 1); s2 = _mm512_extractf32x4_ps(t0, 2); s3 = _mm512_extractf32x4_ps(t0, 3); \
+	s0 = _mm_maskz_add_ps(mask8, s0, s1); s2 = _mm_maskz_add_ps(mask8, s2, s3); s0 = _mm_maskz_add_ps(mask8, s0, s2); \
+	s0 = _mm_maskz_mul_ps(mask8, alpha_128, s0);
 #if defined(B0)
 #define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N);
+#define STORE_REDUCE_M4(N) {\
+	REDUCE_M4(N) \
+	_mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); \
+}
 #else
 #define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M];
+#define STORE_REDUCE_M4(N) {\
+	REDUCE_M4(N) \
+	asm("vfmadd231ps (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_128)); \
+	_mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); \
+}
 #endif
 
 #if defined(B0)
@@ -75,14 +95,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
 	BLASLONG m64 = M & ~63;
 	BLASLONG m32 = M & ~31;
 	BLASLONG m16 = M & ~15;
-	BLASLONG m8 = M & ~7;
 	BLASLONG m4 = M & ~3;
 	BLASLONG m2 = M & ~1;
 
 	BLASLONG n4 = N & ~3;
 	BLASLONG n2 = N & ~1;
 
-	__mmask8 mask = 0xff; // just use to avoid SSE instruction
 
 	__m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha));
 #if !defined(B0)
@@ -220,8 +238,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
 			STORE_512(0, 0);
 		}
 	}
-	if (M - i > 8) {
-		register __mmask16 mask asm("k1") = (1UL << (M - i)) - 1;
+	int mm = M - i;
+	if (!mm) return 0;
+	if (mm > 8 || K < 32) {
+		register __mmask16 mask asm("k1") = (1UL << mm) - 1;
 		for (j = 0; j < n4; j += 4) {
 			DECLARE_RESULT_512(0, 0);
 			DECLARE_RESULT_512(0, 1);
@@ -263,10 +283,20 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
 			}
 			MASK_STORE_512(0, 0);
 		}
-		return;
-	}
-	int mm = M - i;
-	if (mm) {
+	} else {
+		/* M => [1, 8]
+		 *
+		 * This kernel uses a dot-product style to compute each value C(x, y):
+		 * C(x, y) = A(x, 0)*B(0, y) + A(x, 1)*B(1, y) + ... + A(x, K-1)*B(K-1, y)
+		 *
+		 * Allocate a buffer and copy the rest of A into it in row-major order,
+		 * so that memory access from 0 to K is contiguous for both A and B.
+		 *
+		 * Load and FMA 16 elements of k per loop iteration, and finally
+		 * reduce_add the zmm accumulator to a single float result in C(x, y).
+		 *
+		 * Note: performance is poor when K is small.
+		 */
 		FLOAT *mbuf = (FLOAT *) malloc(sizeof(FLOAT)*mm*K);
 		__mmask8 mask8 = (1UL << mm) - 1;
 		__mmask16 mask;
@@ -328,6 +358,11 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
 		}
 	}
 	int mi = 0;
+	mask8 = 0xff; // just used to avoid SSE instructions
+	__m128 alpha_128 = _mm_broadcast_ss(&alpha);
+#if !defined(B0)
+	__m128 beta_128 = _mm_broadcast_ss(&beta);
+#endif
 	for (; i < m4; i += 4, mi += 4) {
 		for (j = 0; j < n4; j += 4) {
 			DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
@@ -354,10 +389,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
 			MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2);
 			MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3);
 		}
-		STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0);
-		STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); STORE_REDUCE(2, 1); STORE_REDUCE(3, 1);
-		STORE_REDUCE(0, 2); STORE_REDUCE(1, 2); STORE_REDUCE(2, 2); STORE_REDUCE(3, 2);
-		STORE_REDUCE(0, 3); STORE_REDUCE(1, 3); STORE_REDUCE(2, 3); STORE_REDUCE(3, 3);
+		STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3);
 	}
 	for (; j < n2; j += 2) {
 		DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
@@ -378,9 +410,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
 			MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0);
 			MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1);
 		}
-		STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0);
-		STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); STORE_REDUCE(2, 1); STORE_REDUCE(3, 1);
-
+		STORE_REDUCE_M4(0); STORE_REDUCE_M4(1);
 	}
 	for (; j < N; j += 1) {
 		DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
@@ -398,7 +428,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
 			MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0);
 		}
-		STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); STORE_REDUCE(2, 0); STORE_REDUCE(3, 0);
+		STORE_REDUCE_M4(0);
 	}
 
 	}
@@ -550,6 +580,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp
 		}
 	}
 	free(mbuf);
-	return;
 }
+	return 0;
 }

From 4c9d9940fdd6a458289a02e850afd65d5b9689ba Mon Sep 17 00:00:00 2001
From: Wangyang Guo
Date: Thu, 13 May 2021 09:41:51 +0000
Subject: [PATCH 366/681] Small Matrix: skylakex: sgemm nn: reduce store 4 N at a time

---
 .../x86_64/sgemm_small_kernel_nn_skylakex.c   | 29 ++++++++++++-------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
index be9f085c0..c9f43f9a2 100644
--- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
+++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
@@ -57,10 +57,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LOAD_KB_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[(j + N)*ldb + k]) #define MASK_LOAD_KA_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &mbuf[(mi + M)*K + k]) #define MASK_LOAD_KB_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[(j + N)*ldb + k]) -#define REDUCE_M4(N) \ +#define REDUCE_4(rr0, rr1, rr2, rr3) \ __m512 r0, r1, r2, r3, t0, t1, t2, t3;\ - r0 = _mm512_unpacklo_ps(result0##N, result1##N); r1 = _mm512_unpackhi_ps(result0##N, result1##N); \ - r2 = _mm512_unpacklo_ps(result2##N, result3##N); r3 = _mm512_unpackhi_ps(result2##N, result3##N); \ + r0 = _mm512_unpacklo_ps(rr0, rr1); r1 = _mm512_unpackhi_ps(rr0, rr1); \ + r2 = _mm512_unpacklo_ps(rr2, rr3); r3 = _mm512_unpackhi_ps(rr2, rr3); \ t0 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0)); t1 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2)); \ t2 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0)); t3 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2)); \ r0 = _mm512_add_ps(t0, t1); r1 = _mm512_add_ps(t2, t3); t0 = _mm512_add_ps(r0, r1); \ @@ -68,12 +68,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. s0 = _mm512_extractf32x4_ps(t0, 0); s1 = _mm512_extractf32x4_ps(t0, 1); s2 = _mm512_extractf32x4_ps(t0, 2); s3 = _mm512_extractf32x4_ps(t0, 3); \ s0 = _mm_maskz_add_ps(mask8, s0, s1); s2 = _mm_maskz_add_ps(mask8, s2, s3); s0 = _mm_maskz_add_ps(mask8, s0, s2); \ s0 = _mm_maskz_mul_ps(mask8, alpha_128, s0); +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) #if defined(B0) #define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N); #define STORE_REDUCE_M4(N) {\ REDUCE_M4(N) \ _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); \ } +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); \ +} #else #define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M]; #define STORE_REDUCE_M4(N) {\ @@ -81,6 +87,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
asm("vfmadd231ps (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_128)); \ _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); \ } +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + s1 = _mm_i32gather_ps(&C[j*ldc + i + M], vindex_n, 4); \ + s0 = _mm_fmadd_ps(s1, beta_128, s0); \ + _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); \ +} #endif #if defined(B0) @@ -363,6 +375,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp #if !defined(B0) __m128 beta_128 = _mm_broadcast_ss(&beta); #endif + __m128i vindex_n = _mm_set_epi32(ldc*3, ldc*2, ldc, 0); for (; i < m4; i += 4, mi += 4) { for (j = 0; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); @@ -458,10 +471,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(0, 3); MATMUL_512(1, 3); } - STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); - STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); - STORE_REDUCE(0, 2); STORE_REDUCE(1, 2); - STORE_REDUCE(0, 3); STORE_REDUCE(1, 3); + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); } for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); @@ -532,10 +542,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp MATMUL_512(0, 2); MATMUL_512(0, 3); } - STORE_REDUCE(0, 0); - STORE_REDUCE(0, 1); - STORE_REDUCE(0, 2); - STORE_REDUCE(0, 3); + STORE_REDUCE_N4(0); } for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); From a87736346fd3988618c0d8895827566fce5a5487 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 13 May 2021 10:16:54 +0000 Subject: [PATCH 367/681] Small Matrix: skylakex: sgemm nn: add n6 to improve performance --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 90 ++++++++++++++++++- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index c9f43f9a2..a67541161 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -110,6 +110,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp BLASLONG m4 = M & ~3; BLASLONG m2 = M & ~1; + BLASLONG n6 = N - (N % 6); BLASLONG n4 = N & ~3; BLASLONG n2 = N & ~1; @@ -165,7 +166,34 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } } for (; i < m32; i += 32) { - for (j = 0; j < n4; j += 4) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + 
} + for (;j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); @@ -208,7 +236,34 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } } for (; i < m16; i += 16) { - for (j = 0; j < n4; j += 4) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + } + for (; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(0, 2); @@ -228,6 +283,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp STORE_512(0, 2); STORE_512(0, 3); } + for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); @@ -254,26 +310,54 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp if (!mm) return 0; if (mm > 8 || K < 32) { register __mmask16 mask asm("k1") = (1UL << mm) - 1; - for (j = 0; j < n4; j += 4) { + for (j = 0; j < n6; j += 6) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); for (k = 0; k < K; k++) { MASK_LOAD_A_512(0, x); BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); MATMUL_512(0, 0); MATMUL_512(0, 1); MATMUL_512(0, 2); MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); } MASK_STORE_512(0, 0); MASK_STORE_512(0, 1); MASK_STORE_512(0, 2); MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + } + for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); From 9967e61abb3ba0b87a043662382c515ed9d220bb Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 19 May 2021 10:50:03 +0000 Subject: [PATCH 368/681] Small Matrix: skylakex: sgemm nn: fix error when beta not zero --- kernel/x86_64/sgemm_small_kernel_nn_skylakex.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index a67541161..99856d0af 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -42,15 +42,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
_mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) #else #define STORE_512(M, N) \ - BLASLONG offset##M##N = (j+N)*ldc + i + (M*16); \ result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ - asm("vfmadd231ps (%1, %2, 4), %3, %0": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_512)); \ - _mm512_storeu_ps(&C[offset##M##N], result##M##N) + asm("vfmadd231ps (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512)); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) #define MASK_STORE_512(M, N) \ - BLASLONG offset##M##N = (j+N)*ldc + i + (M*16); \ result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ - asm("vfmadd231ps (%1, %2, 4), %3, %0 %{%4%}": "+v"(result##M##N):"r"(&C), "r"(offset##M##N), "v"(beta_512), "k"(mask)); \ - _mm512_mask_storeu_ps(&C[offset##M##N], mask, result##M##N) + asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) #endif #define LOAD_KA_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&mbuf[(mi + M)*K + k]); From ca7682e3a3dceeb52ba1ad554f384388ffb24c9a Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 20 May 2021 11:24:31 +0000 Subject: [PATCH 369/681] Small Matrix: skylakex: sgemm nn: fix n6 conflicts with n4 --- .../x86_64/sgemm_small_kernel_nn_skylakex.c | 62 ------------------- 1 file changed, 62 deletions(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index 99856d0af..9bc7a7c58 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -191,26 +191,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp STORE_512(0, 4); STORE_512(1, 4); STORE_512(0, 5); STORE_512(1, 5); } - for (;j < n4; j += 4) { - DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); - DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); - DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); - DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); - for (k = 0; k < K; k++) { - LOAD_A_512(0, x); LOAD_A_512(1, x); - BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); - BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); - - MATMUL_512(0, 0); MATMUL_512(1, 0); - MATMUL_512(0, 1); MATMUL_512(1, 1); - MATMUL_512(0, 2); MATMUL_512(1, 2); - MATMUL_512(0, 3); MATMUL_512(1, 3); - } - STORE_512(0, 0); STORE_512(1, 0); - STORE_512(0, 1); STORE_512(1, 1); - STORE_512(0, 2); STORE_512(1, 2); - STORE_512(0, 3); STORE_512(1, 3); - } for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); @@ -261,27 +241,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp STORE_512(0, 4); STORE_512(0, 5); } - for (; j < n4; j += 4) { - DECLARE_RESULT_512(0, 0); - DECLARE_RESULT_512(0, 1); - DECLARE_RESULT_512(0, 2); - DECLARE_RESULT_512(0, 3); - for (k = 0; k < K; k++) { - LOAD_A_512(0, x); - BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); - BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); - - MATMUL_512(0, 0); - MATMUL_512(0, 1); - MATMUL_512(0, 2); - MATMUL_512(0, 3); - } - STORE_512(0, 0); - STORE_512(0, 1); - STORE_512(0, 2); - STORE_512(0, 3); - } - for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); @@ -335,27 +294,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp MASK_STORE_512(0, 4); 
MASK_STORE_512(0, 5); } - for (; j < n4; j += 4) { - DECLARE_RESULT_512(0, 0); - DECLARE_RESULT_512(0, 1); - DECLARE_RESULT_512(0, 2); - DECLARE_RESULT_512(0, 3); - for (k = 0; k < K; k++) { - MASK_LOAD_A_512(0, x); - BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); - BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); - - MATMUL_512(0, 0); - MATMUL_512(0, 1); - MATMUL_512(0, 2); - MATMUL_512(0, 3); - } - MASK_STORE_512(0, 0); - MASK_STORE_512(0, 1); - MASK_STORE_512(0, 2); - MASK_STORE_512(0, 3); - } - for (; j < n2; j += 2) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(0, 1); From 0d72d75bf9455c91b6f0c4ecf5b7555845dccf6f Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 20 May 2021 11:47:10 +0000 Subject: [PATCH 370/681] Small Matrix: skylakex: add sgemm nt kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../sgemm_small_kernel_b0_nt_skylakex.c | 2 + .../x86_64/sgemm_small_kernel_nt_skylakex.c | 366 ++++++++++++++++++ 3 files changed, 370 insertions(+) create mode 100644 kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c create mode 100644 kernel/x86_64/sgemm_small_kernel_nt_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 1a2e67b52..d3560bf80 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -12,6 +12,8 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_skylakex.c SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_b0_nn_skylakex.c +SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_skylakex.c +SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_b0_nt_skylakex.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c diff --git a/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c new file mode 100644 index 000000000..6d7934be1 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sgemm_small_kernel_nt_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c new file mode 100644 index 000000000..3fc842669 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c @@ -0,0 +1,366 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <immintrin.h>
+#include "common.h"
+#include <stdio.h>
+#include <memory.h>
+
+#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps()
+#define LOAD_A_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[lda * k + i + (M*16)])
+#define MASK_LOAD_A_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[lda * k + i + (M*16)])
+#define BROADCAST_LOAD_B_512(M, N) __m512 Bval##N = _mm512_broadcastss_ps(_mm_load_ss(&B[ldb * k + j + N]))
+#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N)
+#if defined(B0)
+#define STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \
+	_mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N)
+#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \
+	_mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N)
+#else
+#define STORE_512(M, N) \
+	result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \
+	asm("vfmadd231ps (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512)); \
+	_mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N)
+#define MASK_STORE_512(M, N) \
+	result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \
+	asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \
+	_mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N)
+#endif
+
+#if defined(B0)
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
+#else
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
+#endif
+{
+	// column major
+	BLASLONG i, j, k;
+
+	BLASLONG m64 = M & ~63;
+	BLASLONG m32 = M & ~31;
+	BLASLONG m16 = M & ~15;
+	BLASLONG m4 = M & ~3;
+	BLASLONG m2 = M & ~1;
+
+	BLASLONG n8 = N & ~7;
+	BLASLONG n6 = N - (N % 6);
+	BLASLONG n4 = N & ~3;
+	BLASLONG n2 = N & ~1;
+
+
+	__m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha));
+#if !defined(B0)
+	__m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta));
+#endif
+
+	for (i = 0; i < m64; i += 64) {
+		for (j = 0; j < n6; j += 6) {
+			DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
+			DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1);
+			DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2);
+			DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3);
+			DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); DECLARE_RESULT_512(2, 4); DECLARE_RESULT_512(3, 4);
+			DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); DECLARE_RESULT_512(2, 5); DECLARE_RESULT_512(3, 5);
+
+			for (k = 0; k < K; k++) {
+				LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x);
+
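+				/* Descriptive note (editorial): B elements are broadcast in
+				   pairs and consumed by their FMAs right away, so only a
+				   couple of Bval registers need to stay live next to the 24
+				   accumulators and 4 A vectors of this 4x6 tile. */
+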
BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + MATMUL_512(0, 4); MATMUL_512(1, 4); MATMUL_512(2, 4); MATMUL_512(3, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); MATMUL_512(2, 5); MATMUL_512(3, 5); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + STORE_512(0, 4); STORE_512(1, 4); STORE_512(2, 4); STORE_512(3, 4); + STORE_512(0, 5); STORE_512(1, 5); STORE_512(2, 5); STORE_512(3, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m32; i += 32) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + DECLARE_RESULT_512(0, 6); DECLARE_RESULT_512(1, 6); + DECLARE_RESULT_512(0, 7); DECLARE_RESULT_512(1, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + MATMUL_512(0, 6); MATMUL_512(1, 6); + MATMUL_512(0, 7); MATMUL_512(1, 7); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + STORE_512(0, 6); STORE_512(1, 6); + STORE_512(0, 7); STORE_512(1, 7); + } + for 
(;j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + STORE_512(0, 6); + STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + int mm = M - i; + if (mm > 0) { + register __mmask16 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 
7); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + MASK_STORE_512(0, 6); + MASK_STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + } +} From ae3f5c737c24e6fdb7de4559969bee5631aa1683 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 21 May 2021 13:31:31 +0000 Subject: [PATCH 371/681] Small Matrix: skylakex: sgemm nt: optimize for M < 12 --- .../x86_64/sgemm_small_kernel_nt_skylakex.c | 171 +++++++++++++++++- 1 file changed, 170 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c index 3fc842669..f293bf9f9 100644 --- a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c @@ -35,11 +35,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define MASK_LOAD_A_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[lda * k + i + (M*16)]) #define BROADCAST_LOAD_B_512(M, N) __m512 Bval##N = _mm512_broadcastss_ps(_mm_load_ss(&B[ldb * k + j + N])) #define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) + +#define BROADCAST_LOAD_A_512(M, N) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[lda * k + i + M])) +#define LOAD_B_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[ldb * k + j + (N*16)]) +#define MASK_LOAD_B_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[ldb * k + j + (N*16)]) #if defined(B0) #define STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) #define MASK_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4) #else #define STORE_512(M, N) \ result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ @@ -49,6 +57,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \ _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_i32gather_ps(vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask, vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); #endif #if defined(B0) @@ -66,6 +82,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp BLASLONG m4 = M & ~3; BLASLONG m2 = M & ~1; + BLASLONG n64 = N & ~63; + BLASLONG n32 = N & ~31; BLASLONG n8 = N & ~7; BLASLONG n6 = N - (N % 6); BLASLONG n4 = N & ~3; @@ -284,7 +302,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } } int mm = M - i; - if (mm > 0) { + if (mm >= 12) { register __mmask16 mask asm("k1") = (1UL << mm) - 1; for (j = 0; j < n8; j += 8) { DECLARE_RESULT_512(0, 0); @@ -362,5 +380,156 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } MASK_STORE_512(0, 0); } + } else if (mm > 0) { + int index_n[16]; + for (int ii = 0; ii < 16; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_epi32(index_n); + for (; i < m4; i += 4) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); 
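+			/* Descriptive note (editorial): in this small-M path the roles
+			   flip, A elements are broadcast while full zmm rows of B are
+			   loaded, so each accumulator holds 16 consecutive columns of one
+			   row of C; C is column major, so those elements sit ldc apart
+			   and are written back with a scatter through vindex_n. */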
DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); SCATTER_STORE_512(2, 2); SCATTER_STORE_512(3, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); SCATTER_STORE_512(2, 3); SCATTER_STORE_512(3, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + } + __mmask16 mask = 0xffff; + for (; j < N; j += 16) { + int remains = N - j; + if (remains < 16) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); MASK_SCATTER_STORE_512(2, 0); MASK_SCATTER_STORE_512(3, 0); + } + } + for (; i < m2; i += 2) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 
1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask16 mask = 0xffff; + for (; j < N; j += 16) { + int remains = N - j; + if (remains < 16) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask16 mask = 0xffff; + for (; j < N; j += 16) { + int remains = N - j; + if (remains < 16) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } } + return 0; } From 642c3938790b45606dea7450a6fbc23b6c9b9b9c Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 26 May 2021 16:30:57 +0000 Subject: [PATCH 372/681] Small Matrix: skylakex: add sgemm tn kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../sgemm_small_kernel_b0_tn_skylakex.c | 2 + .../x86_64/sgemm_small_kernel_tn_skylakex.c | 316 ++++++++++++++++++ 3 files changed, 320 insertions(+) create mode 100644 kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c create mode 100644 kernel/x86_64/sgemm_small_kernel_tn_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index d3560bf80..5e0d9e5b4 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -14,6 +14,8 @@ SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_skylakex.c SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_b0_nn_skylakex.c SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_skylakex.c SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_b0_nt_skylakex.c +SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_skylakex.c +SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_b0_tn_skylakex.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c diff --git a/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c new file mode 100644 index 000000000..0f9745b72 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sgemm_small_kernel_tn_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c new file mode 100644 index 000000000..5a9a4ea32 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c @@ -0,0 +1,316 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <immintrin.h>
+#include "common.h"
+#include <stdio.h>
+#include <memory.h>
+
+#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps()
+#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N)
+
+#define LOAD_KA_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[(i + M)*lda + k]);
+#define LOAD_KB_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[(j + N)*ldb + k])
+#define MASK_LOAD_KA_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[(i + M)*lda + k])
+#define MASK_LOAD_KB_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[(j + N)*ldb + k])
+
+#define REDUCE_4(rr0, rr1, rr2, rr3) \
+	__m512 r0, r1, r2, r3, t0, t1, t2, t3;\
+	r0 = _mm512_unpacklo_ps(rr0, rr1); r1 = _mm512_unpackhi_ps(rr0, rr1); \
+	r2 = _mm512_unpacklo_ps(rr2, rr3); r3 = _mm512_unpackhi_ps(rr2, rr3); \
+	t0 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0)); t1 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2)); \
+	t2 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0)); t3 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2)); \
+	r0 = _mm512_add_ps(t0, t1); r1 = _mm512_add_ps(t2, t3); t0 = _mm512_add_ps(r0, r1); \
+	__m128 s0, s1, s2, s3; \
+	s0 = _mm512_extractf32x4_ps(t0, 0); s1 = _mm512_extractf32x4_ps(t0, 1); s2 = _mm512_extractf32x4_ps(t0, 2); s3 = _mm512_extractf32x4_ps(t0, 3); \
+	s0 = _mm_maskz_add_ps(mask8, s0, s1); s2 = _mm_maskz_add_ps(mask8, s2, s3); s0 = _mm_maskz_add_ps(mask8, s0, s2); \
+	s0 = _mm_maskz_mul_ps(mask8, alpha_128, s0);
+
+#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N)
+#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3)
+
+#if defined(B0)
+#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N)
+#define STORE_M4(N, s0) _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0);
+#define STORE_N4(M, s0) _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4);
+#else
+#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta *
C[(j+N)*ldc + i + M] +#define STORE_M4(N, s0) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_128)); \ + _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); + +#define STORE_N4(M, s0) \ + s0 = _mm_fmadd_ps(_mm_i32gather_ps(&C[j*ldc + i + M], vindex_n, 4), beta_128, s0); \ + _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); +#endif +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + STORE_M4(N, s0) \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + STORE_N4(M, s0) \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k16 = K & ~15; + + __mmask16 mask; + __mmask8 mask8 = 0xff; // just use to avoid SSE instruction + + __m128i vindex_n = _mm_set_epi32(ldc*3, ldc*2, ldc, 0); + __m128 alpha_128 = _mm_broadcast_ss(&alpha); +#if !defined(B0) + __m128 beta_128 = _mm_broadcast_ss(&beta); +#endif + for (i = 0; i < m4; i += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); 
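/* K tail of the 4x2 block: the unrolled loop above consumed k16 = K & ~15
   elements, so remains = K - k is in [1, 15]. The mask enables only the low
   'remains' lanes and _mm512_maskz_loadu_ps zeroes the rest, so the final
   round of FMAs adds 0 * 0 in the unused lanes and the dot products stay
   exact without a scalar cleanup loop. */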
MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = 
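/* (1UL << remains) - 1 sets the low 'remains' bits, e.g. remains = 3 gives
   0b0111, selecting elements k, k+1, k+2 only. remains is at most 15 here,
   so the shift cannot overflow the 16-bit lane mask. */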
(1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + return 0; +} From 5dc7c3c8e572c1760cd9aba40dde1db54bb3f2e3 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 27 May 2021 11:03:56 +0000 Subject: [PATCH 373/681] Small Matrix: add GEMM_SMALL_MATRIX_PERMIT to tune small matrics case --- common_c.h | 2 ++ common_d.h | 1 + common_level3.h | 8 +++++ common_macro.h | 10 ++++++ common_s.h | 2 ++ common_z.h | 2 ++ interface/gemm.c | 9 +++--- kernel/Makefile.L3 | 31 ++++++++++++++++++ kernel/generic/gemm_small_matrix_permit.c | 37 ++++++++++++++++++++++ kernel/generic/zgemm_small_matrix_permit.c | 37 ++++++++++++++++++++++ 10 files changed, 135 insertions(+), 4 deletions(-) create mode 100644 kernel/generic/gemm_small_matrix_permit.c create mode 100644 kernel/generic/zgemm_small_matrix_permit.c diff --git a/common_c.h b/common_c.h index 9388ece93..dc273eef0 100644 --- a/common_c.h +++ b/common_c.h @@ -232,6 +232,8 @@ #define CGEADD_K cgeadd_k +#define CGEMM_SMALL_MATRIX_PERMIT cgemm_small_matrix_permit + #define CGEMM_SMALL_KERNEL_NN cgemm_small_kernel_nn #define CGEMM_SMALL_KERNEL_NT cgemm_small_kernel_nt #define CGEMM_SMALL_KERNEL_NR cgemm_small_kernel_nr diff --git a/common_d.h b/common_d.h index 42c14e828..bb85f1232 100644 --- a/common_d.h +++ b/common_d.h @@ -157,6 +157,7 @@ #define DIMATCOPY_K_RT dimatcopy_k_rt #define DGEADD_K dgeadd_k +#define DGEMM_SMALL_MATRIX_PERMIT dgemm_small_matrix_permit #define DGEMM_SMALL_KERNEL_NN dgemm_small_kernel_nn #define DGEMM_SMALL_KERNEL_NT dgemm_small_kernel_nt diff --git a/common_level3.h b/common_level3.h index a3a487dab..187402a9a 100644 --- a/common_level3.h +++ b/common_level3.h @@ -516,11 +516,15 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xd #endif #ifdef SMALL_MATRIX_OPT +int sgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); int sgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); int sgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); int sgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, 
BLASLONG ldc); +int dgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta); + int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); @@ -536,6 +540,8 @@ int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLA int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int cgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1); + int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); @@ -556,6 +562,8 @@ int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLON int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int zgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1); + int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); diff --git a/common_macro.h b/common_macro.h index 2cccf9b39..aeb9a205b 100644 --- a/common_macro.h +++ b/common_macro.h @@ -644,6 +644,8 @@ #define GEADD_K DGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT DGEMM_SMALL_MATRIX_PERMIT + #define GEMM_SMALL_KERNEL_NN DGEMM_SMALL_KERNEL_NN #define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN @@ -940,6 +942,8 @@ #define GEADD_K SGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT SGEMM_SMALL_MATRIX_PERMIT + #define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN 
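/* As elsewhere in this header, the unprefixed GEMM_SMALL_* names let
   interface/gemm.c be written once and compiled per precision: building
   with -UDOUBLE -UCOMPLEX resolves them to the sgemm_small_* symbols,
   -DDOUBLE to dgemm_small_*, and the COMPLEX sections below map the
   cgemm/zgemm variants the same way. */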
#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN @@ -1256,6 +1260,8 @@ #define GEADD_K SGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT SGEMM_SMALL_MATRIX_PERMIT + #define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN #define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN @@ -2093,6 +2099,8 @@ #define GEADD_K ZGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT ZGEMM_SMALL_MATRIX_PERMIT + #define GEMM_SMALL_KERNEL_NN ZGEMM_SMALL_KERNEL_NN #define GEMM_SMALL_KERNEL_NT ZGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_NR ZGEMM_SMALL_KERNEL_NR @@ -2556,6 +2564,8 @@ #define GEADD_K CGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT CGEMM_SMALL_MATRIX_PERMIT + #define GEMM_SMALL_KERNEL_NN CGEMM_SMALL_KERNEL_NN #define GEMM_SMALL_KERNEL_NT CGEMM_SMALL_KERNEL_NT #define GEMM_SMALL_KERNEL_NR CGEMM_SMALL_KERNEL_NR diff --git a/common_s.h b/common_s.h index 685d73062..5851014cf 100644 --- a/common_s.h +++ b/common_s.h @@ -164,6 +164,8 @@ #define SGEADD_K sgeadd_k +#define SGEMM_SMALL_MATRIX_PERMIT sgemm_small_matrix_permit + #define SGEMM_SMALL_KERNEL_NN sgemm_small_kernel_nn #define SGEMM_SMALL_KERNEL_NT sgemm_small_kernel_nt #define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn diff --git a/common_z.h b/common_z.h index 8594ec74d..6088260a1 100644 --- a/common_z.h +++ b/common_z.h @@ -232,6 +232,8 @@ #define ZGEADD_K zgeadd_k +#define ZGEMM_SMALL_MATRIX_PERMIT zgemm_small_matrix_permit + #define ZGEMM_SMALL_KERNEL_NN zgemm_small_kernel_nn #define ZGEMM_SMALL_KERNEL_NT zgemm_small_kernel_nt #define ZGEMM_SMALL_KERNEL_NR zgemm_small_kernel_nr diff --git a/interface/gemm.c b/interface/gemm.c index 7251993ee..ad8780668 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -464,25 +464,26 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif #ifdef SMALL_MATRIX_OPT - //need to tune small matrices cases. 
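/* The hard-coded cutoff below moves behind a per-arch predicate:
   GEMM_SMALL_MATRIX_PERMIT(transa, transb, m, n, k, alpha, beta) returns
   nonzero when the small-matrix path should be taken (alpha and beta are
   split into real and imaginary parts for the complex variants). The
   generic implementation keeps the MNK <= 100^3 heuristic, while targets
   such as SKYLAKEX can veto shapes, e.g. transposed A with small K, that
   the fixed threshold would accept. */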
- if(MNK <= 100.0*100.0*100.0){ - #if !defined(COMPLEX) + if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){ if(*(FLOAT *)(args.beta) == 0.0){ (gemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); }else{ (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); } + return; + } #else + if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, alpha[0], alpha[1], beta[0], beta[1])){ if(beta[0] == 0.0 && beta[1] == 0.0){ (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc); }else{ (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc); } -#endif return; } #endif +#endif buffer = (XFLOAT *)blas_memory_alloc(0); diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 1c4a00158..f977793a0 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -451,18 +451,21 @@ endif ifeq ($(SMALL_MATRIX_OPT), 1) SBLASOBJS += \ + sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ + dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ + cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ @@ -481,6 +484,7 @@ CBLASOBJS += \ cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ + zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ @@ -4294,6 +4298,10 @@ $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) ###### BLAS small matrix optimization ##### +ifndef DGEMM_SMALL_M_PERMIT +DGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif + ifndef DGEMM_SMALL_K_NN DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c endif @@ -4310,6 +4318,9 @@ ifndef DGEMM_SMALL_K_TT DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c endif +$(KDIR)dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(DGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -4350,6 +4361,9 @@ $(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL $(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +ifndef SGEMM_SMALL_M_PERMIT +SGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif ifndef SGEMM_SMALL_K_NN SGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c @@ -4367,6 +4381,9 @@ ifndef SGEMM_SMALL_K_TT SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c endif +$(KDIR)sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -4407,6 +4424,9 @@ $(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL $(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +ifndef CGEMM_SMALL_M_PERMIT +CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c +endif ifndef CGEMM_SMALL_K_NN CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c @@ -4424,6 +4444,9 @@ ifndef CGEMM_SMALL_K_TT CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c endif +$(KDIR)cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ + $(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ @@ -4536,6 +4559,10 @@ $(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL $(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ +ifndef ZGEMM_SMALL_M_PERMIT +ZGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c +endif + ifndef ZGEMM_SMALL_K_NN ZGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c endif @@ -4552,6 +4579,10 @@ ifndef ZGEMM_SMALL_K_TT ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c endif +$(KDIR)zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ + + $(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ diff --git a/kernel/generic/gemm_small_matrix_permit.c b/kernel/generic/gemm_small_matrix_permit.c new file mode 100644 index 000000000..6e1ab1fc1 --- /dev/null +++ b/kernel/generic/gemm_small_matrix_permit.c @@ -0,0 +1,37 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + if (MNK <= 100.0*100.0*100.0) + return 1; + else + return 0; +} diff --git a/kernel/generic/zgemm_small_matrix_permit.c b/kernel/generic/zgemm_small_matrix_permit.c new file mode 100644 index 000000000..288937256 --- /dev/null +++ b/kernel/generic/zgemm_small_matrix_permit.c @@ -0,0 +1,37 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha0, FLOAT alpha1, FLOAT beta0, FLOAT beta1) +{ + double MNK = (double) M * (double) N * (double) K; + if (MNK <= 100.0*100.0*100.0) + return 1; + else + return 0; +} From 02c6e764f2e94779ae5699ca2ea8c2189aa9fa02 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 27 May 2021 11:26:49 +0000 Subject: [PATCH 374/681] Small Matrix: skylakex: add SGEMM_SMALL_M_PERMIT and tune for TN kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 1 + .../sgemm_small_kernel_permit_skylakex.c | 50 +++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 kernel/x86_64/sgemm_small_kernel_permit_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 5e0d9e5b4..264e3a9f4 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -10,6 +10,7 @@ STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +SGEMM_SMALL_M_PERMIT = sgemm_small_kernel_permit_skylakex.c SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_skylakex.c SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_b0_nn_skylakex.c SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_skylakex.c diff --git a/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c b/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c new file mode 100644 index 000000000..159ae10b5 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c @@ -0,0 +1,50 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + if (MNK > 100.0*100.0*100.0) // disable for big size matrix + return 0; + // tuning for A transpose + if (transa) { + if (transb) { + return 0; // TT kernel not support yet + } else { // TN kernel + /* TN kernel perform not good when: + * 1. C matrix is too big + * 2. K is too small + */ + if (M * N > 1200 || K < 32) + return 0; + } + } + + return 1; +} From 72e070539cd13364c8a02ac34e3dfcd65b657c7a Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Mon, 31 May 2021 14:53:03 +0000 Subject: [PATCH 375/681] Small Matrix: skylakex: add sgemm tt kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../sgemm_small_kernel_b0_tt_skylakex.c | 3 + .../sgemm_small_kernel_permit_skylakex.c | 7 +- .../x86_64/sgemm_small_kernel_tt_skylakex.c | 414 ++++++++++++++++++ 4 files changed, 424 insertions(+), 2 deletions(-) create mode 100644 kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c create mode 100644 kernel/x86_64/sgemm_small_kernel_tt_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 264e3a9f4..0f58a4d46 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -17,6 +17,8 @@ SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_skylakex.c SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_b0_nt_skylakex.c SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_skylakex.c SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_b0_tn_skylakex.c +SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_skylakex.c +SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_b0_tt_skylakex.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c diff --git a/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c new file mode 100644 index 000000000..27d9e0afd --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c @@ -0,0 +1,3 @@ +#define B0 1 +#define TT 1 +#include "./sgemm_small_kernel_tt_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c b/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c index 159ae10b5..cbf2374bd 100644 --- a/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c @@ -35,8 +35,11 @@ int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alph // tuning for A transpose if (transa) { if (transb) { - return 0; // TT kernel not support yet - } else { // TN kernel + /* TT kernel perform not good when: + * 1. K is too small. + */ + if (K < 4) return 0; + } else { /* TN kernel perform not good when: * 1. C matrix is too big * 2. K is too small diff --git a/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c new file mode 100644 index 000000000..8da560ef7 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c @@ -0,0 +1,414 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define BROADCAST_LOAD_A_512(M, N) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k + lda * (i+M)])) +#define LOAD_B_512(M,N) __m512 Bval##N = _mm512_loadu_ps(&B[ldb * k + j + (N*16)]) +#define MASK_LOAD_B_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[ldb * k + j + (N*16)]) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) + +#if defined(B0) +#define STORE_8xy(v, N, x, y) _mm256_storeu_ps(&C[(j + N*16 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) _mm_mask_storeu_ps(&C[(j + N*16 + x + y*4)*ldc + i], mask8, v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); +#else +#define STORE_8xy(v, N, x, y) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(v): "r"(&C[(j + N*16 + x + y*8)*ldc + i]), "v"(beta_256)); \ + _mm256_storeu_ps(&C[(j + N*16 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(v): "r"(&C[(j + N*16 + x + y*4)*ldc + i]), "v"(beta_128)); \ + _mm_mask_storeu_ps(&C[(j + N*16 + x + y*4)*ldc + i], mask8, v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_i32gather_ps(vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask, vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); +#endif + +#define REORDER_8x16(r0, r1, r2, r3, r4, r5, r6, r7) \ + __m512 t0, t1, t2, t3, t4, t5, t6, t7, v; \ + t0 = _mm512_unpacklo_ps(r0, r1); \ + t1 = 
_mm512_unpackhi_ps(r0, r1); \ + t2 = _mm512_unpacklo_ps(r2, r3); \ + t3 = _mm512_unpackhi_ps(r2, r3); \ + t4 = _mm512_unpacklo_ps(r4, r5); \ + t5 = _mm512_unpackhi_ps(r4, r5); \ + t6 = _mm512_unpacklo_ps(r6, r7); \ + t7 = _mm512_unpackhi_ps(r6, r7); \ + v = _mm512_shuffle_ps(t0, t2, 0x4E); \ + r0 = _mm512_mask_blend_ps(kc, t0, v); \ + r1 = _mm512_mask_blend_ps(k3, t2, v); \ + v = _mm512_shuffle_ps(t1, t3, 0x4E); \ + r2 = _mm512_mask_blend_ps(kc, t1, v); \ + r3 = _mm512_mask_blend_ps(k3, t3, v); \ + v = _mm512_shuffle_ps(t4, t6, 0x4E); \ + r4 = _mm512_mask_blend_ps(kc, t4, v); \ + r5 = _mm512_mask_blend_ps(k3, t6, v); \ + v = _mm512_shuffle_ps(t5, t7, 0x4E); \ + r6 = _mm512_mask_blend_ps(kc, t5, v); \ + r7 = _mm512_mask_blend_ps(k3, t7, v); \ + t0 = _mm512_permutex2var_ps(r0, idx_lo, r4); \ + t1 = _mm512_permutex2var_ps(r1, idx_lo, r5); \ + t2 = _mm512_permutex2var_ps(r2, idx_lo, r6); \ + t3 = _mm512_permutex2var_ps(r3, idx_lo, r7); \ + t4 = _mm512_permutex2var_ps(r0, idx_hi, r4); \ + t5 = _mm512_permutex2var_ps(r1, idx_hi, r5); \ + t6 = _mm512_permutex2var_ps(r2, idx_hi, r6); \ + t7 = _mm512_permutex2var_ps(r3, idx_hi, r7); \ + t0 = _mm512_mul_ps(t0, alpha_512); \ + t1 = _mm512_mul_ps(t1, alpha_512); \ + t2 = _mm512_mul_ps(t2, alpha_512); \ + t3 = _mm512_mul_ps(t3, alpha_512); \ + t4 = _mm512_mul_ps(t4, alpha_512); \ + t5 = _mm512_mul_ps(t5, alpha_512); \ + t6 = _mm512_mul_ps(t6, alpha_512); \ + t7 = _mm512_mul_ps(t7, alpha_512); + +#define SAVE_8(N, x, y) {\ + __m256 v8 = _mm512_extractf32x8_ps(t##x, y); \ + STORE_8xy(v8, N, x, y); \ +} + +#define REORDER_STORE_8x16(N) {\ + REORDER_8x16(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + SAVE_8(N, 0, 0); SAVE_8(N, 1, 0); SAVE_8(N, 2, 0); SAVE_8(N, 3, 0); SAVE_8(N, 4, 0); SAVE_8(N, 5, 0); SAVE_8(N, 6, 0); SAVE_8(N, 7, 0); \ + SAVE_8(N, 0, 1); SAVE_8(N, 1, 1); SAVE_8(N, 2, 1); SAVE_8(N, 3, 1); SAVE_8(N, 4, 1); SAVE_8(N, 5, 1); SAVE_8(N, 6, 1); SAVE_8(N, 7, 1); \ +} + +#define MASK_SAVE_8() \ + switch (nn) { \ + case 16: SAVE_8(0, 7, 1); \ + case 15: SAVE_8(0, 6, 1); \ + case 14: SAVE_8(0, 5, 1); \ + case 13: SAVE_8(0, 4, 1); \ + case 12: SAVE_8(0, 3, 1); \ + case 11: SAVE_8(0, 2, 1); \ + case 10: SAVE_8(0, 1, 1); \ + case 9: SAVE_8(0, 0, 1); \ + case 8: SAVE_8(0, 7, 0); \ + case 7: SAVE_8(0, 6, 0); \ + case 6: SAVE_8(0, 5, 0); \ + case 5: SAVE_8(0, 4, 0); \ + case 4: SAVE_8(0, 3, 0); \ + case 3: SAVE_8(0, 2, 0); \ + case 2: SAVE_8(0, 1, 0); \ + case 1: SAVE_8(0, 0, 0); \ + } + +#define MASK_REORDER_STORE_8x16(N) {\ + REORDER_8x16(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + MASK_SAVE_8(); \ +} + +#define REORDER_4x16(r0, r1, r2, r3) \ + __m512 t0, t1, t2, t3, v; \ + t0 = _mm512_unpacklo_ps(r0, r1); \ + t1 = _mm512_unpackhi_ps(r0, r1); \ + t2 = _mm512_unpacklo_ps(r2, r3); \ + t3 = _mm512_unpackhi_ps(r2, r3); \ + v = _mm512_shuffle_ps(t0, t2, 0x4E); \ + r0 = _mm512_mask_blend_ps(kc, t0, v); \ + r1 = _mm512_mask_blend_ps(k3, t2, v); \ + v = _mm512_shuffle_ps(t1, t3, 0x4E); \ + r2 = _mm512_mask_blend_ps(kc, t1, v); \ + r3 = _mm512_mask_blend_ps(k3, t3, v); \ + t0 = _mm512_mul_ps(r0, alpha_512); \ + t1 = _mm512_mul_ps(r1, alpha_512); \ + t2 = _mm512_mul_ps(r2, alpha_512); \ + t3 = _mm512_mul_ps(r3, alpha_512); + +#define SAVE_4(N, x, y) {\ + __m128 v4 = _mm512_extractf32x4_ps(t##x, y); \ + STORE_4xy(v4, N, x, y); \ +} + +#define REORDER_STORE_4x16(N) {\ + REORDER_4x16(result0##N, result1##N, result2##N, result3##N); \ + SAVE_4(N, 0, 0); 
SAVE_4(N, 1, 0); SAVE_4(N, 2, 0); SAVE_4(N, 3, 0); \ + SAVE_4(N, 0, 1); SAVE_4(N, 1, 1); SAVE_4(N, 2, 1); SAVE_4(N, 3, 1); \ + SAVE_4(N, 0, 2); SAVE_4(N, 1, 2); SAVE_4(N, 2, 2); SAVE_4(N, 3, 2); \ + SAVE_4(N, 0, 3); SAVE_4(N, 1, 3); SAVE_4(N, 2, 3); SAVE_4(N, 3, 3); \ +} + +#define MASK_SAVE_4() \ + switch (nn) { \ + case 16: SAVE_4(0, 3, 3); \ + case 15: SAVE_4(0, 2, 3); \ + case 14: SAVE_4(0, 1, 3); \ + case 13: SAVE_4(0, 0, 3); \ + case 12: SAVE_4(0, 3, 2); \ + case 11: SAVE_4(0, 2, 2); \ + case 10: SAVE_4(0, 1, 2); \ + case 9: SAVE_4(0, 0, 2); \ + case 8: SAVE_4(0, 3, 1); \ + case 7: SAVE_4(0, 2, 1); \ + case 6: SAVE_4(0, 1, 1); \ + case 5: SAVE_4(0, 0, 1); \ + case 4: SAVE_4(0, 3, 0); \ + case 3: SAVE_4(0, 2, 0); \ + case 2: SAVE_4(0, 1, 0); \ + case 1: SAVE_4(0, 0, 0); \ + } + +#define MASK_REORDER_STORE_4x16(N) {\ + REORDER_4x16(result0##N, result1##N, result2##N, result3##N); \ + MASK_SAVE_4(); \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n64 = N & ~63; + BLASLONG n32 = N & ~31; + + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); + __m128 beta_128 = _mm_broadcastss_ps(_mm_load_ss(&beta)); +#endif + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + }; + __m512i idx_lo = _mm512_loadu_epi32(permute_table); + __m512i idx_hi = _mm512_loadu_epi32(permute_table + 16); + __mmask16 kc = 0xcccc; + __mmask16 k3 = 0x3333; + __mmask8 mask8 = 0xff; // force use AVX128 instead of SSE + + for (i = 0; i < m8; i += 8) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(4, 1); DECLARE_RESULT_512(5, 1); DECLARE_RESULT_512(6, 1); DECLARE_RESULT_512(7, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(4, 1); MATMUL_512(5, 1); MATMUL_512(6, 1); MATMUL_512(7, 1); + } + REORDER_STORE_8x16(0); + REORDER_STORE_8x16(1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + for (k 
= 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + } + MASK_REORDER_STORE_8x16(0); + } + } + for (; i < m4; i += 4) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + REORDER_STORE_4x16(0); + REORDER_STORE_4x16(1); + REORDER_STORE_4x16(2); + REORDER_STORE_4x16(3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + REORDER_STORE_4x16(0); + REORDER_STORE_4x16(1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_REORDER_STORE_4x16(0); + } + } + if (i < M) { + int index_n[16]; + for (int ii = 0; ii < 16; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_epi32(index_n); +#if !defined(B0) + __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); +#endif + for (; i < m2; i += 2) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + 
SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; +} From 91ec21202bd8ae81f15dae79e004b2f00d20e559 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 1 Jun 2021 11:31:50 +0000 Subject: [PATCH 376/681] Small Matrix: skylakex: add dgemm nn kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../dgemm_small_kernel_b0_nn_skylakex.c | 2 + .../x86_64/dgemm_small_kernel_nn_skylakex.c | 590 ++++++++++++++++++ 3 files changed, 594 insertions(+) create mode 100644 kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c create mode 100644 kernel/x86_64/dgemm_small_kernel_nn_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 0f58a4d46..a3c6f0556 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -27,6 +27,8 @@ DGEMMITCOPY = dgemm_tcopy_16_skylakex.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c +DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c SGEMM_BETA = sgemm_beta_skylakex.c DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c new file mode 100644 index 000000000..a58738a25 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include 
"./dgemm_small_kernel_nn_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c new file mode 100644 index 000000000..8ffb899c8 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -0,0 +1,590 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define LOAD_A_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&A[lda * k + i + (M*8)]) +#define MASK_LOAD_A_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &A[lda * k + i + (M*8)]) +#define BROADCAST_LOAD_B_512(M, N) __m512d Bval##N = _mm512_broadcastsd_pd(_mm_load_pd1(&B[k + ldb * (j+N)])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#else +#define STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512)); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#endif + +#define LOAD_KA_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&mbuf[(mi + M)*K + k]); +#define LOAD_KB_512(M, N) __m512d Bval##N = _mm512_loadu_pd(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &mbuf[(mi + M)*K + k]) +#define MASK_LOAD_KB_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[(j + N)*ldb + k]) +#define REDUCE_4(rr0, rr1, rr2, rr3) \ + __m512d r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_pd(rr0, rr1); r1 = _mm512_unpackhi_pd(rr0, rr1); \ + r2 = _mm512_unpacklo_pd(rr2, rr3); r3 = _mm512_unpackhi_pd(rr2, rr3); \ + t0 = _mm512_permutex2var_pd(r0, idx_lo, r2); t1 = _mm512_permutex2var_pd(r1, idx_lo, r3); \ + t2 = _mm512_permutex2var_pd(r0, idx_hi, r2); t3 = _mm512_permutex2var_pd(r1, idx_hi, r3); \ + r0 = _mm512_add_pd(t0, t1); r1 = _mm512_add_pd(t2, t3); t0 = _mm512_add_pd(r0, r1); \ + __m256d s0, s1; \ + s0 = _mm512_extractf64x4_pd(t0, 0); s1 = _mm512_extractf64x4_pd(t0, 1); \ + s0 = _mm256_add_pd(s0, s1); s0 = _mm256_mul_pd(alpha_256, s0); +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N); +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); \ +} +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N) + beta * C[(j+N)*ldc + i + M]; +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_256)); \ + _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + s1 = _mm256_i64gather_pd(&C[j*ldc + i + M], vindex_n, 8); \ + s0 = _mm256_fmadd_pd(s1, beta_256, s0); \ + _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); \ +} +#endif + +#if defined(B0) 
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n6 = N - (N % 6); + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + + __m512d alpha_512 = _mm512_broadcastsd_pd(_mm_load_pd1(&alpha)); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_pd1(&beta)); +#endif + + for (i = 0; i < m32; i += 32) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); 
LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m8; i += 8) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + int mm = M - i; + if (!mm) return 0; + if (mm > 4 || K < 16) { + register __mmask8 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + 
MASK_STORE_512(0, 1);
+        }
+        for (; j < N; j++) {
+            DECLARE_RESULT_512(0, 0);
+            for (k = 0; k < K; k++) {
+                MASK_LOAD_A_512(0, x);
+                BROADCAST_LOAD_B_512(x, 0);
+                MATMUL_512(0, 0);
+            }
+            MASK_STORE_512(0, 0);
+        }
+    } else {
+        /* M => [1, 4]
+         *
+         * This kernel uses a dot-product style to compute each value C(x, y):
+         * C(x, y) = A(x, 0)*B(0, y) + A(x, 1)*B(1, y) + ... + A(x, K)*B(K, y)
+         *
+         * Allocate a buffer and copy the remaining rows of A into it in
+         * row-major order, so memory access from 0 to K is contiguous for
+         * both A and B.
+         *
+         * Each loop iteration loads 8 values of k into a zmm register and
+         * FMAs them; at the end the zmm accumulator is reduce_add-ed to the
+         * single double stored in C(x, y).
+         *
+         * Note: performance is poor when K is small.
+         */
+        FLOAT *mbuf = (FLOAT *) malloc(sizeof(FLOAT)*mm*K);
+        __mmask8 mask = (1UL << mm) - 1;
+        BLASLONG k8 = K & ~7;
+        BLASLONG k4 = K & ~3;
+        for (k = 0; k < k4; k += 4) {
+            __m256d r0, r1, r2, r3;
+            __m256d t0, t1, t2, t3;
+            r0 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(0 + k)]);
+            r1 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(1 + k)]);
+            r2 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(2 + k)]);
+            r3 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(3 + k)]);
+
+            t0 = _mm256_unpacklo_pd(r0, r1);
+            t1 = _mm256_unpackhi_pd(r0, r1);
+            t2 = _mm256_unpacklo_pd(r2, r3);
+            t3 = _mm256_unpackhi_pd(r2, r3);
+
+            r0 = _mm256_permute2f128_pd(t0, t2, 0x20);
+            r1 = _mm256_permute2f128_pd(t1, t3, 0x20);
+            r2 = _mm256_permute2f128_pd(t0, t2, 0x31);
+            r3 = _mm256_permute2f128_pd(t1, t3, 0x31);
+
+            switch (mm) {
+                case 4: _mm256_storeu_pd(&mbuf[k + 3*K], r3);
+                case 3: _mm256_storeu_pd(&mbuf[k + 2*K], r2);
+                case 2: _mm256_storeu_pd(&mbuf[k + 1*K], r1);
+                case 1: _mm256_storeu_pd(&mbuf[k + 0*K], r0);
+            }
+        }
+        for (; k < K; k++) {
+            for (int ii = 0; ii < mm; ii++) {
+                mbuf[k + ii*K] = A[i + lda*k + ii];
+            }
+        }
+        int mi = 0;
+        __m256d alpha_256 = _mm256_broadcast_sd(&alpha);
+#if !defined(B0)
+        __m256d beta_256 = _mm256_broadcast_sd(&beta);
+#endif
+        __m256i vindex_n = _mm256_set_epi64x(ldc*3, ldc*2, ldc*1, 0);
+        long long permute_table[] = {
+            0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8,
+            2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8,
+        };
+        __m512i idx_lo = _mm512_loadu_epi32(permute_table);
+        __m512i idx_hi = _mm512_loadu_epi32(permute_table + 8);
+        for (; i < m4; i += 4, mi += 4) {
+            for (j = 0; j < n4; j += 4) {
+                DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
+                DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1);
+                DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2);
+                DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3);
+                for (k = 0; k < k8; k += 8) {
+                    LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x);
+                    LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3);
+
+                    MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0);
+                    MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1);
+                    MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2);
+                    MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3);
+                }
+                int remains = K - k;
+                if (remains) {
+                    mask = (1UL << remains) - 1;
+                    MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x);
+                    MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3);
+
+                    MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0);
+                    MATMUL_512(0, 1); MATMUL_512(1, 1);
MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2, mi += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); 
STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1, mi += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + free(mbuf); + } + return 0; +} From f57fc932ac39c394e8f89bf7b6df3f1bddd315fd Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 1 Jun 2021 14:23:56 +0000 Subject: [PATCH 377/681] Small Matrix: skylakex: add dgemm nt kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../dgemm_small_kernel_b0_nt_skylakex.c | 2 + .../x86_64/dgemm_small_kernel_nt_skylakex.c | 535 ++++++++++++++++++ 3 files changed, 539 insertions(+) create mode 100644 kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c create mode 100644 kernel/x86_64/dgemm_small_kernel_nt_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index a3c6f0556..db1e6cbff 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -29,6 +29,8 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c +DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_skylakex.c +DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_b0_nt_skylakex.c SGEMM_BETA = sgemm_beta_skylakex.c DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c new file mode 100644 index 000000000..eafe2ce49 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./dgemm_small_kernel_nt_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c new file mode 
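The dot-product remainder path of the nn kernel above ends by collapsing each 8-lane zmm accumulator into a single scalar of C. A minimal standalone sketch of that horizontal reduction, using made-up test data and assuming only AVX-512F (compile with -mavx512f); this is an illustration, not part of the patch:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* Eight partial products, as one accumulator holds after the k loop. */
    double a[8], b[8], ref = 0.0;
    for (int k = 0; k < 8; k++) {
        a[k] = 1.0 + k;        /* stand-in for A(x, k) from mbuf */
        b[k] = 0.5 * k;        /* stand-in for B(k, y) */
        ref += a[k] * b[k];    /* scalar dot product for comparison */
    }
    __m512d acc = _mm512_fmadd_pd(_mm512_loadu_pd(a), _mm512_loadu_pd(b),
                                  _mm512_setzero_pd());
    /* _mm512_reduce_add_pd performs the same collapse as STORE_REDUCE. */
    printf("vector %.3f scalar %.3f\n", _mm512_reduce_add_pd(acc), ref);
    return 0;
}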
100644
index 000000000..0a95a68e2
--- /dev/null
+++ b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c
@@ -0,0 +1,535 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <immintrin.h>
+#include "common.h"
+#include <stdio.h>
+#include <memory.h>
+
+#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd()
+#define LOAD_A_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&A[lda * k + i + (M*8)])
+#define MASK_LOAD_A_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &A[lda * k + i + (M*8)])
+#define BROADCAST_LOAD_B_512(M, N) __m512d Bval##N = _mm512_broadcastsd_pd(_mm_load_sd(&B[ldb * k + j + N]))
+#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N)
+
+#define BROADCAST_LOAD_A_512(M, N) __m512d Aval##M = _mm512_broadcastsd_pd(_mm_load_sd(&A[lda * k + i + M]))
+#define LOAD_B_512(M, N) __m512d Bval##N = _mm512_loadu_pd(&B[ldb * k + j + (N*8)])
+#define MASK_LOAD_B_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[ldb * k + j + (N*8)])
+#if defined(B0)
+#define STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \
+    _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N)
+#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \
+    _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N)
+#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \
+    _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8);
+#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \
+    _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8)
+#else
+#define STORE_512(M, N) \
+    result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \
+    asm("vfmadd231pd (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512)); \
+    _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N)
+#define MASK_STORE_512(M,
N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = _mm512_i64gather_pd(vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = _mm512_mask_i64gather_pd(_mm512_setzero_pd(), mask, vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8); +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n32 = N & ~31; + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n6 = N - (N % 6); + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + + __m512d alpha_512 = _mm512_broadcastsd_pd(_mm_load_sd(&alpha)); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); +#endif + + for (i = 0; i < m32; i += 32) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); DECLARE_RESULT_512(2, 4); DECLARE_RESULT_512(3, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); DECLARE_RESULT_512(2, 5); DECLARE_RESULT_512(3, 5); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + MATMUL_512(0, 4); MATMUL_512(1, 4); MATMUL_512(2, 4); MATMUL_512(3, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); MATMUL_512(2, 5); MATMUL_512(3, 5); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + STORE_512(0, 4); STORE_512(1, 4); STORE_512(2, 4); STORE_512(3, 4); + STORE_512(0, 
5); STORE_512(1, 5); STORE_512(2, 5); STORE_512(3, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + DECLARE_RESULT_512(0, 6); DECLARE_RESULT_512(1, 6); + DECLARE_RESULT_512(0, 7); DECLARE_RESULT_512(1, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + MATMUL_512(0, 6); MATMUL_512(1, 6); + MATMUL_512(0, 7); MATMUL_512(1, 7); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + STORE_512(0, 6); STORE_512(1, 6); + STORE_512(0, 7); STORE_512(1, 7); + } + for (;j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + 
MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m8; i += 8) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + STORE_512(0, 6); + STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + int mm = M - i; + if (mm >= 6) { + register __mmask16 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + MASK_STORE_512(0, 6); + MASK_STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + 
MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + } else if (mm > 0) { + long long index_n[8]; + for (int ii = 0; ii < 8; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_epi64(index_n); + for (; i < m4; i += 4) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); SCATTER_STORE_512(2, 2); SCATTER_STORE_512(3, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); SCATTER_STORE_512(2, 3); SCATTER_STORE_512(3, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + } + __mmask8 mask = 0xff; + for (; j < N; j += 8) { + int remains = N - j; + if (remains < 8) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); 
MASK_SCATTER_STORE_512(2, 0); MASK_SCATTER_STORE_512(3, 0); + } + } + for (; i < m2; i += 2) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask8 mask = 0xff; + for (; j < N; j += 8) { + int remains = N - j; + if (remains < 8) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask8 mask = 0xff; + for (; j < N; j += 8) { + int remains = N - j; + if (remains < 8) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; +} From 323d7da4f7c21b0a285af1527a47799c4adf69f4 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 2 Jun 2021 11:45:44 +0000 Subject: [PATCH 378/681] Small Matrix: skylakex: add dgemm tt kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../dgemm_small_kernel_b0_tt_skylakex.c | 2 + .../x86_64/dgemm_small_kernel_tt_skylakex.c | 392 ++++++++++++++++++ 3 files changed, 396 insertions(+) create mode 100644 kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c create mode 100644 kernel/x86_64/dgemm_small_kernel_tt_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index db1e6cbff..3e84e794e 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -31,6 +31,8 @@ 
DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_skylakex.c DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_b0_nt_skylakex.c +DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_skylakex.c +DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_b0_tt_skylakex.c SGEMM_BETA = sgemm_beta_skylakex.c DGEMM_BETA = dgemm_beta_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c new file mode 100644 index 000000000..93fab1836 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./dgemm_small_kernel_tt_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c new file mode 100644 index 000000000..8ff79d2c8 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c @@ -0,0 +1,392 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include <immintrin.h>
+#include "common.h"
+#include <stdio.h>
+
+#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd()
+#define BROADCAST_LOAD_A_512(M, N) __m512d Aval##M = _mm512_broadcastsd_pd(_mm_load_sd(&A[k + lda * (i+M)]))
+#define LOAD_B_512(M,N) __m512d Bval##N = _mm512_loadu_pd(&B[ldb * k + j + (N*8)])
+#define MASK_LOAD_B_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[ldb * k + j + (N*8)])
+#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N)
+
+#if defined(B0)
+#define STORE_8xy(v, N, x, y) _mm512_storeu_pd(&C[(j + N*8 + x + y*8)*ldc + i], v)
+#define STORE_4xy(v, N, x, y) _mm256_storeu_pd(&C[(j + N*8 + x + y*4)*ldc + i], v)
+#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \
+    _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8);
+#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \
+    _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8);
+#else
+#define STORE_8xy(v, N, x, y) \
+    asm("vfmadd231pd (%1), %2, %0": "+v"(v): "r"(&C[(j + N*8 + x + y*8)*ldc + i]), "v"(beta_512)); \
+    _mm512_storeu_pd(&C[(j + N*8 + x + y*8)*ldc + i], v)
+#define STORE_4xy(v, N, x, y) \
+    asm("vfmadd231pd (%1), %2, %0": "+v"(v): "r"(&C[(j + N*8 + x + y*4)*ldc + i]), "v"(beta_256)); \
+    _mm256_storeu_pd(&C[(j + N*8 + x + y*4)*ldc + i], v)
+#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \
+    __m512d tmp##M##N = _mm512_i64gather_pd(vindex_n, &C[(j + N*8)*ldc + i + M], 8); \
+    result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \
+    _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8);
+#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \
+    __m512d tmp##M##N = _mm512_mask_i64gather_pd(_mm512_setzero_pd(), mask, vindex_n, &C[(j + N*8)*ldc + i + M], 8); \
+    result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \
+    _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8);
+#endif
+
+#define REORDER_8x8(r0, r1, r2, r3, r4, r5, r6, r7) \
+    __m512d t0, t1, t2, t3, t4, t5, t6, t7; \
+    t0 = _mm512_unpacklo_pd(r0, r1); \
+    t1 = _mm512_unpackhi_pd(r0, r1); \
+    t2 = _mm512_unpacklo_pd(r2, r3); \
+    t3 = _mm512_unpackhi_pd(r2, r3); \
+    t4 = _mm512_unpacklo_pd(r4, r5); \
+    t5 = _mm512_unpackhi_pd(r4, r5); \
+    t6 = _mm512_unpacklo_pd(r6, r7); \
+    t7 = _mm512_unpackhi_pd(r6, r7); \
+    r0 = _mm512_shuffle_f64x2(t0, t2, 0x88); \
+    r1 = _mm512_shuffle_f64x2(t1, t3, 0x88); \
+    r2 = _mm512_shuffle_f64x2(t0, t2, 0xdd); \
+    r3 = _mm512_shuffle_f64x2(t1, t3, 0xdd); \
+    r4 = _mm512_shuffle_f64x2(t4, t6, 0x88); \
+    r5 = _mm512_shuffle_f64x2(t5, t7, 0x88); \
+    r6 = _mm512_shuffle_f64x2(t4, t6, 0xdd); \
+    r7 = _mm512_shuffle_f64x2(t5, t7, 0xdd); \
+    t0 = _mm512_permutex2var_pd(r0, idx_lo, r4); \
+    t1 = _mm512_permutex2var_pd(r1, idx_lo, r5); \
+    t2 = _mm512_permutex2var_pd(r2, idx_lo, r6); \
+    t3 = _mm512_permutex2var_pd(r3, idx_lo, r7); \
+    t4 = _mm512_permutex2var_pd(r0, idx_hi, r4); \
+    t5 = _mm512_permutex2var_pd(r1, idx_hi, r5); \
+    t6 = _mm512_permutex2var_pd(r2, idx_hi, r6); \
+    t7 = _mm512_permutex2var_pd(r3, idx_hi, r7); \
+    t0 = _mm512_mul_pd(t0, alpha_512); \
+    t1 = _mm512_mul_pd(t1, alpha_512); \
+    t2 = _mm512_mul_pd(t2, alpha_512); \
+    t3 = _mm512_mul_pd(t3, alpha_512); \
+    t4 = _mm512_mul_pd(t4, alpha_512); \
+ t5 = _mm512_mul_pd(t5, alpha_512); \ + t6 = _mm512_mul_pd(t6, alpha_512); \ + t7 = _mm512_mul_pd(t7, alpha_512); + +#define SAVE_8(N, x) {\ + STORE_8xy(t##x, N, x, 0); \ +} + +#define REORDER_STORE_8x8(N) {\ + REORDER_8x8(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + SAVE_8(N, 0); SAVE_8(N, 1); SAVE_8(N, 2); SAVE_8(N, 3); SAVE_8(N, 4); SAVE_8(N, 5); SAVE_8(N, 6); SAVE_8(N, 7); \ +} + +#define MASK_SAVE_8() \ + switch (nn) { \ + case 8: SAVE_8(0, 7); \ + case 7: SAVE_8(0, 6); \ + case 6: SAVE_8(0, 5); \ + case 5: SAVE_8(0, 4); \ + case 4: SAVE_8(0, 3); \ + case 3: SAVE_8(0, 2); \ + case 2: SAVE_8(0, 1); \ + case 1: SAVE_8(0, 0); \ + } + +#define MASK_REORDER_STORE_8x8(N) {\ + REORDER_8x8(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + MASK_SAVE_8(); \ +} + +#define REORDER_4x8(r0, r1, r2, r3) \ + __m512d t0, t1, t2, t3; \ + t0 = _mm512_unpacklo_pd(r0, r1); \ + t1 = _mm512_unpackhi_pd(r0, r1); \ + t2 = _mm512_unpacklo_pd(r2, r3); \ + t3 = _mm512_unpackhi_pd(r2, r3); \ + r0 = _mm512_permutex2var_pd(t0, idx_lo, t2); \ + r1 = _mm512_permutex2var_pd(t1, idx_lo, t3); \ + r2 = _mm512_permutex2var_pd(t0, idx_hi, t2); \ + r3 = _mm512_permutex2var_pd(t1, idx_hi, t3); \ + t0 = _mm512_mul_pd(r0, alpha_512); \ + t1 = _mm512_mul_pd(r1, alpha_512); \ + t2 = _mm512_mul_pd(r2, alpha_512); \ + t3 = _mm512_mul_pd(r3, alpha_512); + +#define SAVE_4(N, x, y) {\ + __m256d v4 = _mm512_extractf64x4_pd(t##x, y); \ + STORE_4xy(v4, N, x, y); \ +} + +#define REORDER_STORE_4x8(N) {\ + REORDER_4x8(result0##N, result1##N, result2##N, result3##N); \ + SAVE_4(N, 0, 0); SAVE_4(N, 1, 0); SAVE_4(N, 2, 0); SAVE_4(N, 3, 0); \ + SAVE_4(N, 0, 1); SAVE_4(N, 1, 1); SAVE_4(N, 2, 1); SAVE_4(N, 3, 1); \ +} + +#define MASK_SAVE_4() \ + switch (nn) { \ + case 8: SAVE_4(0, 3, 1); \ + case 7: SAVE_4(0, 2, 1); \ + case 6: SAVE_4(0, 1, 1); \ + case 5: SAVE_4(0, 0, 1); \ + case 4: SAVE_4(0, 3, 0); \ + case 3: SAVE_4(0, 2, 0); \ + case 2: SAVE_4(0, 1, 0); \ + case 1: SAVE_4(0, 0, 0); \ + } + +#define MASK_REORDER_STORE_4x8(N) {\ + REORDER_4x8(result0##N, result1##N, result2##N, result3##N); \ + MASK_SAVE_4(); \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n32 = N & ~31; + BLASLONG n16 = N & ~15; + + __m512d alpha_512 = _mm512_broadcastsd_pd(_mm_load_sd(&alpha)); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); + __m256d beta_256 = _mm256_broadcastsd_pd(_mm_load_sd(&beta)); +#endif + long long permute_table[] = { + 0, 1, 4, 5, 0|8, 1|8, 4|8, 5|8, + 2, 3, 6, 7, 2|8, 3|8, 6|8, 7|8, + }; + __m512i idx_lo = _mm512_loadu_epi64(permute_table); + __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + + for (i = 0; i < m8; i += 8) { + for (j = 0; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(4, 1); 
DECLARE_RESULT_512(5, 1); DECLARE_RESULT_512(6, 1); DECLARE_RESULT_512(7, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(4, 1); MATMUL_512(5, 1); MATMUL_512(6, 1); MATMUL_512(7, 1); + } + REORDER_STORE_8x8(0); + REORDER_STORE_8x8(1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + } + MASK_REORDER_STORE_8x8(0); + } + } + for (; i < m4; i += 4) { + long long permute_table2[] = { + 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, + 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, + }; + idx_lo = _mm512_loadu_epi64(permute_table2); + idx_hi = _mm512_loadu_epi64(permute_table2 + 8); + + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + REORDER_STORE_4x8(0); + REORDER_STORE_4x8(1); + REORDER_STORE_4x8(2); + REORDER_STORE_4x8(3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + REORDER_STORE_4x8(0); + REORDER_STORE_4x8(1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = 
N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_REORDER_STORE_4x8(0); + } + } + if (i < M) { + long long index_n[8]; + for (int ii = 0; ii < 8; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_epi64(index_n); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); +#endif + for (; i < m2; i += 2) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; +} From 3e79f6d89abe60b75a4a504670a676472b2d0918 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 2 Jun 2021 13:56:40 
+0000 Subject: [PATCH 379/681] Small Matrix: skylakex: add dgemm tn kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 2 + .../dgemm_small_kernel_b0_tn_skylakex.c | 2 + .../x86_64/dgemm_small_kernel_tn_skylakex.c | 322 ++++++++++++++++++ 3 files changed, 326 insertions(+) create mode 100644 kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c create mode 100644 kernel/x86_64/dgemm_small_kernel_tn_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 3e84e794e..c1d8f8e89 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -31,6 +31,8 @@ DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_skylakex.c DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_b0_nt_skylakex.c +DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_skylakex.c +DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_b0_tn_skylakex.c DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_skylakex.c DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_b0_tt_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c new file mode 100644 index 000000000..1dfa0aaf1 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./dgemm_small_kernel_tn_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c new file mode 100644 index 000000000..0881f35b2 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c @@ -0,0 +1,322 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include <immintrin.h>
+#include "common.h"
+#include <stdio.h>
+#include <memory.h>
+
+#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd()
+#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N)
+
+#define LOAD_KA_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&A[(i + M)*lda + k]);
+#define LOAD_KB_512(M, N) __m512d Bval##N = _mm512_loadu_pd(&B[(j + N)*ldb + k])
+#define MASK_LOAD_KA_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &A[(i + M)*lda + k])
+#define MASK_LOAD_KB_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[(j + N)*ldb + k])
+
+#define REDUCE_4(rr0, rr1, rr2, rr3) \
+    __m512d r0, r1, r2, r3, t0, t1, t2, t3;\
+    r0 = _mm512_unpacklo_pd(rr0, rr1); r1 = _mm512_unpackhi_pd(rr0, rr1); \
+    r2 = _mm512_unpacklo_pd(rr2, rr3); r3 = _mm512_unpackhi_pd(rr2, rr3); \
+    t0 = _mm512_permutex2var_pd(r0, idx_lo, r2); t1 = _mm512_permutex2var_pd(r1, idx_lo, r3); \
+    t2 = _mm512_permutex2var_pd(r0, idx_hi, r2); t3 = _mm512_permutex2var_pd(r1, idx_hi, r3); \
+    r0 = _mm512_add_pd(t0, t1); r1 = _mm512_add_pd(t2, t3); t0 = _mm512_add_pd(r0, r1); \
+    __m256d s0, s1; \
+    s0 = _mm512_extractf64x4_pd(t0, 0); s1 = _mm512_extractf64x4_pd(t0, 1); \
+    s0 = _mm256_add_pd(s0, s1); s0 = _mm256_mul_pd(alpha_256, s0);
+
+#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N)
+#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3)
+
+#if defined(B0)
+#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N)
+#define STORE_M4(N, s0) _mm256_storeu_pd(&C[(j + N)*ldc + i], s0);
+#define STORE_N4(M, s0) _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8);
+#else
+#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N) + beta * C[(j+N)*ldc + i + M]
+#define STORE_M4(N, s0) \
+    asm("vfmadd231pd (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_256)); \
+    _mm256_storeu_pd(&C[(j + N)*ldc + i], s0);
+
+#define STORE_N4(M, s0) \
+    s0 = _mm256_fmadd_pd(_mm256_i64gather_pd(&C[j*ldc + i + M], vindex_n, 8), beta_256, s0); \
+    _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8);
+#endif
+#define STORE_REDUCE_M4(N) {\
+    REDUCE_M4(N) \
+    STORE_M4(N, s0) \
+}
+#define STORE_REDUCE_N4(M) {\
+    REDUCE_N4(M) \
+    STORE_N4(M, s0) \
+}
+
+
+#if defined(B0)
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
+#else
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
+#endif
+{
+    // column major
+    BLASLONG i, j, k;
+
+    BLASLONG m4 = M & ~3;
+    BLASLONG m2 = M & ~1;
+
+    BLASLONG n4 = N & ~3;
+    BLASLONG n2 = N & ~1;
+
+    BLASLONG k8 = K & ~7;
+
+    __mmask8 mask;
+
+    __m256i vindex_n = _mm256_set_epi64x(ldc*3, ldc*2, ldc, 0);
+    __m256d alpha_256 = _mm256_broadcast_sd(&alpha);
+#if !defined(B0)
+    __m256d beta_256 = _mm256_broadcast_sd(&beta);
+#endif
+
+    long long permute_table[] = {
+        0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8,
+        2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8,
+    };
+    __m512i idx_lo = _mm512_loadu_epi64(permute_table);
+    __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8);
+
+    for (i = 0; i < m4; i += 4) {
+        for (j = 0; j < n4; j += 4) {
+            DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0);
+            DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1);
DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 
3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + return 0; +} From 8592c21af4d6328068b87f402a6801b30e2aebec Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 2 Jun 2021 13:57:39 +0000 Subject: [PATCH 380/681] Small Matrix: skylakex: dgemm nn: fix typo in idx load --- kernel/x86_64/dgemm_small_kernel_nn_skylakex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c index 8ffb899c8..ff2a04beb 100644 --- 
a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -372,8 +372,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, }; - __m512i idx_lo = _mm512_loadu_epi32(permute_table); - __m512i idx_hi = _mm512_loadu_epi32(permute_table + 8); + __m512i idx_lo = _mm512_loadu_epi64(permute_table); + __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); for (; i < m4; i += 4, mi += 4) { for (j = 0; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); From fa777f5517d4b43acfda8b8a58649af94c1e40b4 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 2 Jun 2021 14:55:54 +0000 Subject: [PATCH 381/681] Small Matrix: skylakex: add DGEMM_SMALL_M_PERMIT and tune for TN kernel --- kernel/x86_64/KERNEL.SKYLAKEX | 1 + .../dgemm_small_kernel_permit_skylakex.c | 44 +++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 kernel/x86_64/dgemm_small_kernel_permit_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index c1d8f8e89..eb0cbaf98 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -27,6 +27,7 @@ DGEMMITCOPY = dgemm_tcopy_16_skylakex.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DGEMM_SMALL_M_PERMIT = dgemm_small_kernel_permit_skylakex.c DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_skylakex.c diff --git a/kernel/x86_64/dgemm_small_kernel_permit_skylakex.c b/kernel/x86_64/dgemm_small_kernel_permit_skylakex.c new file mode 100644 index 000000000..9cca08e71 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_permit_skylakex.c @@ -0,0 +1,44 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + if (MNK > 100.0*100.0*100.0) // disable for big size matrix + return 0; + if (transa && !transb) { + /* TN kernel perform not good when: + * 1. C matrix is too big + * 2. K is too small + */ + if (M * N > 1200 || K < 32) + return 0; + } + return 1; +} From 210a1584c5299d8e53129b4e2a8b73b67046cc77 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Aug 2021 14:19:16 +0200 Subject: [PATCH 382/681] Rebase source and edit TLS version of the message as well --- driver/others/memory.c | 46 +++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index f0521ab2d..500ec22c5 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -222,11 +222,11 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; + +#if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; int ret; - -#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) @@ -428,7 +428,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) int max_num; #endif int blas_goto_num = 0; @@ -436,7 +436,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) max_num = get_num_procs(); #endif @@ -460,7 +460,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -1241,7 +1241,7 @@ UNLOCK_COMMAND(&alloc_lock); func = &memoryalloc[0]; - while ((func != NULL) && (map_address == (void *) -1)) { + while ((*func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); @@ -1619,10 +1619,12 @@ static int on_process_term(void) #else #pragma data_seg(".CRT$XLB") #endif -static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; + #ifdef _WIN64 +static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma const_seg() #else 
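/* [Editor's note -- annotation, not part of the original patch. .CRT$XLB is
 * the linker section the Windows CRT walks for PIMAGE_TLS_CALLBACK entries on
 * thread attach/detach. On Win64 these CRT sections are read-only, so the
 * callback pointer has to be a const object emitted under #pragma const_seg,
 * which is why this patch splits the registration into the _WIN64 form above
 * and the writable 32-bit data_seg form below.]
 */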
+static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma data_seg() #endif @@ -1631,10 +1633,12 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI #else #pragma data_seg(".CRT$XTU") #endif -static int(*p_process_term)(void) = on_process_term; + #ifdef _WIN64 +static const int(*p_process_term)(void) = on_process_term; #pragma const_seg() #else +static int(*p_process_term)(void) = on_process_term; #pragma data_seg() #endif #endif @@ -1668,16 +1672,23 @@ void gotoblas_dummy_for_PGI(void) { #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 #endif -#else +#elif !defined(OS_EMBEDDED) #define ALLOC_MMAP #define ALLOC_MALLOC +#else +#define ALLOC_MALLOC + +inline int puts(const char *str) { return 0; } +inline int printf(const char *format, ...) { return 0; } +inline char *getenv(const char *name) { return ""; } +inline int atoi(const char *str) { return 0; } #endif #include #include #include -#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) +#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED) #include #ifndef NO_SYSV_IPC #include @@ -1691,7 +1702,6 @@ void gotoblas_dummy_for_PGI(void) { #include #include #include -#include #include #include #include @@ -1969,7 +1979,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) int max_num; #endif int blas_goto_num = 0; @@ -1977,7 +1987,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) max_num = get_num_procs(); #endif @@ -2001,7 +2011,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -2868,8 +2878,12 @@ void *blas_memory_alloc(int procpos){ return (void *)memory[position].addr; error: - printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); - + printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); + printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); + printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); + printf("a sufficiently small number. 
This error typically occurs when the software that relies on\n"); + printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n"); + printf("cpu cores than what OpenBLAS was configured to handle.\n"); return NULL; } From 898212efcda215ccab3b46b4a645c8eda2ca7948 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Aug 2021 14:50:14 +0200 Subject: [PATCH 383/681] Actually add the message to the TLS section --- driver/others/memory.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 500ec22c5..460a3d557 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1291,7 +1291,12 @@ UNLOCK_COMMAND(&alloc_lock); return (void *)(((char *)alloc_info) + sizeof(struct alloc_t)); error: - printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); + printf("OpenBLAS : Program will terminate because you tried to allocate too many TLS memory regions.\n"); + printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); + printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); + printf("a sufficiently small number. This error typically occurs when the software that relies on\n"); + printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n"); + printf("cpu cores than what OpenBLAS was configured to handle.\n"); return NULL; } From 6b58bca18b427a0c149d25542a5eb7c5ada6a19f Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 15 Jun 2021 16:09:51 +0000 Subject: [PATCH 384/681] Small Matrix: disable low performance default kernel --- kernel/generic/gemm_small_matrix_permit.c | 3 +++ kernel/generic/zgemm_small_matrix_permit.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/kernel/generic/gemm_small_matrix_permit.c b/kernel/generic/gemm_small_matrix_permit.c index 6e1ab1fc1..1ae6d2520 100644 --- a/kernel/generic/gemm_small_matrix_permit.c +++ b/kernel/generic/gemm_small_matrix_permit.c @@ -29,9 +29,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) { + return 0; +/* double MNK = (double) M * (double) N * (double) K; if (MNK <= 100.0*100.0*100.0) return 1; else return 0; +*/ } diff --git a/kernel/generic/zgemm_small_matrix_permit.c b/kernel/generic/zgemm_small_matrix_permit.c index 288937256..940ff5dc8 100644 --- a/kernel/generic/zgemm_small_matrix_permit.c +++ b/kernel/generic/zgemm_small_matrix_permit.c @@ -29,9 +29,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha0, FLOAT alpha1, FLOAT beta0, FLOAT beta1) { + return 0; +/* double MNK = (double) M * (double) N * (double) K; if (MNK <= 100.0*100.0*100.0) return 1; else return 0; +*/ } From 93c8bafff56052534554e3a47e56552c97217228 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Aug 2021 10:45:45 +0200 Subject: [PATCH 385/681] Update Travis badge in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d7e0d60a7..88a5a5035 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Join the chat at https://gitter.im/xianyi/OpenBLAS](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) +Travis CI: [![Build Status](https://travis-ci.com/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.com/xianyi/OpenBLAS) AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) From 478d1086c11f28903395bd13050dbca62aec81ef Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 4 Aug 2021 03:12:41 +0000 Subject: [PATCH 386/681] Small Matrix: support DYNAMIC_ARCH build --- common_c.h | 83 +++++++++++++++-------------- common_d.h | 23 ++++---- common_param.h | 119 ++++++++++++++++++++++++++++++++++++++++++ common_s.h | 23 ++++---- common_z.h | 83 +++++++++++++++-------------- interface/gemm.c | 50 ++++++++++-------- kernel/setparam-ref.c | 37 +++++++++++++ 7 files changed, 295 insertions(+), 123 deletions(-) diff --git a/common_c.h b/common_c.h index dc273eef0..6cff610bb 100644 --- a/common_c.h +++ b/common_c.h @@ -234,46 +234,6 @@ #define CGEMM_SMALL_MATRIX_PERMIT cgemm_small_matrix_permit -#define CGEMM_SMALL_KERNEL_NN cgemm_small_kernel_nn -#define CGEMM_SMALL_KERNEL_NT cgemm_small_kernel_nt -#define CGEMM_SMALL_KERNEL_NR cgemm_small_kernel_nr -#define CGEMM_SMALL_KERNEL_NC cgemm_small_kernel_nc - -#define CGEMM_SMALL_KERNEL_TN cgemm_small_kernel_tn -#define CGEMM_SMALL_KERNEL_TT cgemm_small_kernel_tt -#define CGEMM_SMALL_KERNEL_TR cgemm_small_kernel_tr -#define CGEMM_SMALL_KERNEL_TC cgemm_small_kernel_tc - -#define CGEMM_SMALL_KERNEL_RN cgemm_small_kernel_rn -#define CGEMM_SMALL_KERNEL_RT cgemm_small_kernel_rt -#define CGEMM_SMALL_KERNEL_RR cgemm_small_kernel_rr -#define CGEMM_SMALL_KERNEL_RC cgemm_small_kernel_rc - -#define CGEMM_SMALL_KERNEL_CN cgemm_small_kernel_cn -#define CGEMM_SMALL_KERNEL_CT cgemm_small_kernel_ct -#define CGEMM_SMALL_KERNEL_CR cgemm_small_kernel_cr -#define CGEMM_SMALL_KERNEL_CC cgemm_small_kernel_cc - -#define CGEMM_SMALL_KERNEL_B0_NN cgemm_small_kernel_b0_nn -#define CGEMM_SMALL_KERNEL_B0_NT cgemm_small_kernel_b0_nt -#define CGEMM_SMALL_KERNEL_B0_NR cgemm_small_kernel_b0_nr -#define CGEMM_SMALL_KERNEL_B0_NC cgemm_small_kernel_b0_nc - -#define CGEMM_SMALL_KERNEL_B0_TN cgemm_small_kernel_b0_tn -#define CGEMM_SMALL_KERNEL_B0_TT cgemm_small_kernel_b0_tt -#define CGEMM_SMALL_KERNEL_B0_TR cgemm_small_kernel_b0_tr -#define CGEMM_SMALL_KERNEL_B0_TC cgemm_small_kernel_b0_tc - -#define CGEMM_SMALL_KERNEL_B0_RN cgemm_small_kernel_b0_rn -#define CGEMM_SMALL_KERNEL_B0_RT cgemm_small_kernel_b0_rt -#define CGEMM_SMALL_KERNEL_B0_RR cgemm_small_kernel_b0_rr -#define CGEMM_SMALL_KERNEL_B0_RC cgemm_small_kernel_b0_rc - 
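/* [Editor's note -- annotation, not part of the original patch. The block of
 * direct-name defines removed below is superseded by the FUNC_OFFSET(...)
 * wrappers added after the #endif: without DYNAMIC_ARCH they still evaluate
 * to the kernel's address, but with DYNAMIC_ARCH they yield the member's byte
 * offset inside gotoblas_t, which SMALL_KERNEL_ADDR in interface/gemm.c
 * resolves against whichever core-specific table was selected at runtime.]
 */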
-#define CGEMM_SMALL_KERNEL_B0_CN cgemm_small_kernel_b0_cn -#define CGEMM_SMALL_KERNEL_B0_CT cgemm_small_kernel_b0_ct -#define CGEMM_SMALL_KERNEL_B0_CR cgemm_small_kernel_b0_cr -#define CGEMM_SMALL_KERNEL_B0_CC cgemm_small_kernel_b0_cc - #else #define CAMAX_K gotoblas -> camax_k @@ -468,8 +428,51 @@ #define CGEADD_K gotoblas -> cgeadd_k +#define CGEMM_SMALL_MATRIX_PERMIT gotoblas -> cgemm_small_matrix_permit + #endif +#define CGEMM_SMALL_KERNEL_NN FUNC_OFFSET(cgemm_small_kernel_nn) +#define CGEMM_SMALL_KERNEL_NT FUNC_OFFSET(cgemm_small_kernel_nt) +#define CGEMM_SMALL_KERNEL_NR FUNC_OFFSET(cgemm_small_kernel_nr) +#define CGEMM_SMALL_KERNEL_NC FUNC_OFFSET(cgemm_small_kernel_nc) + +#define CGEMM_SMALL_KERNEL_TN FUNC_OFFSET(cgemm_small_kernel_tn) +#define CGEMM_SMALL_KERNEL_TT FUNC_OFFSET(cgemm_small_kernel_tt) +#define CGEMM_SMALL_KERNEL_TR FUNC_OFFSET(cgemm_small_kernel_tr) +#define CGEMM_SMALL_KERNEL_TC FUNC_OFFSET(cgemm_small_kernel_tc) + +#define CGEMM_SMALL_KERNEL_RN FUNC_OFFSET(cgemm_small_kernel_rn) +#define CGEMM_SMALL_KERNEL_RT FUNC_OFFSET(cgemm_small_kernel_rt) +#define CGEMM_SMALL_KERNEL_RR FUNC_OFFSET(cgemm_small_kernel_rr) +#define CGEMM_SMALL_KERNEL_RC FUNC_OFFSET(cgemm_small_kernel_rc) + +#define CGEMM_SMALL_KERNEL_CN FUNC_OFFSET(cgemm_small_kernel_cn) +#define CGEMM_SMALL_KERNEL_CT FUNC_OFFSET(cgemm_small_kernel_ct) +#define CGEMM_SMALL_KERNEL_CR FUNC_OFFSET(cgemm_small_kernel_cr) +#define CGEMM_SMALL_KERNEL_CC FUNC_OFFSET(cgemm_small_kernel_cc) + +#define CGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(cgemm_small_kernel_b0_nn) +#define CGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(cgemm_small_kernel_b0_nt) +#define CGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(cgemm_small_kernel_b0_nr) +#define CGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(cgemm_small_kernel_b0_nc) + +#define CGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(cgemm_small_kernel_b0_tn) +#define CGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(cgemm_small_kernel_b0_tt) +#define CGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(cgemm_small_kernel_b0_tr) +#define CGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(cgemm_small_kernel_b0_tc) + +#define CGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(cgemm_small_kernel_b0_rn) +#define CGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(cgemm_small_kernel_b0_rt) +#define CGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(cgemm_small_kernel_b0_rr) +#define CGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(cgemm_small_kernel_b0_rc) + +#define CGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(cgemm_small_kernel_b0_cn) +#define CGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(cgemm_small_kernel_b0_ct) +#define CGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(cgemm_small_kernel_b0_cr) +#define CGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(cgemm_small_kernel_b0_cc) + + #define CGEMM_NN cgemm_nn #define CGEMM_CN cgemm_cn #define CGEMM_TN cgemm_tn diff --git a/common_d.h b/common_d.h index bb85f1232..6f4bb2ded 100644 --- a/common_d.h +++ b/common_d.h @@ -159,16 +159,6 @@ #define DGEMM_SMALL_MATRIX_PERMIT dgemm_small_matrix_permit -#define DGEMM_SMALL_KERNEL_NN dgemm_small_kernel_nn -#define DGEMM_SMALL_KERNEL_NT dgemm_small_kernel_nt -#define DGEMM_SMALL_KERNEL_TN dgemm_small_kernel_tn -#define DGEMM_SMALL_KERNEL_TT dgemm_small_kernel_tt - -#define DGEMM_SMALL_KERNEL_B0_NN dgemm_small_kernel_b0_nn -#define DGEMM_SMALL_KERNEL_B0_NT dgemm_small_kernel_b0_nt -#define DGEMM_SMALL_KERNEL_B0_TN dgemm_small_kernel_b0_tn -#define DGEMM_SMALL_KERNEL_B0_TT dgemm_small_kernel_b0_tt - #else #define DAMAX_K gotoblas -> damax_k @@ -293,8 +283,21 @@ #define DGEADD_K gotoblas -> dgeadd_k +#define DGEMM_SMALL_MATRIX_PERMIT gotoblas -> dgemm_small_matrix_permit + #endif +#define 
DGEMM_SMALL_KERNEL_NN FUNC_OFFSET(dgemm_small_kernel_nn) +#define DGEMM_SMALL_KERNEL_NT FUNC_OFFSET(dgemm_small_kernel_nt) +#define DGEMM_SMALL_KERNEL_TN FUNC_OFFSET(dgemm_small_kernel_tn) +#define DGEMM_SMALL_KERNEL_TT FUNC_OFFSET(dgemm_small_kernel_tt) + +#define DGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(dgemm_small_kernel_b0_nn) +#define DGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(dgemm_small_kernel_b0_nt) +#define DGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(dgemm_small_kernel_b0_tn) +#define DGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(dgemm_small_kernel_b0_tt) + + #define DGEMM_NN dgemm_nn #define DGEMM_CN dgemm_tn #define DGEMM_TN dgemm_tn diff --git a/common_param.h b/common_param.h index 3e3ae06f8..7e8bea4fe 100644 --- a/common_param.h +++ b/common_param.h @@ -207,6 +207,20 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); #endif #ifdef BUILD_SINGLE +#ifdef SMALL_MATRIX_OPT + int (*sgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + + int (*sgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + + int (*sgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +#endif + int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -314,6 +328,19 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); #endif #ifdef BUILD_DOUBLE +#ifdef SMALL_MATRIX_OPT + int (*dgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta); + + int (*dgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG 
ldc); + int (*dgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + + int (*dgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +#endif int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -513,6 +540,50 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); +#ifdef SMALL_MATRIX_OPT + int (*cgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1); + + int (*cgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); 
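/* [Editor's note -- annotation, not part of the original patch. gotoblas_t is
 * initialized positionally by the TABLE_NAME initializer in
 * kernel/setparam-ref.c, so the order of these new members and the
 * surrounding SMALL_MATRIX_OPT guards must match that file exactly; any
 * mismatch silently shifts every later slot in the dispatch table.]
 */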
+ int (*cgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, 
BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +#endif + int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -679,6 +750,50 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); +#ifdef SMALL_MATRIX_OPT + int (*zgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1); + + int (*zgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, 
double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG 
lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +#endif + int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -1069,6 +1184,8 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); extern gotoblas_t *gotoblas; +#define FUNC_OFFSET(func) (size_t)(&((gotoblas_t *)NULL)->func) + #define DTB_ENTRIES gotoblas -> dtb_entries #define GEMM_OFFSET_A gotoblas -> offsetA #define GEMM_OFFSET_B gotoblas -> offsetB @@ -1174,6 +1291,8 @@ extern gotoblas_t *gotoblas; #else +#define FUNC_OFFSET(func) (size_t)(func) + #define DTB_ENTRIES DTB_DEFAULT_ENTRIES #define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A diff --git a/common_s.h b/common_s.h index 5851014cf..fdd80b62f 100644 --- a/common_s.h +++ b/common_s.h @@ -166,16 +166,6 @@ #define SGEMM_SMALL_MATRIX_PERMIT sgemm_small_matrix_permit -#define SGEMM_SMALL_KERNEL_NN sgemm_small_kernel_nn -#define SGEMM_SMALL_KERNEL_NT sgemm_small_kernel_nt -#define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn -#define SGEMM_SMALL_KERNEL_TT sgemm_small_kernel_tt - -#define SGEMM_SMALL_KERNEL_B0_NN sgemm_small_kernel_b0_nn -#define SGEMM_SMALL_KERNEL_B0_NT sgemm_small_kernel_b0_nt -#define SGEMM_SMALL_KERNEL_B0_TN sgemm_small_kernel_b0_tn -#define SGEMM_SMALL_KERNEL_B0_TT sgemm_small_kernel_b0_tt - #else #define SAMAX_K gotoblas -> samax_k @@ -311,8 +301,21 @@ #define SGEADD_K gotoblas -> sgeadd_k +#define SGEMM_SMALL_MATRIX_PERMIT gotoblas -> sgemm_small_matrix_permit + #endif +#define SGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sgemm_small_kernel_nn) +#define SGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sgemm_small_kernel_nt) +#define SGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sgemm_small_kernel_tn) +#define SGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sgemm_small_kernel_tt) + +#define SGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sgemm_small_kernel_b0_nn) +#define SGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sgemm_small_kernel_b0_nt) +#define SGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sgemm_small_kernel_b0_tn) +#define SGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sgemm_small_kernel_b0_tt) + + #define SGEMM_NN sgemm_nn #define SGEMM_CN sgemm_tn #define SGEMM_TN sgemm_tn diff --git a/common_z.h b/common_z.h index 6088260a1..c12d71b39 100644 --- a/common_z.h +++ b/common_z.h @@ -234,46 +234,6 @@ #define ZGEMM_SMALL_MATRIX_PERMIT zgemm_small_matrix_permit -#define ZGEMM_SMALL_KERNEL_NN zgemm_small_kernel_nn -#define ZGEMM_SMALL_KERNEL_NT zgemm_small_kernel_nt -#define ZGEMM_SMALL_KERNEL_NR zgemm_small_kernel_nr -#define ZGEMM_SMALL_KERNEL_NC zgemm_small_kernel_nc - -#define ZGEMM_SMALL_KERNEL_TN zgemm_small_kernel_tn -#define ZGEMM_SMALL_KERNEL_TT zgemm_small_kernel_tt -#define ZGEMM_SMALL_KERNEL_TR zgemm_small_kernel_tr -#define ZGEMM_SMALL_KERNEL_TC zgemm_small_kernel_tc - -#define ZGEMM_SMALL_KERNEL_RN zgemm_small_kernel_rn -#define ZGEMM_SMALL_KERNEL_RT zgemm_small_kernel_rt -#define ZGEMM_SMALL_KERNEL_RR zgemm_small_kernel_rr -#define ZGEMM_SMALL_KERNEL_RC zgemm_small_kernel_rc - -#define ZGEMM_SMALL_KERNEL_CN zgemm_small_kernel_cn -#define ZGEMM_SMALL_KERNEL_CT zgemm_small_kernel_ct -#define ZGEMM_SMALL_KERNEL_CR 
zgemm_small_kernel_cr -#define ZGEMM_SMALL_KERNEL_CC zgemm_small_kernel_cc - -#define ZGEMM_SMALL_KERNEL_B0_NN zgemm_small_kernel_b0_nn -#define ZGEMM_SMALL_KERNEL_B0_NT zgemm_small_kernel_b0_nt -#define ZGEMM_SMALL_KERNEL_B0_NR zgemm_small_kernel_b0_nr -#define ZGEMM_SMALL_KERNEL_B0_NC zgemm_small_kernel_b0_nc - -#define ZGEMM_SMALL_KERNEL_B0_TN zgemm_small_kernel_b0_tn -#define ZGEMM_SMALL_KERNEL_B0_TT zgemm_small_kernel_b0_tt -#define ZGEMM_SMALL_KERNEL_B0_TR zgemm_small_kernel_b0_tr -#define ZGEMM_SMALL_KERNEL_B0_TC zgemm_small_kernel_b0_tc - -#define ZGEMM_SMALL_KERNEL_B0_RN zgemm_small_kernel_b0_rn -#define ZGEMM_SMALL_KERNEL_B0_RT zgemm_small_kernel_b0_rt -#define ZGEMM_SMALL_KERNEL_B0_RR zgemm_small_kernel_b0_rr -#define ZGEMM_SMALL_KERNEL_B0_RC zgemm_small_kernel_b0_rc - -#define ZGEMM_SMALL_KERNEL_B0_CN zgemm_small_kernel_b0_cn -#define ZGEMM_SMALL_KERNEL_B0_CT zgemm_small_kernel_b0_ct -#define ZGEMM_SMALL_KERNEL_B0_CR zgemm_small_kernel_b0_cr -#define ZGEMM_SMALL_KERNEL_B0_CC zgemm_small_kernel_b0_cc - #else #define ZAMAX_K gotoblas -> zamax_k @@ -468,8 +428,51 @@ #define ZGEADD_K gotoblas -> zgeadd_k +#define ZGEMM_SMALL_MATRIX_PERMIT gotoblas -> zgemm_small_matrix_permit + #endif +#define ZGEMM_SMALL_KERNEL_NN FUNC_OFFSET(zgemm_small_kernel_nn) +#define ZGEMM_SMALL_KERNEL_NT FUNC_OFFSET(zgemm_small_kernel_nt) +#define ZGEMM_SMALL_KERNEL_NR FUNC_OFFSET(zgemm_small_kernel_nr) +#define ZGEMM_SMALL_KERNEL_NC FUNC_OFFSET(zgemm_small_kernel_nc) + +#define ZGEMM_SMALL_KERNEL_TN FUNC_OFFSET(zgemm_small_kernel_tn) +#define ZGEMM_SMALL_KERNEL_TT FUNC_OFFSET(zgemm_small_kernel_tt) +#define ZGEMM_SMALL_KERNEL_TR FUNC_OFFSET(zgemm_small_kernel_tr) +#define ZGEMM_SMALL_KERNEL_TC FUNC_OFFSET(zgemm_small_kernel_tc) + +#define ZGEMM_SMALL_KERNEL_RN FUNC_OFFSET(zgemm_small_kernel_rn) +#define ZGEMM_SMALL_KERNEL_RT FUNC_OFFSET(zgemm_small_kernel_rt) +#define ZGEMM_SMALL_KERNEL_RR FUNC_OFFSET(zgemm_small_kernel_rr) +#define ZGEMM_SMALL_KERNEL_RC FUNC_OFFSET(zgemm_small_kernel_rc) + +#define ZGEMM_SMALL_KERNEL_CN FUNC_OFFSET(zgemm_small_kernel_cn) +#define ZGEMM_SMALL_KERNEL_CT FUNC_OFFSET(zgemm_small_kernel_ct) +#define ZGEMM_SMALL_KERNEL_CR FUNC_OFFSET(zgemm_small_kernel_cr) +#define ZGEMM_SMALL_KERNEL_CC FUNC_OFFSET(zgemm_small_kernel_cc) + +#define ZGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(zgemm_small_kernel_b0_nn) +#define ZGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(zgemm_small_kernel_b0_nt) +#define ZGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(zgemm_small_kernel_b0_nr) +#define ZGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(zgemm_small_kernel_b0_nc) + +#define ZGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(zgemm_small_kernel_b0_tn) +#define ZGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(zgemm_small_kernel_b0_tt) +#define ZGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(zgemm_small_kernel_b0_tr) +#define ZGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(zgemm_small_kernel_b0_tc) + +#define ZGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(zgemm_small_kernel_b0_rn) +#define ZGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(zgemm_small_kernel_b0_rt) +#define ZGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(zgemm_small_kernel_b0_rr) +#define ZGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(zgemm_small_kernel_b0_rc) + +#define ZGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(zgemm_small_kernel_b0_cn) +#define ZGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(zgemm_small_kernel_b0_ct) +#define ZGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(zgemm_small_kernel_b0_cr) +#define ZGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(zgemm_small_kernel_b0_cc) + + #define ZGEMM_NN zgemm_nn #define ZGEMM_CN zgemm_cn #define ZGEMM_TN zgemm_tn diff --git a/interface/gemm.c 
b/interface/gemm.c index ad8780668..f4b9f1537 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -106,25 +106,34 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B }; #ifdef SMALL_MATRIX_OPT +#ifndef DYNAMIC_ARCH +#define SMALL_KERNEL_ADDR(table, idx) ((void *)(table[idx])) +#else +#define SMALL_KERNEL_ADDR(table, idx) ((void *)(*(uintptr_t *)((char *)gotoblas + (size_t)(table[idx])))) +#endif + #ifndef COMPLEX -static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { +static size_t gemm_small_kernel[] = { #ifndef GEMM3M - GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, NULL, NULL, - GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, NULL, NULL, + GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, 0, 0, + GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, 0, 0, #endif }; -static int (*gemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { + +static size_t gemm_small_kernel_b0[] = { #ifndef GEMM3M - GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, NULL, NULL, - GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, NULL, NULL, + GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, 0, 0, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, 0, 0, #endif }; +#define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx)) +#define GEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel, (idx)) #else -static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG) = { +static size_t zgemm_small_kernel[] = { #ifndef GEMM3M GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, @@ -133,7 +142,7 @@ static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLO #endif }; -static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = { +static size_t zgemm_small_kernel_b0[] = { #ifndef GEMM3M GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, @@ -141,6 +150,9 @@ static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLA GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, #endif }; + +#define ZGEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel, (idx)) +#define ZGEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel_b0, (idx)) #endif #endif @@ -163,7 +175,7 @@ void NAME(char *TRANSA, char *TRANSB, IFLOAT *buffer; IFLOAT *sa, *sb; -#if defined (SMP) || defined(SMALL_MATRIX_OPT) +#ifdef SMP double MNK; #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX @@ -287,11 +299,8 @@ void CNAME(enum CBLAS_ORDER order, enum 
CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS XFLOAT *buffer; XFLOAT *sa, *sb; -#if defined (SMP) || defined(SMALL_MATRIX_OPT) - double MNK; -#endif - #ifdef SMP + double MNK; #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX #ifdef XDOUBLE @@ -459,32 +468,27 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); -#if defined(SMP) || defined(SMALL_MATRIX_OPT) - MNK = (double) args.m * (double) args.n * (double) args.k; -#endif - #ifdef SMALL_MATRIX_OPT #if !defined(COMPLEX) if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){ if(*(FLOAT *)(args.beta) == 0.0){ - (gemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); + (GEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); }else{ - (gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); + (GEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); } return; } #else if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, alpha[0], alpha[1], beta[0], beta[1])){ if(beta[0] == 0.0 && beta[1] == 0.0){ - (zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc); + (ZGEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc); }else{ - (zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc); + (ZGEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc); } return; } #endif #endif - buffer = (XFLOAT *)blas_memory_alloc(0); @@ -497,7 +501,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS mode |= (transb << BLAS_TRANSB_SHIFT); #endif - + MNK = (double) args.m * (double) args.n * (double) args.k; if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) args.nthreads = 1; else diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 1e846a61c..f303d0dc6 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -171,6 +171,14 @@ gotoblas_t TABLE_NAME = { sgemm_oncopyTS, sgemm_otcopyTS, #endif +#if BUILD_SINGLE == 1 +#ifdef SMALL_MATRIX_OPT + sgemm_small_matrix_permitTS, + sgemm_small_kernel_nnTS, sgemm_small_kernel_ntTS, sgemm_small_kernel_tnTS, sgemm_small_kernel_ttTS, + sgemm_small_kernel_b0_nnTS, sgemm_small_kernel_b0_ntTS, sgemm_small_kernel_b0_tnTS, sgemm_small_kernel_b0_ttTS, +#endif +#endif + #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N @@ -257,6 +265,11 @@ gotoblas_t TABLE_NAME = { #endif #if (BUILD_DOUBLE==1) +#ifdef SMALL_MATRIX_OPT + dgemm_small_matrix_permitTS, + dgemm_small_kernel_nnTS, dgemm_small_kernel_ntTS, dgemm_small_kernel_tnTS, dgemm_small_kernel_ttTS, + dgemm_small_kernel_b0_nnTS, dgemm_small_kernel_b0_ntTS, 
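/* [Editor's note -- sketch, not part of the original patch. These TS-suffixed
 * entries populate the gotoblas_t slots that FUNC_OFFSET addresses. Under
 * DYNAMIC_ARCH the static tables in interface/gemm.c above store byte
 * offsets rather than function pointers, and a call resolves roughly as:
 *
 *   size_t off = (size_t)(&((gotoblas_t *)NULL)->dgemm_small_kernel_nn);
 *   int (*fn)() = (int (*)()) *(uintptr_t *)((char *)gotoblas + off);
 *
 * so a single table serves whichever core-specific gotoblas table the
 * runtime CPU detection installed.]
 */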
dgemm_small_kernel_b0_tnTS, dgemm_small_kernel_b0_ttTS, +#endif dtrsm_kernel_LNTS, dtrsm_kernel_LTTS, dtrsm_kernel_RNTS, dtrsm_kernel_RTTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dtrsm_iunucopyTS, dtrsm_iunncopyTS, dtrsm_iutucopyTS, dtrsm_iutncopyTS, @@ -389,6 +402,18 @@ gotoblas_t TABLE_NAME = { #endif cgemm_oncopyTS, cgemm_otcopyTS, +#ifdef SMALL_MATRIX_OPT + cgemm_small_matrix_permitTS, + cgemm_small_kernel_nnTS, cgemm_small_kernel_ntTS, cgemm_small_kernel_nrTS, cgemm_small_kernel_ncTS, + cgemm_small_kernel_tnTS, cgemm_small_kernel_ttTS, cgemm_small_kernel_trTS, cgemm_small_kernel_tcTS, + cgemm_small_kernel_rnTS, cgemm_small_kernel_rtTS, cgemm_small_kernel_rrTS, cgemm_small_kernel_rcTS, + cgemm_small_kernel_cnTS, cgemm_small_kernel_ctTS, cgemm_small_kernel_crTS, cgemm_small_kernel_ccTS, + cgemm_small_kernel_b0_nnTS, cgemm_small_kernel_b0_ntTS, cgemm_small_kernel_b0_nrTS, cgemm_small_kernel_b0_ncTS, + cgemm_small_kernel_b0_tnTS, cgemm_small_kernel_b0_ttTS, cgemm_small_kernel_b0_trTS, cgemm_small_kernel_b0_tcTS, + cgemm_small_kernel_b0_rnTS, cgemm_small_kernel_b0_rtTS, cgemm_small_kernel_b0_rrTS, cgemm_small_kernel_b0_rcTS, + cgemm_small_kernel_b0_cnTS, cgemm_small_kernel_b0_ctTS, cgemm_small_kernel_b0_crTS, cgemm_small_kernel_b0_ccTS, +#endif + ctrsm_kernel_LNTS, ctrsm_kernel_LTTS, ctrsm_kernel_LRTS, ctrsm_kernel_LCTS, ctrsm_kernel_RNTS, ctrsm_kernel_RTTS, ctrsm_kernel_RRTS, ctrsm_kernel_RCTS, @@ -533,6 +558,18 @@ gotoblas_t TABLE_NAME = { #endif zgemm_oncopyTS, zgemm_otcopyTS, +#ifdef SMALL_MATRIX_OPT + zgemm_small_matrix_permitTS, + zgemm_small_kernel_nnTS, zgemm_small_kernel_ntTS, zgemm_small_kernel_nrTS, zgemm_small_kernel_ncTS, + zgemm_small_kernel_tnTS, zgemm_small_kernel_ttTS, zgemm_small_kernel_trTS, zgemm_small_kernel_tcTS, + zgemm_small_kernel_rnTS, zgemm_small_kernel_rtTS, zgemm_small_kernel_rrTS, zgemm_small_kernel_rcTS, + zgemm_small_kernel_cnTS, zgemm_small_kernel_ctTS, zgemm_small_kernel_crTS, zgemm_small_kernel_ccTS, + zgemm_small_kernel_b0_nnTS, zgemm_small_kernel_b0_ntTS, zgemm_small_kernel_b0_nrTS, zgemm_small_kernel_b0_ncTS, + zgemm_small_kernel_b0_tnTS, zgemm_small_kernel_b0_ttTS, zgemm_small_kernel_b0_trTS, zgemm_small_kernel_b0_tcTS, + zgemm_small_kernel_b0_rnTS, zgemm_small_kernel_b0_rtTS, zgemm_small_kernel_b0_rrTS, zgemm_small_kernel_b0_rcTS, + zgemm_small_kernel_b0_cnTS, zgemm_small_kernel_b0_ctTS, zgemm_small_kernel_b0_crTS, zgemm_small_kernel_b0_ccTS, +#endif + ztrsm_kernel_LNTS, ztrsm_kernel_LTTS, ztrsm_kernel_LRTS, ztrsm_kernel_LCTS, ztrsm_kernel_RNTS, ztrsm_kernel_RTTS, ztrsm_kernel_RRTS, ztrsm_kernel_RCTS, From fee5abd84bf01aba7a2223f7264fcc7da66d1b20 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 4 Aug 2021 08:50:15 +0000 Subject: [PATCH 387/681] Small Matrix: support cmake build --- cmake/system.cmake | 4 ++ kernel/CMakeLists.txt | 110 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index f8bd6678e..e51dc1fdc 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -258,6 +258,10 @@ if (NEED_PIC) endif() endif () +if (SMALL_MATRIX_OPT) + set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT") +endif () + if (DYNAMIC_ARCH) if (X86 OR X86_64 OR ARM64 OR PPC) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index f0793bdef..769a73b91 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -458,7 +458,117 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) 
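# Note on the pattern that follows: every small-matrix kernel slot first checks
# whether the active TARGET_CORE's KERNEL file already provided a source
# (<x>GEMM_SMALL_M_PERMIT, <x>GEMM_SMALL_K_NN, ...); if not, it falls back to
# the generic C implementation, choosing the zgemm_small_matrix_* file for the
# complex types (C/Z) and the real gemm_small_matrix_* file otherwise.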
GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false ${float_type}) + if (NOT DEFINED ${float_char}GEMM_SMALL_M_PERMIT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_M_PERMIT ../generic/zgemm_small_matrix_permit.c) + else () + set(${float_char}GEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_NN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_NN ../generic/zgemm_small_matrix_kernel_nn.c) + else () + set(${float_char}GEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_NT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_NT ../generic/zgemm_small_matrix_kernel_nt.c) + else () + set(${float_char}GEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_TN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_TN ../generic/zgemm_small_matrix_kernel_tn.c) + else () + set(${float_char}GEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_TT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_TT ../generic/zgemm_small_matrix_kernel_tt.c) + else () + set(${float_char}GEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_NN ../generic/zgemm_small_matrix_kernel_b0_nn.c) + else () + set(${float_char}GEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_b0_nn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_NT ../generic/zgemm_small_matrix_kernel_b0_nt.c) + else () + set(${float_char}GEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_b0_nt.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_TN ../generic/zgemm_small_matrix_kernel_b0_tn.c) + else () + set(${float_char}GEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_b0_tn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_TT ../generic/zgemm_small_matrix_kernel_b0_tt.c) + else () + set(${float_char}GEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_b0_tt.c) + endif () + endif () + if (SMALL_MATRIX_OPT) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false ${float_type}) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "NN" "gemm_small_kernel_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "NR" "gemm_small_kernel_nr" false "" "" false ${float_type}) + 
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "RN" "gemm_small_kernel_rn" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "RR" "gemm_small_kernel_rr" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "NT" "gemm_small_kernel_nt" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "NC" "gemm_small_kernel_nc" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "RT" "gemm_small_kernel_rt" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "RC" "gemm_small_kernel_rc" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "TN" "gemm_small_kernel_tn" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "TR" "gemm_small_kernel_tr" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "CN" "gemm_small_kernel_cn" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "CR" "gemm_small_kernel_cr" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TT" "gemm_small_kernel_tt" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TC" "gemm_small_kernel_tc" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CT" "gemm_small_kernel_ct" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CC" "gemm_small_kernel_cc" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NN" "gemm_small_kernel_b0_nn" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NR" "gemm_small_kernel_b0_nr" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RN" "gemm_small_kernel_b0_rn" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RR" "gemm_small_kernel_b0_rr" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NT" "gemm_small_kernel_b0_nt" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NC" "gemm_small_kernel_b0_nc" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RT" "gemm_small_kernel_b0_rt" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RC" "gemm_small_kernel_b0_rc" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TN" "gemm_small_kernel_b0_tn" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TR" "gemm_small_kernel_b0_tr" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CN" "gemm_small_kernel_b0_cn" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CR" "gemm_small_kernel_b0_cr" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TT" "gemm_small_kernel_b0_tt" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TC" "gemm_small_kernel_b0_tc" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CT" "gemm_small_kernel_b0_ct" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CC" "gemm_small_kernel_b0_cc" false "" "" false ${float_type})
+
+      else ()
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "" "gemm_small_kernel_b0_nn" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "" "gemm_small_kernel_b0_nt" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "" "gemm_small_kernel_b0_tn" false "" "" false ${float_type})
+        GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "" "gemm_small_kernel_b0_tt" false "" "" false ${float_type})
+      endif ()
+    endif ()
 
     if (NOT DEFINED ${float_char}OMATCOPY_CN)
       if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C")

From aa50185647ba6966dcdb731372af2ecd5ae3b1d4 Mon Sep 17 00:00:00 2001
From: Wangyang Guo
Date: Thu, 5 Aug 2021 02:45:53 +0000
Subject: [PATCH 388/681] Small Matrix: better handling of the GEMM3M macro

---
 interface/gemm.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/interface/gemm.c b/interface/gemm.c
index f4b9f1537..775f654c3 100644
--- a/interface/gemm.c
+++ b/interface/gemm.c
@@ -105,6 +105,7 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B
 #endif
 };
 
+#ifndef GEMM3M
 #ifdef SMALL_MATRIX_OPT
 #ifndef DYNAMIC_ARCH
 #define SMALL_KERNEL_ADDR(table, idx) ((void *)(table[idx]))
@@ -115,18 +116,14 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B
 
 #ifndef COMPLEX
 static size_t gemm_small_kernel[] = {
-#ifndef GEMM3M
 	GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, 0, 0,
 	GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, 0, 0,
-#endif
 };
 
 static size_t gemm_small_kernel_b0[] = {
-#ifndef GEMM3M
 	GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, 0, 0,
 	GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, 0, 0,
-#endif
 };
 
 #define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx))
@@ -134,27 +131,24 @@ static size_t gemm_small_kernel_b0[] = {
 #else
 static size_t zgemm_small_kernel[] = {
-#ifndef GEMM3M
 	GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN,
 	GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT,
 	GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR,
 	GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC,
-#endif
 };
 
 static size_t zgemm_small_kernel_b0[] = {
-#ifndef GEMM3M
 	GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN,
GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN,
 	GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT,
 	GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR,
 	GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC,
-#endif
 };
 
 #define ZGEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel, (idx))
 #define ZGEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel_b0, (idx))
 #endif
 #endif
+#endif
 
 #ifndef CBLAS
 
@@ -468,6 +462,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
 
   FUNCTION_PROFILE_START();
 
+#ifndef GEMM3M
 #ifdef SMALL_MATRIX_OPT
 #if !defined(COMPLEX)
   if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){
@@ -488,6 +483,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
     return;
   }
 #endif
+#endif
 #endif
 
   buffer = (XFLOAT *)blas_memory_alloc(0);

From 76ea8db4da1a651bb4de744162de1ecfc6762e7c Mon Sep 17 00:00:00 2001
From: Wangyang Guo
Date: Thu, 5 Aug 2021 02:57:58 +0000
Subject: [PATCH 389/681] Small Matrix: enable by default for x86_64 arch

If no customized GEMM_SMALL_M_PERMIT kernel is defined, it will simply
bypass to the normal path.
---
 Makefile.system    | 3 +++
 cmake/system.cmake | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index 20d8d2f2a..20db80d07 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -245,6 +245,9 @@ ONLY_CBLAS = 0
 endif
 
 #For small matrix optimization
+ifeq ($(ARCH), x86_64)
+SMALL_MATRIX_OPT = 1
+endif
 ifeq ($(SMALL_MATRIX_OPT), 1)
 CCOMMON_OPT += -DSMALL_MATRIX_OPT
 endif
diff --git a/cmake/system.cmake b/cmake/system.cmake
index e51dc1fdc..7d2672998 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -258,6 +258,9 @@ if (NEED_PIC)
   endif()
 endif ()
 
+if (X86_64)
+  set(SMALL_MATRIX_OPT TRUE)
+endif ()
 if (SMALL_MATRIX_OPT)
   set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT")
 endif ()

From 5d86becdaec262e8a2869ce909d94bec881fbfb6 Mon Sep 17 00:00:00 2001
From: "Chen, Guobing"
Date: Thu, 5 Aug 2021 11:11:14 +0800
Subject: [PATCH 390/681] Add all SBGEMM kernels for IA AVX512-BF16 based platforms

Added all SBGEMM kernels, including NN/NT/TN/TT for both ColMajor and
RowMajor, based on the AVX512-BF16 ISA on IA.

Signed-off-by: Chen, Guobing
---
 kernel/x86_64/bf16_common_macros.h            |   52 +
 .../x86_64/sbgemm_block_microk_cooperlake.c   | 2024 ++++++++++++++---
 .../sbgemm_microk_cooperlake_template.c       | 1737 +++++++++++---
 3 files changed, 3268 insertions(+), 545 deletions(-)

diff --git a/kernel/x86_64/bf16_common_macros.h b/kernel/x86_64/bf16_common_macros.h
index 1014ecc4d..78db7abb2 100644
--- a/kernel/x86_64/bf16_common_macros.h
+++ b/kernel/x86_64/bf16_common_macros.h
@@ -29,6 +29,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
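/* For reference, the inline-asm broadcast added below is functionally the
 * same as the plain intrinsic sequence in this sketch (illustrative only,
 * not part of the patch): it loads one 32-bit lane -- i.e. a pair of
 * adjacent bfloat16 values -- and repeats it across all 16 lanes of a zmm
 * register. The asm form pins this to a single memory-operand vpbroadcastd.
 */
static inline __m512i mm512_broadcastd_from_mem(const void *addr)  /* hypothetical helper */
{
    /* broadcast the 32-bit value at addr to every lane */
    return _mm512_set1_epi32(*(const int *)addr);
}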
 #include <immintrin.h>
 
+#define _MM512_BROADCASTD_EPI32(addr, zmm)     \
+    __asm__ ("vpbroadcastd (%1), %0;"          \
+             : "=v" (zmm)                      \
+             : "r"  (addr) )
+
+#define PREFETCH_T0(addr)                      \
+    __asm__ ("prefetcht0 (%0);"                \
+             :                                 \
+             : "r"  (addr) )
+
 #define EXTRACT_LOW_256_FROM_512_2X(reg256, reg512)    \
     reg256##_0 = _mm512_castps512_ps256(reg512##_0);   \
     reg256##_1 = _mm512_castps512_ps256(reg512##_1);
@@ -721,6 +731,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     _mm_mask_storeu_ps(targetAddr, mask, regResult);
 
 
+/* Store 16 (result + y) to y
+*/
+#define STORE16_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr)             \
+    regResult = _mm512_add_ps(regResult, _mm512_loadu_ps(targetAddr));     \
+    _mm512_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 16 (result + y) to y
+*/
+#define STORE16_MASK_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr, mask)          \
+    regResult = _mm512_add_ps(regResult, _mm512_maskz_loadu_ps(mask, targetAddr)); \
+    _mm512_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 8 (result + y) to y
+*/
+#define STORE8_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr)              \
+    regResult = _mm256_add_ps(regResult, _mm256_loadu_ps(targetAddr));     \
+    _mm256_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 8 (result + y) to y
+*/
+#define STORE8_MASK_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr, mask)           \
+    regResult = _mm256_add_ps(regResult, _mm256_maskz_loadu_ps(mask, targetAddr)); \
+    _mm256_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 4 (result + y) to y
+*/
+#define STORE4_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr)              \
+    regResult = _mm_add_ps(regResult, _mm_loadu_ps(targetAddr));           \
+    _mm_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 4 (result + y) to y
+*/
+#define STORE4_MASK_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr, mask)           \
+    regResult = _mm_add_ps(regResult, _mm_maskz_loadu_ps(mask, targetAddr));       \
+    _mm_mask_storeu_ps(targetAddr, mask, regResult);
+
+
 /* Store 16 (alpha * result) to y
 */
 #define STORE16_COMPLETE_RESULT_ALPHA(regResult, targetAddr)    \
diff --git a/kernel/x86_64/sbgemm_block_microk_cooperlake.c b/kernel/x86_64/sbgemm_block_microk_cooperlake.c
index 2376fed02..147c5ebdd 100644
--- a/kernel/x86_64/sbgemm_block_microk_cooperlake.c
+++ b/kernel/x86_64/sbgemm_block_microk_cooperlake.c
@@ -1,4 +1,4 @@
-#include "sbgemm.h"
+//#include "sbgemm.h"
 #include <immintrin.h>
 
 // Work around those intrinsics that are missed by the compiler
 #define MM256_LOADU_EPI16(addr) \
     _mm256_maskz_loadu_epi16(~0, (addr))
@@ -7,420 +7,1878 @@
 #define MM256_STOREU_EPI16(addr, reg) \
     _mm256_mask_storeu_epi16((addr), ~0, (reg))
 
-#include <stdio.h>
-void print_block(BLASLONG m, BLASLONG n, bfloat16 * mat)
-{
-    printf("---- BLOCK %ld x %ld ----\n", m, n);
-    for (BLASLONG i=0; i> (32-m));
     __m512i array512_0, array512_1, array512_2, array512_3;
 
-    BLASLONG idx_src_base0, idx_src_base1;
-    BLASLONG idx_target_base0, idx_target_base1;
+    bfloat16 * src_addr0, * src_addr1;
+    bfloat16 * dst_addr0, * dst_addr1;
 
     BLASLONG LDA_2x = 2*lda;
     BLASLONG BF16_BLOCK_T_M_2x = 2*32;
-    idx_src_base0 = 0;
-    idx_src_base1 = lda;
-    idx_target_base0 = 0;
-    idx_target_base1 = 32;
-    for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) {
-        array512_0 = _mm512_loadu_si512(&A[idx_src_base0]);
-        array512_1 = _mm512_loadu_si512(&A[idx_src_base1]);
-        array512_2 = _mm512_unpacklo_epi16(array512_0, array512_1);
-        array512_3 = _mm512_unpackhi_epi16(array512_0, array512_1);
-        _mm512_storeu_si512(&block_A[idx_target_base0], array512_2);
-        _mm512_storeu_si512(&block_A[idx_target_base1], array512_3);
-
-        idx_src_base0 += LDA_2x;
-        idx_src_base1 += LDA_2x;
-        idx_target_base0 += BF16_BLOCK_T_M_2x;
-
idx_target_base1 += BF16_BLOCK_T_M_2x; - } - - if (tag_k_2x != k) { - __m512i ZERO512 = _mm512_setzero_si512(); - array512_0 = _mm512_loadu_si512(&A[idx_src_base0]); - array512_2 = _mm512_unpacklo_epi16(array512_0, ZERO512); - array512_3 = _mm512_unpackhi_epi16(array512_0, ZERO512); - _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); - _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); - } - -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); -#endif -} - -void COL_MAJOR_INCOPY_KERNEL_Kx32m(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) -{ - BLASLONG tag_k_2x = k & (~1); - unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-m)); - __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); - - __m512i array512_0, array512_1, array512_2, array512_3; - BLASLONG idx_src_base0, idx_src_base1; - BLASLONG idx_target_base0, idx_target_base1; + src_addr0 = A; + src_addr1 = A + lda; + dst_addr0 = block_A; + dst_addr1 = block_A + 32; - BLASLONG LDA_2x = 2*lda; - BLASLONG BF16_BLOCK_T_M_2x = 2*32; - idx_src_base0 = 0; - idx_src_base1 = lda; - idx_target_base0 = 0; - idx_target_base1 = 32; for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { - array512_0 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); - array512_1 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base1]); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr1); array512_2 = _mm512_unpacklo_epi16(array512_0, array512_1); array512_3 = _mm512_unpackhi_epi16(array512_0, array512_1); - _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); - _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); - idx_src_base0 += LDA_2x; - idx_src_base1 += LDA_2x; - idx_target_base0 += BF16_BLOCK_T_M_2x; - idx_target_base1 += BF16_BLOCK_T_M_2x; + src_addr0 += LDA_2x; + src_addr1 += LDA_2x; + dst_addr0 += BF16_BLOCK_T_M_2x; + dst_addr1 += BF16_BLOCK_T_M_2x; } if (tag_k_2x != k) { __m512i ZERO512 = _mm512_setzero_si512(); - array512_0 = _mm512_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0); array512_2 = _mm512_unpacklo_epi16(array512_0, ZERO512); array512_3 = _mm512_unpackhi_epi16(array512_0, ZERO512); - _mm512_storeu_si512(&block_A[idx_target_base0], array512_2); - _mm512_storeu_si512(&block_A[idx_target_base1], array512_3); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); } - -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); -#endif } +// INCOPY Kernel, 0> (16-m)); __m256i array256_0, array256_1, array256_2, array256_3; - BLASLONG idx_src_base0, idx_src_base1; - BLASLONG idx_target_base0; + bfloat16 * src_addr0, * src_addr1; + bfloat16 * dst_addr0; BLASLONG LDA_2x = 2*lda; - idx_src_base0 = 0; - idx_src_base1 = lda; - idx_target_base0 = 0; + + src_addr0 = A; + src_addr1 = A + lda; + dst_addr0 = block_A; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { - array256_0 = MM256_LOADU_EPI16(&A[idx_src_base0]); - array256_1 = MM256_LOADU_EPI16(&A[idx_src_base1]); + array256_0 = _mm256_maskz_loadu_epi16(tail_mask, src_addr0); + array256_1 = _mm256_maskz_loadu_epi16(tail_mask, src_addr1); array256_2 = _mm256_unpacklo_epi16(array256_0, array256_1); array256_3 = _mm256_unpackhi_epi16(array256_0, array256_1); // Store in one row of block_B - 
MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); - MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + MM256_STOREU_EPI16(dst_addr0, array256_2); + MM256_STOREU_EPI16(dst_addr0+16, array256_3); - idx_src_base0 += LDA_2x; - idx_src_base1 += LDA_2x; - idx_target_base0 += 32; + src_addr0 += LDA_2x; + src_addr1 += LDA_2x; + dst_addr0 += 32; } if (tag_k_2x != k) { __m256i ZERO256 = _mm256_setzero_si256(); - array256_0 = MM256_LOADU_EPI16(&A[idx_src_base0]); + array256_0 = _mm256_maskz_loadu_epi16(tail_mask, src_addr0); array256_2 = _mm256_unpacklo_epi16(array256_0, ZERO256); array256_3 = _mm256_unpackhi_epi16(array256_0, ZERO256); // Store in one row of block_B - MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); - MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + MM256_STOREU_EPI16(dst_addr0, array256_2); + MM256_STOREU_EPI16(dst_addr0+16, array256_3); } +} -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); -#endif +// K=32, M=16 +void COL_MAJOR_ITCOPY_KERNEL_32x16(bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG LDA_4x = lda*4; + + src_addr0 = A; + src_addr1 = A + lda; + src_addr2 = A + lda*2; + src_addr3 = A + lda*3; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*8; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; + __m512i array512_way2_0, array512_way2_1, array512_way2_2, array512_way2_3; + __m512i array512_way3_0, array512_way3_1, array512_way3_2, array512_way3_3; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + // Load and preprocess 1st 4 rows + array512_way0_0 = _mm512_loadu_si512(src_addr0); + array512_way0_1 = _mm512_loadu_si512(src_addr1); + array512_way0_2 = _mm512_loadu_si512(src_addr2); + array512_way0_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way0_0, array512_way0_1); + array512_1 = _mm512_unpackhi_epi32(array512_way0_0, array512_way0_1); + array512_2 = _mm512_unpacklo_epi32(array512_way0_2, array512_way0_3); + array512_3 = _mm512_unpackhi_epi32(array512_way0_2, array512_way0_3); + array512_way0_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way0_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way0_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way0_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 2nd 4 rows + array512_way1_0 = _mm512_loadu_si512(src_addr0); + array512_way1_1 = _mm512_loadu_si512(src_addr1); + array512_way1_2 = _mm512_loadu_si512(src_addr2); + array512_way1_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way1_0, array512_way1_1); + array512_1 = _mm512_unpackhi_epi32(array512_way1_0, array512_way1_1); + array512_2 = _mm512_unpacklo_epi32(array512_way1_2, array512_way1_3); + array512_3 = _mm512_unpackhi_epi32(array512_way1_2, array512_way1_3); + array512_way1_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way1_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way1_2 = 
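/* (Note on the permute constants defined above: permute_lo_idx holds the
   qword indices {0,1,8,9,4,5,12,13}, and in _mm512_permutex2var_epi64
   indices 0-7 select qwords of the first operand while 8-15 select qwords
   of the second. The result is therefore a0|a1, b0|b1, a4|a5, b4|b5, i.e.
   the even 128-bit lanes of the two sources interleaved; permute_hi_idx
   (the same indices plus 2) does the same for the odd 128-bit lanes.) */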
_mm512_unpacklo_epi64(array512_1, array512_3); + array512_way1_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 3rd 4 rows + array512_way2_0 = _mm512_loadu_si512(src_addr0); + array512_way2_1 = _mm512_loadu_si512(src_addr1); + array512_way2_2 = _mm512_loadu_si512(src_addr2); + array512_way2_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way2_0, array512_way2_1); + array512_1 = _mm512_unpackhi_epi32(array512_way2_0, array512_way2_1); + array512_2 = _mm512_unpacklo_epi32(array512_way2_2, array512_way2_3); + array512_3 = _mm512_unpackhi_epi32(array512_way2_2, array512_way2_3); + array512_way2_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way2_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way2_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way2_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 4th 4 rows + array512_way3_0 = _mm512_loadu_si512(src_addr0); + array512_way3_1 = _mm512_loadu_si512(src_addr1); + array512_way3_2 = _mm512_loadu_si512(src_addr2); + array512_way3_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way3_0, array512_way3_1); + array512_1 = _mm512_unpackhi_epi32(array512_way3_0, array512_way3_1); + array512_2 = _mm512_unpacklo_epi32(array512_way3_2, array512_way3_3); + array512_3 = _mm512_unpackhi_epi32(array512_way3_2, array512_way3_3); + array512_way3_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way3_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way3_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way3_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Compose and store the 0/1 and 16/17 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_lo_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_lo_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 2/3 and 18/19 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_lo_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_lo_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 4/5 and 20/21 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_lo_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_lo_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and 
store the 6/7 and 22/23 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_lo_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_lo_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 8/9 and 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_hi_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_hi_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 10/11 and 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_hi_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_hi_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 12/13 and 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_hi_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_hi_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 14/15 and 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_hi_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_hi_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); } -void COL_MAJOR_INCOPY_KERNEL_Kx16m(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +// K=Any number but will be processed based on 32, M=32 +void COL_MAJOR_ITCOPY_KERNEL_Kx32(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) { - BLASLONG tag_k_2x = k & (~1); - unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; - __m256i array256_0, array256_1, array256_2, array256_3; + BLASLONG tag_k_32x = k & (~31); - BLASLONG idx_src_base0, idx_src_base1; - BLASLONG idx_target_base0; + BLASLONG LDA_4x = lda*4; + BLASLONG LDA_8x = lda*8; + BLASLONG LDA_12x = lda*12; + BLASLONG LDA_16x = lda*16; - BLASLONG LDA_2x = 2*lda; - idx_src_base0 = 0; - idx_src_base1 = lda; - 
idx_target_base0 = 0; - for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { - array256_0 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); - array256_1 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base1]); - array256_2 = _mm256_unpacklo_epi16(array256_0, array256_1); - array256_3 = _mm256_unpackhi_epi16(array256_0, array256_1); - // Store in one row of block_B - MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); - MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + src_addr0 = A; + src_addr1 = A + lda; + src_addr2 = A + lda*2; + src_addr3 = A + lda*3; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*16; - idx_src_base0 += LDA_2x; - idx_src_base1 += LDA_2x; - idx_target_base0 += 32; + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; + __m512i array512_way2_0, array512_way2_1, array512_way2_2, array512_way2_3; + __m512i array512_way3_0, array512_way3_1, array512_way3_2, array512_way3_3; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + for (int i = 0; i < 2; i++) { + // Load and preprocess 1st 4 rows + array512_way0_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_way0_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_way0_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_way0_3 = _mm512_loadu_si512(src_addr3+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way0_0, array512_way0_1); + array512_1 = _mm512_unpackhi_epi32(array512_way0_0, array512_way0_1); + array512_2 = _mm512_unpacklo_epi32(array512_way0_2, array512_way0_3); + array512_3 = _mm512_unpackhi_epi32(array512_way0_2, array512_way0_3); + array512_way0_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way0_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way0_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way0_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 2nd 4 rows + array512_way1_0 = _mm512_loadu_si512(src_addr0+LDA_4x+idx_k); + array512_way1_1 = _mm512_loadu_si512(src_addr1+LDA_4x+idx_k); + array512_way1_2 = _mm512_loadu_si512(src_addr2+LDA_4x+idx_k); + array512_way1_3 = _mm512_loadu_si512(src_addr3+LDA_4x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way1_0, array512_way1_1); + array512_1 = _mm512_unpackhi_epi32(array512_way1_0, array512_way1_1); + array512_2 = _mm512_unpacklo_epi32(array512_way1_2, array512_way1_3); + array512_3 = _mm512_unpackhi_epi32(array512_way1_2, array512_way1_3); + array512_way1_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way1_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way1_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way1_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 3rd 4 rows + array512_way2_0 = _mm512_loadu_si512(src_addr0+LDA_8x+idx_k); + array512_way2_1 = _mm512_loadu_si512(src_addr1+LDA_8x+idx_k); + array512_way2_2 = _mm512_loadu_si512(src_addr2+LDA_8x+idx_k); + array512_way2_3 = _mm512_loadu_si512(src_addr3+LDA_8x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way2_0, array512_way2_1); + array512_1 = _mm512_unpackhi_epi32(array512_way2_0, array512_way2_1); + array512_2 = _mm512_unpacklo_epi32(array512_way2_2, 
array512_way2_3); + array512_3 = _mm512_unpackhi_epi32(array512_way2_2, array512_way2_3); + array512_way2_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way2_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way2_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way2_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 4th 4 rows + array512_way3_0 = _mm512_loadu_si512(src_addr0+LDA_12x+idx_k); + array512_way3_1 = _mm512_loadu_si512(src_addr1+LDA_12x+idx_k); + array512_way3_2 = _mm512_loadu_si512(src_addr2+LDA_12x+idx_k); + array512_way3_3 = _mm512_loadu_si512(src_addr3+LDA_12x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way3_0, array512_way3_1); + array512_1 = _mm512_unpackhi_epi32(array512_way3_0, array512_way3_1); + array512_2 = _mm512_unpacklo_epi32(array512_way3_2, array512_way3_3); + array512_3 = _mm512_unpackhi_epi32(array512_way3_2, array512_way3_3); + array512_way3_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way3_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way3_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way3_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Compose and store the 0/1 and 16/17 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_lo_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_lo_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 2/3 and 18/19 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_lo_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_lo_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 4/5 and 20/21 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_lo_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_lo_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 6/7 and 22/23 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_lo_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_lo_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 8/9 and 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_hi_idx, array512_way1_0); + array512_1 = 
_mm512_permutex2var_epi64(array512_way2_0, permute_hi_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 10/11 and 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_hi_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_hi_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 12/13 and 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_hi_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_hi_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 14/15 and 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_hi_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_hi_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + + src_addr0 += LDA_16x; + src_addr1 += LDA_16x; + src_addr2 += LDA_16x; + src_addr3 += LDA_16x; + dst_addr0 -= (64*7 - 32); + dst_addr1 -= (64*7 - 32); + } + src_addr0 -= (LDA_16x*2); + src_addr1 -= (LDA_16x*2); + src_addr2 -= (LDA_16x*2); + src_addr3 -= (LDA_16x*2); + dst_addr0 += (32*30); + dst_addr1 += (32*30); } - if (tag_k_2x != k) { - __m256i ZERO256 = _mm256_setzero_si256(); - array256_0 = _mm256_maskz_loadu_epi16(tail_mask, &A[idx_src_base0]); - array256_2 = _mm256_unpacklo_epi16(array256_0, ZERO256); - array256_3 = _mm256_unpackhi_epi16(array256_0, ZERO256); - // Store in one row of block_B - MM256_STOREU_EPI16(&block_A[idx_target_base0], array256_2); - MM256_STOREU_EPI16(&block_A[idx_target_base0 + 16], array256_3); + if (tag_k_32x != k) { + int k_rem = k - tag_k_32x; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + __m512i array512[16]; + + bfloat16 * dst_addr_tmp = dst_addr0; + + for (int i = 0; i < 2; i++) { + // Load and preprocess 1st 4 rows + array512[0] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[1] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[2] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[3] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[0], array512[1]); + array512_1 = _mm512_unpackhi_epi32(array512[0], array512[1]); + array512_2 = _mm512_unpacklo_epi32(array512[2], array512[3]); + array512_3 = _mm512_unpackhi_epi32(array512[2], array512[3]); + 
array512[0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[3] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 2nd 4 rows + array512[4] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[5] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[6] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[7] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[4], array512[5]); + array512_1 = _mm512_unpackhi_epi32(array512[4], array512[5]); + array512_2 = _mm512_unpacklo_epi32(array512[6], array512[7]); + array512_3 = _mm512_unpackhi_epi32(array512[6], array512[7]); + array512[4] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[5] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[6] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[7] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 3rd 4 rows + array512[8] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[9] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[10] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[11] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[8], array512[9]); + array512_1 = _mm512_unpackhi_epi32(array512[8], array512[9]); + array512_2 = _mm512_unpacklo_epi32(array512[10], array512[11]); + array512_3 = _mm512_unpackhi_epi32(array512[10], array512[11]); + array512[8] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[9] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[10] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[11] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 4th 4 rows + array512[12] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[13] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[14] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[15] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[12], array512[13]); + array512_1 = _mm512_unpackhi_epi32(array512[12], array512[13]); + array512_2 = _mm512_unpacklo_epi32(array512[14], array512[15]); + array512_3 = _mm512_unpackhi_epi32(array512[14], array512[15]); + array512[12] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[13] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[14] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[15] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // array512_01_1617_0, array512_01_1617_1, array512_89_2425_0, array512_89_2425_1; + // Half-compose of 0/1, 16/17, 8/9, 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512[0], permute_lo_idx, array512[4]); + array512_1 = _mm512_permutex2var_epi64(array512[8], permute_lo_idx, array512[12]); + array512_2 = _mm512_permutex2var_epi64(array512[0], permute_hi_idx, 
array512[4]); + array512_3 = _mm512_permutex2var_epi64(array512[8], permute_hi_idx, array512[12]); + array512[0] = array512_0; // 1st 8 pairs of col 0/1, and 1st 8 pairs of col 16/17 + array512[4] = array512_1; // 2nd 8 pairs of col 0/1, and 2nd 8 pairs of col 16/17 + array512[8] = array512_2; // 1st 8 pairs of col 8/9, and 1st 8 pairs of col 24/25 + array512[12] = array512_3; // 2nd 8 pairs of col 8/9, and 2nd 8 pairs of col 24/25 + + // Half-compose of 2/3, 18/19, 10/11, 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512[1], permute_lo_idx, array512[5]); + array512_1 = _mm512_permutex2var_epi64(array512[9], permute_lo_idx, array512[13]); + array512_2 = _mm512_permutex2var_epi64(array512[1], permute_hi_idx, array512[5]); + array512_3 = _mm512_permutex2var_epi64(array512[9], permute_hi_idx, array512[13]); + array512[1] = array512_0; // 1st 8 pairs of col 2/3, and 1st 8 pairs of col 18/19 + array512[5] = array512_1; // 2nd 8 pairs of col 2/3, and 2nd 8 pairs of col 18/19 + array512[9] = array512_2; // 1st 8 pairs of col 10/11, and 1st 8 pairs of col 26/27 + array512[13] = array512_3; // 2nd 8 pairs of col 10/11, and 2nd 8 pairs of col 26/27 + + // Half-compose of 4/5, 20/21, 12/13, 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512[2], permute_lo_idx, array512[6]); + array512_1 = _mm512_permutex2var_epi64(array512[10], permute_lo_idx, array512[14]); + array512_2 = _mm512_permutex2var_epi64(array512[2], permute_hi_idx, array512[6]); + array512_3 = _mm512_permutex2var_epi64(array512[10], permute_hi_idx, array512[14]); + array512[2] = array512_0; // 1st 8 pairs of col 4/5, and 1st 8 pairs of col 20/21 + array512[6] = array512_1; // 2nd 8 pairs of col 4/5, and 2nd 8 pairs of col 20/21 + array512[10] = array512_2; // 1st 8 pairs of col 12/13, and 1st 8 pairs of col 28/29 + array512[14] = array512_3; // 2nd 8 pairs of col 12/13, and 2nd 8 pairs of col 28/29 + + // Half-compose of 6/7, 22/23, 14/15, 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512[3], permute_lo_idx, array512[7]); + array512_1 = _mm512_permutex2var_epi64(array512[11], permute_lo_idx, array512[15]); + array512_2 = _mm512_permutex2var_epi64(array512[3], permute_hi_idx, array512[7]); + array512_3 = _mm512_permutex2var_epi64(array512[11], permute_hi_idx, array512[15]); + array512[3] = array512_0; // 1st 8 pairs of col 6/7, and 1st 8 pairs of col 22/23 + array512[7] = array512_1; // 2nd 8 pairs of col 6/7, and 2nd 8 pairs of col 22/23 + array512[11] = array512_2; // 1st 8 pairs of col 14/15, and 1st 8 pairs of col 30/31 + array512[15] = array512_3; // 2nd 8 pairs of col 14/15, and 2nd 8 pairs of col 30/31 + + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[0], _mm512_castsi512_si256(array512[4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 2/3 cols + array512_0 = _mm512_inserti64x4(array512[1], _mm512_castsi512_si256(array512[5]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 4/5 cols + array512_0 = _mm512_inserti64x4(array512[2], _mm512_castsi512_si256(array512[6]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 6/7 cols + array512_0 = _mm512_inserti64x4(array512[3], _mm512_castsi512_si256(array512[7]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 8/9 cols + array512_0 = _mm512_inserti64x4(array512[8], _mm512_castsi512_si256(array512[12]), 0x1); + 
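/* Compose step: after the half-compose above, the low 256 bits of
   array512[8] hold the first 8 m-pairs of cols 8/9 and the low 256 bits of
   array512[12] hold the second 8, so
   _mm512_inserti64x4(lo, _mm512_castsi512_si256(hi), 0x1) stitches a full
   16-pair (32 bfloat16) output row; the _mm512_extracti64x4_epi64 variants
   further below do the same for the columns parked in the upper halves. */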
_mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 10/11 cols + array512_0 = _mm512_inserti64x4(array512[9], _mm512_castsi512_si256(array512[13]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 12/13 cols + array512_0 = _mm512_inserti64x4(array512[10], _mm512_castsi512_si256(array512[14]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 14/15 cols + array512_0 = _mm512_inserti64x4(array512[11], _mm512_castsi512_si256(array512[15]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store 16 ~ k_rem cols + int idx_length = (k_rem + 1 - 16) >> 1; + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } + + dst_addr0 = dst_addr_tmp + 32; + } + } +} + +// K=Any number but will be processed based on 32, 16> 1; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + bfloat16 * dst_addr_tmp = dst_addr0; + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + // Load and preprocess 4 rows + array512[array_idx+0] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[array_idx+1] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[array_idx+2] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[array_idx+3] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + } + + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_3 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512[j+0] = array512_0; // 1st 8 pairs of col 0/1|2/3|4/5|6/7, and 1st 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+4] = array512_1; // 2nd 8 pairs of col 0/1|2/3|4/5|6/7, and 2nd 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+8] = array512_2; // 1st 8 pairs of col 8/9|10/11|12/13|14/15, and 1st 8 pairs of col 
24/25|26/27|28/29|30/31 + array512[j+12] = array512_3; // 2nd 8 pairs of col 8/9|10/11|12/13|14/15, and 2nd 8 pairs of col 24/25|26/27|28/29|30/31 + } + + for (int j = 0; j < 4; j++) { + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int j = 8; j < 12; j++) { + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + // Compose and store 16 ~ k_rem cols + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } + + dst_addr0 = dst_addr_tmp + 32; + + for (int j = 0; j < m_rem; j++) { + array512[j] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+j*lda+tag_k_32x); + } + for (int j = m_rem; j < 16; j++) { + array512[j] = _mm512_setzero_si512(); + } + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + } + + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_3 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512[j+0] = array512_0; // 1st 8 pairs of col 0/1|2/3|4/5|6/7, and 1st 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+4] = array512_1; // 2nd 8 pairs of col 0/1|2/3|4/5|6/7, and 2nd 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+8] = array512_2; // 1st 8 pairs of col 8/9|10/11|12/13|14/15, and 1st 8 pairs of col 24/25|26/27|28/29|30/31 + array512[j+12] = array512_3; // 2nd 8 pairs of col 8/9|10/11|12/13|14/15, and 2nd 8 pairs of col 24/25|26/27|28/29|30/31 + } + + for (int j = 0; j < 4; j++) { + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int j = 8; j < 12; j++) { + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 
+= 64; + } + + // Compose and store 16 ~ k_rem cols + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } + } +} + +// K=Any number but will be processed based on 32, M=16 +void COL_MAJOR_ITCOPY_KERNEL_Kx16(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG tag_k_32x = k & (~31); + + BLASLONG LDA_4x = lda*4; + BLASLONG LDA_8x = lda*8; + BLASLONG LDA_12x = lda*12; + + src_addr0 = A; + src_addr1 = A + lda; + src_addr2 = A + lda*2; + src_addr3 = A + lda*3; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*8; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; + __m512i array512_way2_0, array512_way2_1, array512_way2_2, array512_way2_3; + __m512i array512_way3_0, array512_way3_1, array512_way3_2, array512_way3_3; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + // Load and preprocess 1st 4 rows + array512_way0_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_way0_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_way0_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_way0_3 = _mm512_loadu_si512(src_addr3+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way0_0, array512_way0_1); + array512_1 = _mm512_unpackhi_epi32(array512_way0_0, array512_way0_1); + array512_2 = _mm512_unpacklo_epi32(array512_way0_2, array512_way0_3); + array512_3 = _mm512_unpackhi_epi32(array512_way0_2, array512_way0_3); + array512_way0_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way0_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way0_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way0_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 2nd 4 rows + array512_way1_0 = _mm512_loadu_si512(src_addr0+LDA_4x+idx_k); + array512_way1_1 = _mm512_loadu_si512(src_addr1+LDA_4x+idx_k); + array512_way1_2 = _mm512_loadu_si512(src_addr2+LDA_4x+idx_k); + array512_way1_3 = _mm512_loadu_si512(src_addr3+LDA_4x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way1_0, array512_way1_1); + array512_1 = _mm512_unpackhi_epi32(array512_way1_0, array512_way1_1); + array512_2 = _mm512_unpacklo_epi32(array512_way1_2, array512_way1_3); + array512_3 = _mm512_unpackhi_epi32(array512_way1_2, array512_way1_3); + array512_way1_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way1_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way1_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + 
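// the epi32/epi64 unpack ladder is a 4x4 dword transpose: after the + // unpackhi below, each 128-bit lane of array512_way1_0..3 holds one + // bf16 pair (one dword) from each of rows 4-7, e.g. lane 0 of + // array512_way1_0 is r4k0 r4k1 | r5k0 r5k1 | r6k0 r6k1 | r7k0 r7k1 +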
array512_way1_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 3rd 4 rows + array512_way2_0 = _mm512_loadu_si512(src_addr0+LDA_8x+idx_k); + array512_way2_1 = _mm512_loadu_si512(src_addr1+LDA_8x+idx_k); + array512_way2_2 = _mm512_loadu_si512(src_addr2+LDA_8x+idx_k); + array512_way2_3 = _mm512_loadu_si512(src_addr3+LDA_8x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way2_0, array512_way2_1); + array512_1 = _mm512_unpackhi_epi32(array512_way2_0, array512_way2_1); + array512_2 = _mm512_unpacklo_epi32(array512_way2_2, array512_way2_3); + array512_3 = _mm512_unpackhi_epi32(array512_way2_2, array512_way2_3); + array512_way2_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way2_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way2_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way2_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 4th 4 rows + array512_way3_0 = _mm512_loadu_si512(src_addr0+LDA_12x+idx_k); + array512_way3_1 = _mm512_loadu_si512(src_addr1+LDA_12x+idx_k); + array512_way3_2 = _mm512_loadu_si512(src_addr2+LDA_12x+idx_k); + array512_way3_3 = _mm512_loadu_si512(src_addr3+LDA_12x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way3_0, array512_way3_1); + array512_1 = _mm512_unpackhi_epi32(array512_way3_0, array512_way3_1); + array512_2 = _mm512_unpacklo_epi32(array512_way3_2, array512_way3_3); + array512_3 = _mm512_unpackhi_epi32(array512_way3_2, array512_way3_3); + array512_way3_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way3_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way3_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way3_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Compose and store the 0/1 and 16/17 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_lo_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_lo_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 2/3 and 18/19 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_lo_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_lo_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 4/5 and 20/21 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_lo_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_lo_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 6/7 and 22/23 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_lo_idx, array512_way1_3); + 
array512_way1_3); +
array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_lo_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 8/9 and 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_hi_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_hi_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 10/11 and 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_hi_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_hi_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 12/13 and 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_hi_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_hi_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 14/15 and 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_hi_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_hi_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32*9; + dst_addr1 += 32*9; + } + + if (tag_k_32x != k) { + int k_rem = k - tag_k_32x; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + __m512i array512[16]; + + // Load and preprocess 1st 4 rows + array512[0] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[1] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[2] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[3] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[0], array512[1]); + array512_1 = _mm512_unpackhi_epi32(array512[0], array512[1]); + array512_2 = _mm512_unpacklo_epi32(array512[2], array512[3]); + array512_3 = _mm512_unpackhi_epi32(array512[2], array512[3]); + array512[0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[3] = 
_mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 2nd 4 rows + array512[4] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[5] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[6] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[7] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[4], array512[5]); + array512_1 = _mm512_unpackhi_epi32(array512[4], array512[5]); + array512_2 = _mm512_unpacklo_epi32(array512[6], array512[7]); + array512_3 = _mm512_unpackhi_epi32(array512[6], array512[7]); + array512[4] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[5] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[6] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[7] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 3rd 4 rows + array512[8] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[9] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[10] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[11] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[8], array512[9]); + array512_1 = _mm512_unpackhi_epi32(array512[8], array512[9]); + array512_2 = _mm512_unpacklo_epi32(array512[10], array512[11]); + array512_3 = _mm512_unpackhi_epi32(array512[10], array512[11]); + array512[8] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[9] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[10] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[11] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 4th 4 rows + array512[12] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[13] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[14] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[15] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[12], array512[13]); + array512_1 = _mm512_unpackhi_epi32(array512[12], array512[13]); + array512_2 = _mm512_unpacklo_epi32(array512[14], array512[15]); + array512_3 = _mm512_unpackhi_epi32(array512[14], array512[15]); + array512[12] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[13] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[14] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[15] = _mm512_unpackhi_epi64(array512_1, array512_3); + + // array512_01_1617_0, array512_01_1617_1, array512_89_2425_0, array512_89_2425_1; + // Half-compose of 0/1, 16/17, 8/9, 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512[0], permute_lo_idx, array512[4]); + array512_1 = _mm512_permutex2var_epi64(array512[8], permute_lo_idx, array512[12]); + array512_2 = _mm512_permutex2var_epi64(array512[0], permute_hi_idx, array512[4]); + array512_3 = _mm512_permutex2var_epi64(array512[8], permute_hi_idx, array512[12]); + array512[0] = array512_0; // 1st 8 pairs of col 0/1, and 1st 8 pairs of col 16/17 + array512[4] = array512_1; // 2nd 8 pairs of col 0/1, and 2nd 8 pairs of col 16/17 + array512[8] = array512_2; // 1st 8 
pairs of col 8/9, and 1st 8 pairs of col 24/25 + array512[12] = array512_3; // 2nd 8 pairs of col 8/9, and 2nd 8 pairs of col 24/25 + + // Half-compose of 2/3, 18/19, 10/11, 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512[1], permute_lo_idx, array512[5]); + array512_1 = _mm512_permutex2var_epi64(array512[9], permute_lo_idx, array512[13]); + array512_2 = _mm512_permutex2var_epi64(array512[1], permute_hi_idx, array512[5]); + array512_3 = _mm512_permutex2var_epi64(array512[9], permute_hi_idx, array512[13]); + array512[1] = array512_0; // 1st 8 pairs of col 2/3, and 1st 8 pairs of col 18/19 + array512[5] = array512_1; // 2nd 8 pairs of col 2/3, and 2nd 8 pairs of col 18/19 + array512[9] = array512_2; // 1st 8 pairs of col 10/11, and 1st 8 pairs of col 26/27 + array512[13] = array512_3; // 2nd 8 pairs of col 10/11, and 2nd 8 pairs of col 26/27 + + // Half-compose of 4/5, 20/21, 12/13, 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512[2], permute_lo_idx, array512[6]); + array512_1 = _mm512_permutex2var_epi64(array512[10], permute_lo_idx, array512[14]); + array512_2 = _mm512_permutex2var_epi64(array512[2], permute_hi_idx, array512[6]); + array512_3 = _mm512_permutex2var_epi64(array512[10], permute_hi_idx, array512[14]); + array512[2] = array512_0; // 1st 8 pairs of col 4/5, and 1st 8 pairs of col 20/21 + array512[6] = array512_1; // 2nd 8 pairs of col 4/5, and 2nd 8 pairs of col 20/21 + array512[10] = array512_2; // 1st 8 pairs of col 12/13, and 1st 8 pairs of col 28/29 + array512[14] = array512_3; // 2nd 8 pairs of col 12/13, and 2nd 8 pairs of col 28/29 + + // Half-compose of 6/7, 22/23, 14/15, 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512[3], permute_lo_idx, array512[7]); + array512_1 = _mm512_permutex2var_epi64(array512[11], permute_lo_idx, array512[15]); + array512_2 = _mm512_permutex2var_epi64(array512[3], permute_hi_idx, array512[7]); + array512_3 = _mm512_permutex2var_epi64(array512[11], permute_hi_idx, array512[15]); + array512[3] = array512_0; // 1st 8 pairs of col 6/7, and 1st 8 pairs of col 22/23 + array512[7] = array512_1; // 2nd 8 pairs of col 6/7, and 2nd 8 pairs of col 22/23 + array512[11] = array512_2; // 1st 8 pairs of col 14/15, and 1st 8 pairs of col 30/31 + array512[15] = array512_3; // 2nd 8 pairs of col 14/15, and 2nd 8 pairs of col 30/31 + + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[0], _mm512_castsi512_si256(array512[4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 2/3 cols + array512_0 = _mm512_inserti64x4(array512[1], _mm512_castsi512_si256(array512[5]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 4/5 cols + array512_0 = _mm512_inserti64x4(array512[2], _mm512_castsi512_si256(array512[6]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 6/7 cols + array512_0 = _mm512_inserti64x4(array512[3], _mm512_castsi512_si256(array512[7]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 8/9 cols + array512_0 = _mm512_inserti64x4(array512[8], _mm512_castsi512_si256(array512[12]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 10/11 cols + array512_0 = _mm512_inserti64x4(array512[9], _mm512_castsi512_si256(array512[13]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 12/13 cols + array512_0 = 
_mm512_inserti64x4(array512[10], _mm512_castsi512_si256(array512[14]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 14/15 cols + array512_0 = _mm512_inserti64x4(array512[11], _mm512_castsi512_si256(array512[15]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store 16 ~ k_rem cols + int idx_length = (k_rem + 1 - 16) >> 1; + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } + } +} + +// K=Any number but will be processed based on 32, M<=16 +void COL_MAJOR_ITCOPY_KERNEL_Kx16m(BLASLONG m, BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG tag_k_32x = k & (~31); + + src_addr0 = A; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*8; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512[16]; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + for (int j = 0; j < m; j++) { + array512[j] = _mm512_loadu_si512(src_addr0+j*lda+idx_k); + } + for (int j = m; j < 16; j++) { + array512[j] = _mm512_setzero_si512(); + } + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + } + + // Compose and store the 0/1, 2/3, 4/5, 6/7 and 16/17, 18/19, 20/21, 22/23 cols + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + } + + // Compose and store the 8/9, 10/11, 12/13, 14/15 and 24/25, 26/27, 28/29, 30/31 cols + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, 
array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + } + + dst_addr0 += 32*8; + dst_addr1 += 32*8; } -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A); -#endif + if (tag_k_32x != k) { + int k_rem = k - tag_k_32x; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + + for (int j = 0; j < m; j++) { + array512[j] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+j*lda+tag_k_32x); + } + for (int j = m; j < 16; j++) { + array512[j] = _mm512_setzero_si512(); + } + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + } + + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_3 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512[j+0] = array512_0; // 1st 8 pairs of col 0/1|2/3|4/5|6/7, and 1st 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+4] = array512_1; // 2nd 8 pairs of col 0/1|2/3|4/5|6/7, and 2nd 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+8] = array512_2; // 1st 8 pairs of col 8/9|10/11|12/13|14/15, and 1st 8 pairs of col 24/25|26/27|28/29|30/31 + array512[j+12] = array512_3; // 2nd 8 pairs of col 8/9|10/11|12/13|14/15, and 2nd 8 pairs of col 24/25|26/27|28/29|30/31 + } + + for (int j = 0; j < 4; j++) { + // Compose and store the 0/1 ~ 6/7 cols + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + for (int j = 8; j < 12; j++) { + // Compose and store the 8/9 ~ 14/15 cols + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + // Compose and store 16 ~ k_rem cols + int idx_length = (k_rem + 1 - 16) >> 1; + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], 
_mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } + } } +// COL_MAJOR_ONCOPY_KERNEL_16x32 behaves exactly the same as COL_MAJOR_ITCOPY_KERNEL_Kx16 +#define COL_MAJOR_ONCOPY_KERNEL_16x32 COL_MAJOR_ITCOPY_KERNEL_Kx16 + void COL_MAJOR_ONCOPY_KERNEL_8x32(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) { BLASLONG tag_k_32x = k & (~31); - BLASLONG idx_src_base0, idx_src_base1, idx_src_base2, idx_src_base3, idx_src_base4, idx_src_base5, idx_src_base6, idx_src_base7; - BLASLONG idx_target_base0; - idx_src_base0 = 0; - idx_src_base1 = 1*ldb; - idx_src_base2 = 2*ldb; - idx_src_base3 = 3*ldb; - idx_src_base4 = 4*ldb; - idx_src_base5 = 5*ldb; - idx_src_base6 = 6*ldb; - idx_src_base7 = 7*ldb; - idx_target_base0 = 0; + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3, * src_addr4, * src_addr5, * src_addr6, * src_addr7; + bfloat16 * dst_addr0; + + unsigned char blend_mask = (((unsigned char)0xcc)); + __m512i permute_idx = _mm512_set_epi64(13, 12, 7, 6, 9, 8, 3, 2); + + src_addr0 = B; + src_addr1 = src_addr0 + 1*ldb; + src_addr2 = src_addr0 + 2*ldb; + src_addr3 = src_addr0 + 3*ldb; + src_addr4 = src_addr0 + 4*ldb; + src_addr5 = src_addr0 + 5*ldb; + src_addr6 = src_addr0 + 6*ldb; + src_addr7 = src_addr0 + 7*ldb; + dst_addr0 = block_B; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_loadu_si512(&B[idx_src_base0+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_loadu_si512(&B[idx_src_base1+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*2], _mm512_loadu_si512(&B[idx_src_base2+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*3], _mm512_loadu_si512(&B[idx_src_base3+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*4], _mm512_loadu_si512(&B[idx_src_base4+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*5], _mm512_loadu_si512(&B[idx_src_base5+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*6], _mm512_loadu_si512(&B[idx_src_base6+idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*7], _mm512_loadu_si512(&B[idx_src_base7+idx_k])); - idx_target_base0 += 32*8; + array512_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_3 = _mm512_loadu_si512(src_addr3+idx_k); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = 
_mm512_loadu_si512(src_addr4+idx_k); + array512_1 = _mm512_loadu_si512(src_addr5+idx_k); + array512_2 = _mm512_loadu_si512(src_addr6+idx_k); + array512_3 = _mm512_loadu_si512(src_addr7+idx_k); + + array512_way1_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way1_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way1_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way1_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way1_0, array512_way1_2); + array512_1 = _mm512_unpackhi_epi64(array512_way1_0, array512_way1_2); + array512_2 = _mm512_unpacklo_epi64(array512_way1_1, array512_way1_3); + array512_3 = _mm512_unpackhi_epi64(array512_way1_1, array512_way1_3); + + array512_way1_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x22); + array512_way1_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x77); + array512_way1_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x22); + array512_way1_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x77); + + array512_0 = _mm512_mask_blend_epi64(blend_mask, array512_way0_0, array512_way1_0); + array512_1 = _mm512_mask_blend_epi64(blend_mask, array512_way0_1, array512_way1_1); + array512_2 = _mm512_mask_blend_epi64(blend_mask, array512_way0_2, array512_way1_2); + array512_3 = _mm512_mask_blend_epi64(blend_mask, array512_way0_3, array512_way1_3); + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way0_1, permute_idx, array512_way1_1); + array512_2 = _mm512_permutex2var_epi64(array512_way0_2, permute_idx, array512_way1_2); + array512_3 = _mm512_permutex2var_epi64(array512_way0_3, permute_idx, array512_way1_3); + _mm512_storeu_si512(dst_addr0+128, array512_0); + _mm512_storeu_si512(dst_addr0+160, array512_1); + _mm512_storeu_si512(dst_addr0+192, array512_2); + _mm512_storeu_si512(dst_addr0+224, array512_3); + + dst_addr0 += 256; } if (tag_k_32x != k) { unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base1+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*2], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base2+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*3], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base3+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*4], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base4+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*5], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base5+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*6], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base6+tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*7], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base7+tag_k_32x])); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512_2 = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512_3 = 
_mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr4+tag_k_32x); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr5+tag_k_32x); + array512_2 = _mm512_maskz_loadu_epi16(tail_mask, src_addr6+tag_k_32x); + array512_3 = _mm512_maskz_loadu_epi16(tail_mask, src_addr7+tag_k_32x); + + array512_way1_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way1_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way1_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way1_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way1_0, array512_way1_2); + array512_1 = _mm512_unpackhi_epi64(array512_way1_0, array512_way1_2); + array512_2 = _mm512_unpacklo_epi64(array512_way1_1, array512_way1_3); + array512_3 = _mm512_unpackhi_epi64(array512_way1_1, array512_way1_3); + + array512_way1_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x22); + array512_way1_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x77); + array512_way1_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x22); + array512_way1_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x77); + + + array512_0 = _mm512_mask_blend_epi64(blend_mask, array512_way0_0, array512_way1_0); + array512_1 = _mm512_mask_blend_epi64(blend_mask, array512_way0_1, array512_way1_1); + array512_2 = _mm512_mask_blend_epi64(blend_mask, array512_way0_2, array512_way1_2); + array512_3 = _mm512_mask_blend_epi64(blend_mask, array512_way0_3, array512_way1_3); + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way0_1, permute_idx, array512_way1_1); + array512_2 = _mm512_permutex2var_epi64(array512_way0_2, permute_idx, array512_way1_2); + array512_3 = _mm512_permutex2var_epi64(array512_way0_3, permute_idx, array512_way1_3); + _mm512_storeu_si512(dst_addr0+128, array512_0); + _mm512_storeu_si512(dst_addr0+160, array512_1); + _mm512_storeu_si512(dst_addr0+192, array512_2); + _mm512_storeu_si512(dst_addr0+224, array512_3); + } +} + +void COL_MAJOR_ONCOPY_KERNEL_4x32(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_32x = k & (~31); + + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0; + + src_addr0 = B; + src_addr1 = src_addr0 + 1*ldb; + src_addr2 = src_addr0 + 2*ldb; + src_addr3 = src_addr0 + 3*ldb; + dst_addr0 = block_B; + + __m512i array512_0, 
array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + array512_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_3 = _mm512_loadu_si512(src_addr3+idx_k); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0x88); + array512_1 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0x88); + array512_2 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0xdd); + array512_3 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0xdd); + + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + + dst_addr0 += 128; } -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B); -#endif + if (tag_k_32x != k) { + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512_2 = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512_3 = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0x88); + array512_1 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0x88); + array512_2 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0xdd); + array512_3 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0xdd); + + _mm512_storeu_si512(dst_addr0, array512_0); + 
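// three more 64-byte stores complete the packed 4x32 tail block; the + // masked loads above zero-filled everything past k_rem (e.g. for + // k_rem = 5, tail_mask_value = 0xffffffff >> 27 = 0x1f loads 5 bf16 + // per row), so full-width stores here just write zero padding +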
_mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + } } void COL_MAJOR_ONCOPY_KERNEL_Nx32(BLASLONG n, BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) { BLASLONG tag_k_32x = k & (~31); BLASLONG tag_n_2x = n & (~1); - BLASLONG idx_src_base0; - BLASLONG idx_target_base0; + + bfloat16 * src_addr0; + bfloat16 * dst_addr0; BLASLONG LDB_2x = 2*ldb; - idx_target_base0 = 0; + src_addr0 = B; + dst_addr0 = block_B; for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { - idx_src_base0 = 0; + src_addr0 = B; for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_loadu_si512(&B[idx_src_base0 + idx_k])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_loadu_si512(&B[idx_src_base0 + ldb + idx_k])); - idx_src_base0 += LDB_2x; - idx_target_base0 += 64; + _mm512_storeu_si512(dst_addr0, _mm512_loadu_si512(src_addr0 + idx_k)); + _mm512_storeu_si512(dst_addr0 + 32, _mm512_loadu_si512(src_addr0 + ldb + idx_k)); + src_addr0 += LDB_2x; + dst_addr0 += 64; } if (tag_n_2x != n) { - _mm512_storeu_si512(&block_B[idx_target_base0], _mm512_loadu_si512(&B[idx_src_base0 + idx_k])); - idx_target_base0 += 32; + _mm512_storeu_si512(dst_addr0, _mm512_loadu_si512(src_addr0 + idx_k)); + dst_addr0 += 32; } } if (tag_k_32x != k) { unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); - idx_src_base0 = 0; + src_addr0 = B; for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + tag_k_32x])); - _mm512_storeu_si512(&block_B[idx_target_base0+ 32*1], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + ldb + tag_k_32x])); - idx_src_base0 += LDB_2x; - idx_target_base0 += 64; + _mm512_storeu_si512(dst_addr0, _mm512_maskz_loadu_epi16(tail_mask, src_addr0 + tag_k_32x)); + _mm512_storeu_si512(dst_addr0 + 32, _mm512_maskz_loadu_epi16(tail_mask, src_addr0 + ldb + tag_k_32x)); + src_addr0 += LDB_2x; + dst_addr0 += 64; } if (tag_n_2x != n) { - _mm512_storeu_si512(&block_B[idx_target_base0], _mm512_maskz_loadu_epi16(tail_mask, &B[idx_src_base0 + tag_k_32x])); + _mm512_storeu_si512(dst_addr0, _mm512_maskz_loadu_epi16(tail_mask, src_addr0 + tag_k_32x)); } } +} + +void COL_MAJOR_OTCOPY_KERNEL_Kx8(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_2x = k & (~1); + unsigned char tail_mask_value = (unsigned char) 0xff; + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); -#ifdef DEBUG_PROFILE - print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B); -#endif + __m128i array128_0, array128_1, array128_2, array128_3; + + BLASLONG idx_src_base0, idx_src_base1; + BLASLONG idx_target_base0, idx_target_base1; + + BLASLONG LDA_2x = 2*ldb; + BLASLONG BF16_BLOCK_T_M_2x = 2*8; + idx_src_base0 = 0; + idx_src_base1 = ldb; + idx_target_base0 = 0; + idx_target_base1 = 8; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_1 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base1]); + array128_2 = _mm_unpacklo_epi16(array128_0, array128_1); + array128_3 = _mm_unpackhi_epi16(array128_0, array128_1); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + + idx_src_base0 += LDA_2x; + idx_src_base1 += 
LDA_2x; + idx_target_base0 += BF16_BLOCK_T_M_2x; + idx_target_base1 += BF16_BLOCK_T_M_2x; + } + + if (tag_k_2x != k) { + __m128i ZERO128 = _mm_setzero_si128(); + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_2 = _mm_unpacklo_epi16(array128_0, ZERO128); + array128_3 = _mm_unpackhi_epi16(array128_0, ZERO128); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + } +} + +void COL_MAJOR_OTCOPY_KERNEL_Kx8m(BLASLONG k, BLASLONG n, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_2x = k & (~1); + unsigned char tail_mask = (((unsigned char)0xff) >> (8-n)); + + __m128i array128_0, array128_1, array128_2, array128_3; + + BLASLONG idx_src_base0, idx_src_base1; + BLASLONG idx_target_base0, idx_target_base1; + + BLASLONG LDA_2x = 2*ldb; + BLASLONG BF16_BLOCK_T_M_2x = 2*8; + idx_src_base0 = 0; + idx_src_base1 = ldb; + idx_target_base0 = 0; + idx_target_base1 = 8; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_1 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base1]); + array128_2 = _mm_unpacklo_epi16(array128_0, array128_1); + array128_3 = _mm_unpackhi_epi16(array128_0, array128_1); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + + idx_src_base0 += LDA_2x; + idx_src_base1 += LDA_2x; + idx_target_base0 += BF16_BLOCK_T_M_2x; + idx_target_base1 += BF16_BLOCK_T_M_2x; + } + + if (tag_k_2x != k) { + __m128i ZERO128 = _mm_setzero_si128(); + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_2 = _mm_unpacklo_epi16(array128_0, ZERO128); + array128_3 = _mm_unpackhi_epi16(array128_0, ZERO128); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + } } -// Scale matrix C while beta is not ZERO or ONE +// Scale matrix C when beta is not ZERO or ONE void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc) { - BLASLONG tag_n_Nx = N & (~3); - BLASLONG tag_n_Mx = M & (~15); + float * C_addr0 = C; + float * C_addr1 = C + ldc; + float * C_addr2 = C + ldc*2; + float * C_addr3 = C + ldc*3; BLASLONG LDC4x = ldc*4; - BLASLONG idx_base_0 = 0; - BLASLONG idx_base_1 = ldc; - BLASLONG idx_base_2 = ldc*2; - BLASLONG idx_base_3 = ldc*3; - - unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); __m512 array_512_0, array_512_1, array_512_2, array_512_3; + __m512 BETAVECTOR = _mm512_set1_ps(beta); - __m512 BETAVECTOR = _mm512_set1_ps(beta); + if (Order == CblasRowMajor) { + blasint tmp = M; + M = N; + N = tmp; + } - if (Order == CblasColMajor) { - for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { - for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { - array_512_0 = _mm512_loadu_ps(&C[idx_base_0+idx_m]); - array_512_1 = _mm512_loadu_ps(&C[idx_base_1+idx_m]); - array_512_2 = _mm512_loadu_ps(&C[idx_base_2+idx_m]); - array_512_3 = _mm512_loadu_ps(&C[idx_base_3+idx_m]); + BLASLONG tag_n_Nx = N & (~3); + BLASLONG tag_n_Mx = M & (~15); + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); + for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + array_512_0 = 
_mm512_loadu_ps(C_addr0 + idx_m); + array_512_1 = _mm512_loadu_ps(C_addr1 + idx_m); + array_512_2 = _mm512_loadu_ps(C_addr2 + idx_m); + array_512_3 = _mm512_loadu_ps(C_addr3 + idx_m); + + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); + array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); + array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); + + _mm512_storeu_ps(C_addr0 + idx_m, array_512_0); + _mm512_storeu_ps(C_addr1 + idx_m, array_512_1); + _mm512_storeu_ps(C_addr2 + idx_m, array_512_2); + _mm512_storeu_ps(C_addr3 + idx_m, array_512_3); + } - array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); - array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); - array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); - array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); - - _mm512_storeu_ps(&C[idx_base_0+idx_m], array_512_0); - _mm512_storeu_ps(&C[idx_base_1+idx_m], array_512_1); - _mm512_storeu_ps(&C[idx_base_2+idx_m], array_512_2); - _mm512_storeu_ps(&C[idx_base_3+idx_m], array_512_3); - } + if (tag_n_Mx != M) { + array_512_0 = _mm512_maskz_loadu_ps(tail_mask, C_addr0 + tag_n_Mx); + array_512_1 = _mm512_maskz_loadu_ps(tail_mask, C_addr1 + tag_n_Mx); + array_512_2 = _mm512_maskz_loadu_ps(tail_mask, C_addr2 + tag_n_Mx); + array_512_3 = _mm512_maskz_loadu_ps(tail_mask, C_addr3 + tag_n_Mx); + + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); + array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); + array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); + + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, array_512_0); + _mm512_mask_storeu_ps(C_addr1 + tag_n_Mx, tail_mask, array_512_1); + _mm512_mask_storeu_ps(C_addr2 + tag_n_Mx, tail_mask, array_512_2); + _mm512_mask_storeu_ps(C_addr3 + tag_n_Mx, tail_mask, array_512_3); + } - if (tag_n_Mx != M) { - array_512_0 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_0+tag_n_Mx]); - array_512_1 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_1+tag_n_Mx]); - array_512_2 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_2+tag_n_Mx]); - array_512_3 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_3+tag_n_Mx]); + C_addr0 += LDC4x; + C_addr1 += LDC4x; + C_addr2 += LDC4x; + C_addr3 += LDC4x; + } + if (tag_n_Nx != N) { + for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + array_512_0 = _mm512_loadu_ps(C_addr0 + idx_m); array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); - array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); - array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); - array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); - - _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, array_512_0); - _mm512_mask_storeu_ps(&C[idx_base_1+tag_n_Mx], tail_mask, array_512_1); - _mm512_mask_storeu_ps(&C[idx_base_2+tag_n_Mx], tail_mask, array_512_2); - _mm512_mask_storeu_ps(&C[idx_base_3+tag_n_Mx], tail_mask, array_512_3); + _mm512_storeu_ps(C_addr0 + idx_m, array_512_0); } - idx_base_0 += LDC4x; - idx_base_1 += LDC4x; - idx_base_2 += LDC4x; - idx_base_3 += LDC4x; - } - - if (tag_n_Nx != N) { - for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { - for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { - array_512_0 = _mm512_loadu_ps(&C[idx_base_0+idx_m]); - array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); - _mm512_storeu_ps(&C[idx_base_0+idx_m], array_512_0); - } - - if (tag_n_Mx != M) { - array_512_0 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_0+tag_n_Mx]); - 
array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); - _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, array_512_0); - } - idx_base_0 += ldc; + if (tag_n_Mx != M) { + array_512_0 = _mm512_maskz_loadu_ps(tail_mask, C_addr0 + tag_n_Mx); + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, array_512_0); } + C_addr0 += ldc; } - } else { - } } -// Scale matrix C while beta is not ZERO or ONE +// Zero C matrix when Beta is 0 void sbgemm_zero_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, float *C, OPENBLAS_CONST blasint ldc) { - BLASLONG tag_n_Nx = N & (~3); - BLASLONG tag_n_Mx = M & (~15); + float * C_addr0 = C; + float * C_addr1 = C + ldc; + float * C_addr2 = C + ldc*2; + float * C_addr3 = C + ldc*3; BLASLONG LDC4x = ldc*4; - BLASLONG idx_base_0 = 0; - BLASLONG idx_base_1 = ldc; - BLASLONG idx_base_2 = ldc*2; - BLASLONG idx_base_3 = ldc*3; - - unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); __m512 ZEROVECTOR = _mm512_setzero_ps(); - if (Order == CblasColMajor) { - for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { - for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { - _mm512_storeu_ps(&C[idx_base_0+idx_m], ZEROVECTOR); - _mm512_storeu_ps(&C[idx_base_1+idx_m], ZEROVECTOR); - _mm512_storeu_ps(&C[idx_base_2+idx_m], ZEROVECTOR); - _mm512_storeu_ps(&C[idx_base_3+idx_m], ZEROVECTOR); - } + if (Order == CblasRowMajor) { + blasint tmp = M; + M = N; + N = tmp; + } - if (tag_n_Mx != M) { - _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, ZEROVECTOR); - _mm512_mask_storeu_ps(&C[idx_base_1+tag_n_Mx], tail_mask, ZEROVECTOR); - _mm512_mask_storeu_ps(&C[idx_base_2+tag_n_Mx], tail_mask, ZEROVECTOR); - _mm512_mask_storeu_ps(&C[idx_base_3+tag_n_Mx], tail_mask, ZEROVECTOR); - } + BLASLONG tag_n_Nx = N & (~3); + BLASLONG tag_n_Mx = M & (~15); + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); + for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + _mm512_storeu_ps(C_addr0 + idx_m, ZEROVECTOR); + _mm512_storeu_ps(C_addr1 + idx_m, ZEROVECTOR); + _mm512_storeu_ps(C_addr2 + idx_m, ZEROVECTOR); + _mm512_storeu_ps(C_addr3 + idx_m, ZEROVECTOR); + } - idx_base_0 += LDC4x; - idx_base_1 += LDC4x; - idx_base_2 += LDC4x; - idx_base_3 += LDC4x; + if (tag_n_Mx != M) { + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(C_addr1 + tag_n_Mx, tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(C_addr2 + tag_n_Mx, tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(C_addr3 + tag_n_Mx, tail_mask, ZEROVECTOR); } - if (tag_n_Nx != N) { - for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { - for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { - _mm512_storeu_ps(&C[idx_base_0+idx_m], ZEROVECTOR); - } + C_addr0 += LDC4x; + C_addr1 += LDC4x; + C_addr2 += LDC4x; + C_addr3 += LDC4x; + } - if (tag_n_Mx != M) { - _mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, ZEROVECTOR); - } - idx_base_0 += ldc; + if (tag_n_Nx != N) { + for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + _mm512_storeu_ps(C_addr0 + idx_m, ZEROVECTOR); } - } - } else { + if (tag_n_Mx != M) { + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, ZEROVECTOR); + } + C_addr0 += ldc; + } } -} \ No newline at end of file +} diff --git 
a/kernel/x86_64/sbgemm_microk_cooperlake_template.c b/kernel/x86_64/sbgemm_microk_cooperlake_template.c index dd4cb440b..c71595813 100644 --- a/kernel/x86_64/sbgemm_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemm_microk_cooperlake_template.c @@ -2,45 +2,115 @@ #include "bf16_common_macros.h" #include +/* These macros are needed and should be placed at the right place +#define BF16_BLOCK_STEP_N 8 +#define BF16_BLOCK_THRES_K 1024 +#define BF16_BLOCK_THRES_M 32 +#define BF16_BLOCK_THRES_N 1024 + +#define A(i,j) A[(i)*lda+(j)] +#define B(i,j) B[(i)*ldb+(j)] +#define C(i,j) C[(i)*ldc+(j)] + +#define ONE 1.e0f +#define ZERO 0.e0f +*/ + #undef STORE16_COMPLETE_RESULT #undef STORE16_MASK_COMPLETE_RESULT -#undef SBGEMM_BLOCK_KERNEL_32x8x32 -#undef SBGEMM_BLOCK_KERNEL_16x8x32 -#undef SBGEMM_BLOCK_KERNEL_32xNx32 -#undef SBGEMM_BLOCK_KERNEL_16xNx32 -#undef SBGEMM_BLOCKING_KERNEL_2 +#undef SBGEMM_BLOCK_KERNEL_NN_32x8xK +#undef SBGEMM_BLOCK_KERNEL_NN_16x8xK +#undef SBGEMM_BLOCK_KERNEL_NN_32xNx32 +#undef SBGEMM_BLOCK_KERNEL_NN_16xNx32 +#undef SBGEMM_BLOCK_KERNEL_NT_32x8xK +#undef SBGEMM_BLOCK_KERNEL_NT_16x8xK +#undef SBGEMM_BLOCK_KERNEL_NT_32xNxK +#undef SBGEMM_BLOCK_KERNEL_NT_16xNxK +#undef SBGEMM_BLOCK_KERNEL_TN_32x8xK +#undef SBGEMM_BLOCK_KERNEL_TN_16x8xK +#undef SBGEMM_BLOCK_KERNEL_TN_32xNx32 +#undef SBGEMM_BLOCK_KERNEL_TN_16xNx32 +#undef SBGEMM_BLOCK_KERNEL_TT_32x8xK +#undef SBGEMM_BLOCK_KERNEL_TT_16x8xK +#undef SBGEMM_BLOCK_KERNEL_TT_32xNxK +#undef SBGEMM_BLOCK_KERNEL_TT_16xNxK +#undef SBGEMM_BLOCKING_KERNEL_NN +#undef SBGEMM_BLOCKING_KERNEL_NT +#undef SBGEMM_BLOCKING_KERNEL_TN +#undef SBGEMM_BLOCKING_KERNEL_TT #ifndef ONE_ALPHA // ALPHA is not ONE - #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE - #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE - #define SBGEMM_BLOCK_KERNEL_32x8x32 sbgemm_block_kernel_32x8x32_alpha - #define SBGEMM_BLOCK_KERNEL_16x8x32 sbgemm_block_kernel_16x8x32_alpha - #define SBGEMM_BLOCK_KERNEL_32xNx32 sbgemm_block_kernel_32xNx32_alpha - #define SBGEMM_BLOCK_KERNEL_16xNx32 sbgemm_block_kernel_16xNx32_alpha - #define SBGEMM_BLOCKING_KERNEL_2 sbgemm_blocking_kernel_2_alpha + #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE + #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE + + #define SBGEMM_BLOCK_KERNEL_NN_32x8xK sbgemm_block_kernel_nn_32x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_NN_16x8xK sbgemm_block_kernel_nn_16x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_NN_32xNx32 sbgemm_block_kernel_nn_32xNx32_alpha + #define SBGEMM_BLOCK_KERNEL_NN_16xNx32 sbgemm_block_kernel_nn_16xNx32_alpha + + #define SBGEMM_BLOCK_KERNEL_NT_32x8xK SBGEMM_BLOCK_KERNEL_NN_32x8xK + #define SBGEMM_BLOCK_KERNEL_NT_16x8xK SBGEMM_BLOCK_KERNEL_NN_16x8xK + #define SBGEMM_BLOCK_KERNEL_NT_32xNxK sbgemm_block_kernel_nt_32xNxK_alpha + #define SBGEMM_BLOCK_KERNEL_NT_16xNxK sbgemm_block_kernel_nt_16xNxK_alpha + + #define SBGEMM_BLOCK_KERNEL_TN_32x8xK sbgemm_block_kernel_tn_32x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_TN_16x8xK sbgemm_block_kernel_tn_16x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_TN_32xNx32 sbgemm_block_kernel_tn_32xNx32_alpha + #define SBGEMM_BLOCK_KERNEL_TN_16xNx32 sbgemm_block_kernel_tn_16xNx32_alpha + + #define SBGEMM_BLOCK_KERNEL_TT_32x8xK SBGEMM_BLOCK_KERNEL_TN_32x8xK + #define SBGEMM_BLOCK_KERNEL_TT_16x8xK SBGEMM_BLOCK_KERNEL_TN_16x8xK + #define SBGEMM_BLOCK_KERNEL_TT_32xNxK sbgemm_block_kernel_tt_32xNxK_alpha + #define SBGEMM_BLOCK_KERNEL_TT_16xNxK sbgemm_block_kernel_tt_16xNxK_alpha + + #define 
SBGEMM_BLOCKING_KERNEL_NN sbgemm_blocking_kernel_nn_alpha + #define SBGEMM_BLOCKING_KERNEL_NT sbgemm_blocking_kernel_nt_alpha + #define SBGEMM_BLOCKING_KERNEL_TN sbgemm_blocking_kernel_tn_alpha + #define SBGEMM_BLOCKING_KERNEL_TT sbgemm_blocking_kernel_tt_alpha #else // ALPHA is ONE - #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ONE_ONE - #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ONE_ONE - #define SBGEMM_BLOCK_KERNEL_32x8x32 sbgemm_block_kernel_32x8x32_one - #define SBGEMM_BLOCK_KERNEL_16x8x32 sbgemm_block_kernel_16x8x32_one - #define SBGEMM_BLOCK_KERNEL_32xNx32 sbgemm_block_kernel_32xNx32_one - #define SBGEMM_BLOCK_KERNEL_16xNx32 sbgemm_block_kernel_16xNx32_one - #define SBGEMM_BLOCKING_KERNEL_2 sbgemm_blocking_kernel_2_one + #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ONE_ONE + #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ONE_ONE + + #define SBGEMM_BLOCK_KERNEL_NN_32x8xK sbgemm_block_kernel_nn_32x8xK_one + #define SBGEMM_BLOCK_KERNEL_NN_16x8xK sbgemm_block_kernel_nn_16x8xK_one + #define SBGEMM_BLOCK_KERNEL_NN_32xNx32 sbgemm_block_kernel_nn_32xNx32_one + #define SBGEMM_BLOCK_KERNEL_NN_16xNx32 sbgemm_block_kernel_nn_16xNx32_one + + #define SBGEMM_BLOCK_KERNEL_NT_32x8xK SBGEMM_BLOCK_KERNEL_NN_32x8xK + #define SBGEMM_BLOCK_KERNEL_NT_16x8xK SBGEMM_BLOCK_KERNEL_NN_16x8xK + #define SBGEMM_BLOCK_KERNEL_NT_32xNxK sbgemm_block_kernel_nt_32xNxK_one + #define SBGEMM_BLOCK_KERNEL_NT_16xNxK sbgemm_block_kernel_nt_16xNxK_one + + #define SBGEMM_BLOCK_KERNEL_TN_32x8xK sbgemm_block_kernel_tn_32x8xK_one + #define SBGEMM_BLOCK_KERNEL_TN_16x8xK sbgemm_block_kernel_tn_16x8xK_one + #define SBGEMM_BLOCK_KERNEL_TN_32xNx32 sbgemm_block_kernel_tn_32xNx32_one + #define SBGEMM_BLOCK_KERNEL_TN_16xNx32 sbgemm_block_kernel_tn_16xNx32_one + + #define SBGEMM_BLOCK_KERNEL_TT_32x8xK SBGEMM_BLOCK_KERNEL_TN_32x8xK + #define SBGEMM_BLOCK_KERNEL_TT_16x8xK SBGEMM_BLOCK_KERNEL_TN_16x8xK + #define SBGEMM_BLOCK_KERNEL_TT_32xNxK sbgemm_block_kernel_tt_32xNxK_one + #define SBGEMM_BLOCK_KERNEL_TT_16xNxK sbgemm_block_kernel_tt_16xNxK_one + + #define SBGEMM_BLOCKING_KERNEL_NN sbgemm_blocking_kernel_nn_one + #define SBGEMM_BLOCKING_KERNEL_NT sbgemm_blocking_kernel_nt_one + #define SBGEMM_BLOCKING_KERNEL_TN sbgemm_blocking_kernel_tn_one + #define SBGEMM_BLOCKING_KERNEL_TT sbgemm_blocking_kernel_tt_one #endif +extern bfloat16 * block_A; +extern bfloat16 * block_B; +/* --------------------------------------------- NN kernels ------------------------------------------ */ // SBGEMM Kernel for 16> (16-m)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8); result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); - STORE16_MASK_COMPLETE_RESULT(result_512_0, (&C[ldc*0]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_1, (&C[ldc*1]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_2, (&C[ldc*2]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_3, (&C[ldc*3]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_0, (C_addr), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_1, (C_addr + ldc*1), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3), tail_mask) result_512_4 = 
_mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); - STORE16_MASK_COMPLETE_RESULT(result_512_4, (&C[ldc*4]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_5, (&C[ldc*5]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_6, (&C[ldc*6]), tail_mask) - STORE16_MASK_COMPLETE_RESULT(result_512_7, (&C[ldc*7]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7), tail_mask) } else { result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8); result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); - STORE16_COMPLETE_RESULT(result_512_0, (&C[ldc*0])) - STORE16_COMPLETE_RESULT(result_512_1, (&C[ldc*1])) - STORE16_COMPLETE_RESULT(result_512_2, (&C[ldc*2])) - STORE16_COMPLETE_RESULT(result_512_3, (&C[ldc*3])) + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc*1)) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); - STORE16_COMPLETE_RESULT(result_512_4, (&C[ldc*4])) - STORE16_COMPLETE_RESULT(result_512_5, (&C[ldc*5])) - STORE16_COMPLETE_RESULT(result_512_6, (&C[ldc*6])) - STORE16_COMPLETE_RESULT(result_512_7, (&C[ldc*7])) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) } } // SBGEMM Kernel for 16> (32-m)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + unsigned short tail_mask = (((unsigned short)0xffff) >> (32-m)); for (int i = 0; i < n; i++) { result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); - STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*i])) - STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*i+16]), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16), tail_mask) } } else { for (int i = 0; i < n; i++) { result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); - STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*i])) - STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*i+16])) + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16)) } } } // SBGEMM Kernel for 16<=M, N<8, K can be any number, but the processing will take 32 as a base #ifndef ONE_ALPHA // 
ALPHA is not ONE -void sbgemm_block_kernel_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +void sbgemm_block_kernel_nn_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) #else // ALPHA is ONE -void sbgemm_block_kernel_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +void sbgemm_block_kernel_nn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) #endif { + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + int SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); - BLASLONG idxB_base = 0; - BLASLONG width = 32; #ifndef ONE_ALPHA __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); @@ -432,21 +484,49 @@ void sbgemm_block_kernel_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float a result_512[i+1] = _mm512_setzero_ps(); } - for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) { + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { // Load B with unroll n - for (int i = 0; i < n; i ++) { - arrayB_512[i] = _mm512_loadu_si512(&B[idxB_base]); - idxB_base += 32; + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + for (BLASLONG idx = 0; idx < 32;) { + // Each two rows are a group for 32-pair bf16 elements + // Load two rows into a 512 register + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i ++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } } + } - if (idx_k == tag_k_32x) {width = k - tag_k_32x;} + if (tag_k_32x != k) { + // Load B with unroll n + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + BLASLONG width = k - tag_k_32x; for (BLASLONG idx = 0; idx < width;) { // Each two rows are a group for 32-pair bf16 elements // Load two rows into a 512 register - arrayA_512 = _mm512_loadu_si512(&A[idx<<4]); + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; - for (int i = 0; i < n; i ++) { + for (int i = 0; i < n; i++) { result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); } @@ -462,23 +542,24 @@ void sbgemm_block_kernel_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float a } if (m != 16) { - unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); - __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); for (int i = 0; i < n; i++) { result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); - STORE16_MASK_COMPLETE_RESULT(result_512[i], (&C[ldc*i]), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) } } else { for (int i = 0; i < n; i++) { result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); - STORE16_COMPLETE_RESULT(result_512[i], (&C[ldc*i])) + 
STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) } } } + + #ifndef ONE_ALPHA // ALPHA is not ONE -void sbgemm_blocking_kernel_2_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +void sbgemm_blocking_kernel_nn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) #else // ALPHA is ONE -void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +void sbgemm_blocking_kernel_nn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) #endif { BLASLONG m_step, n_step, k_step, k_step_round32; @@ -499,63 +580,52 @@ void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, while (n_from < N) { for (BLASLONG idx_k = 0; idx_k < K;) { // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... - COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, 0), lda, block_A); - // TODO: MT + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, 0), lda, block_A); for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); - SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + SBGEMM_BLOCK_KERNEL_NN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); } if (tag_n_Nx != n_to) { n_step = n_to - tag_n_Nx; COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); - SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + SBGEMM_BLOCK_KERNEL_NN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); } for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { - COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, idx_m), lda, block_A); + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, idx_m), lda, block_A); for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { - SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + SBGEMM_BLOCK_KERNEL_NN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); } if (tag_n_Nx != n_to) { n_step = n_to - tag_n_Nx; - SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + SBGEMM_BLOCK_KERNEL_NN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); } } if (tag_m_Nx != M) { m_step = M - tag_m_Nx; if (m_step > 16) { - COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); - for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { - SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + 
(idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); - } - - if (tag_n_Nx != n_to) { - n_step = n_to - tag_n_Nx; - SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); - } - } else if (m_step == 16) { - COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { - SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + SBGEMM_BLOCK_KERNEL_NN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); } if (tag_n_Nx != n_to) { n_step = n_to - tag_n_Nx; - SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + SBGEMM_BLOCK_KERNEL_NN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); } } else { - COL_MAJOR_INCOPY_KERNEL_Kx16m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { - SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + SBGEMM_BLOCK_KERNEL_NN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); } if (tag_n_Nx != n_to) { n_step = n_to - tag_n_Nx; - SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + SBGEMM_BLOCK_KERNEL_NN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); } } } @@ -573,22 +643,274 @@ void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); } } else { - m_step = M - tag_m_Nx; + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of NN kernels --------------------------------------- */ + +/* --------------------------------------------- NT kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i ++) { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i ++) { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for M<=16, N<8, K can be any number +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_nt_16xNxK_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_nt_16xNxK_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + result_512[0] = _mm512_setzero_ps(); + result_512[1] = _mm512_setzero_ps(); + result_512[2] = _mm512_setzero_ps(); + result_512[3] = _mm512_setzero_ps(); + result_512[4] = _mm512_setzero_ps(); + result_512[5] = _mm512_setzero_ps(); + result_512[6] = _mm512_setzero_ps(); + result_512[7] = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 2) { + // Each two rows are a group for 16-pair bf16 elements + // Load two rows into a 512 register + arrayA_512_0 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i ++) { + _MM512_BROADCASTD_EPI32(B_addr + i*2, arrayB_512[i]); 
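Worth spelling out what the pairing above buys: _MM512_BROADCASTD_EPI32 replicates one adjacent bf16 pair from B into every 32-bit lane, so a single _mm512_dpbf16_ps advances two k-steps for sixteen rows of A at once. A minimal scalar model of one FP32 lane of that instruction, with bf16_t and bf16_val as assumed helper names and the hardware's exact rounding behavior simplified:

#include <stdint.h>

typedef uint16_t bf16_t;               /* raw bfloat16 bit pattern */

static float bf16_val(bf16_t x) {
    union { uint32_t u; float f; } v = { (uint32_t)x << 16 };  /* bf16 is the high half of an f32 */
    return v.f;
}

/* one lane of dpbf16: acc += a0*b0 + a1*b1, accumulating in FP32 */
static float dpbf16_lane(float acc, bf16_t a0, bf16_t a1, bf16_t b0, bf16_t b1) {
    return acc + bf16_val(a0) * bf16_val(b0) + bf16_val(a1) * bf16_val(b1);
}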
+ } + B_addr += 16; + + for (int i = 0; i < n; i ++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512_0, (__m512bh) arrayB_512[i]); + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_nt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_nt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { while (n_from < N) { for (BLASLONG idx_k = 0; idx_k < K;) { // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... - COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, 0), lda, block_A); - // TODO: MT + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, 0), lda, block_A); for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
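The k_step_round32 value computed above pads the packing stride to the next multiple of 32, so every packed panel starts at a fixed offset even when the final K block is ragged. The same computation as a small helper; round_up_to_32 is an assumed name, a sketch only:

/* mirrors: r = k & ~31; r = (k > r) ? r + 32 : r; */
static inline long round_up_to_32(long k) {
    long r = k & ~31L;             /* round down to a multiple of 32 */
    return (k > r) ? r + 32 : r;   /* pad up; equals (k + 31) & ~31 for k >= 0 */
}
/* e.g. round_up_to_32(70) == 96, round_up_to_32(64) == 64 */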
- COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); - SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); } if (tag_n_Nx != n_to) { n_step = n_to - tag_n_Nx; - COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); - SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, idx_m), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } } idx_k += k_step; @@ -597,13 +919,884 @@ void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, k_step_round32 = k_step & (~31); k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; } + n_from = n_to; n_to += BF16_BLOCK_THRES_N; n_to = (n_to > N) ? N : n_to; tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
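The offsets into block_B that follow all take the form (idx_n - n_from) * k_step_round32: each packed B column occupies exactly k_step_round32 bf16 elements, so an 8-column panel sits at a pitch of 8 * k_step_round32. As a hedged one-liner, with packed_b_offset as an assumed name:

#include <stddef.h>

/* start of the packed panel for the B columns beginning at idx_n */
static inline size_t packed_b_offset(long idx_n, long n_from, long k_step_round32) {
    return (size_t)(idx_n - n_from) * (size_t)k_step_round32;
}
/* e.g. n_from == 0, idx_n == 16, k_step_round32 == 256 -> element offset 4096 */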
+ COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } } } +/* ----------------------------------------- End of NT kernels --------------------------------------- */ + +/* --------------------------------------------- TN kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_MASK_COMPLETE_RESULT(result_512_8, (C_addr + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc)) + STORE16_MASK_COMPLETE_RESULT(result_512_9, (C_addr + ldc + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_MASK_COMPLETE_RESULT(result_512_10, (C_addr + ldc*2 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + STORE16_MASK_COMPLETE_RESULT(result_512_11, (C_addr + ldc*3 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_MASK_COMPLETE_RESULT(result_512_12, (C_addr + ldc*4 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_MASK_COMPLETE_RESULT(result_512_13, (C_addr + ldc*5 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_MASK_COMPLETE_RESULT(result_512_14, (C_addr + ldc*6 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + STORE16_MASK_COMPLETE_RESULT(result_512_15, (C_addr + ldc*7 + 16), tail_mask) + } else { + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_8, (C_addr + 16)) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc)) + STORE16_COMPLETE_RESULT(result_512_9, (C_addr + ldc + 16)) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_COMPLETE_RESULT(result_512_10, (C_addr + ldc*2 + 16)) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + STORE16_COMPLETE_RESULT(result_512_11, (C_addr + ldc*3 + 16)) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_12, (C_addr + ldc*4 + 16)) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_13, (C_addr + ldc*5 + 16)) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_14, (C_addr + ldc*6 + 16)) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + STORE16_COMPLETE_RESULT(result_512_15, (C_addr + ldc*7 + 16)) + } +} + +// SBGEMM Kernel for M=16, N=8, K=Any number +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_tn_16x8xK_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_tn_16x8xK_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7; + __m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7; + + result_512_0 = _mm512_setzero_ps(); + result_512_1 = _mm512_setzero_ps(); + result_512_2 = _mm512_setzero_ps(); + result_512_3 = _mm512_setzero_ps(); + result_512_4 = _mm512_setzero_ps(); + result_512_5 = _mm512_setzero_ps(); + result_512_6 = _mm512_setzero_ps(); + result_512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k 
< k; idx_k += 2) { + // Load 16 pair of BF16 elements from A (16 rows) + arrayA_512_0 = _mm512_loadu_si512(A_addr + 0); + + // Load 8 rows of B + _MM512_BROADCASTD_EPI32(B_addr + 0, arrayB_512_0); + _MM512_BROADCASTD_EPI32(B_addr + 2, arrayB_512_1); + _MM512_BROADCASTD_EPI32(B_addr + 4, arrayB_512_2); + _MM512_BROADCASTD_EPI32(B_addr + 6, arrayB_512_3); + _MM512_BROADCASTD_EPI32(B_addr + 8, arrayB_512_4); + _MM512_BROADCASTD_EPI32(B_addr + 10, arrayB_512_5); + _MM512_BROADCASTD_EPI32(B_addr + 12, arrayB_512_6); + _MM512_BROADCASTD_EPI32(B_addr + 14, arrayB_512_7); + + result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_0); + result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_1); + result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_2); + result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_3); + result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_4); + result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_5); + result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_6); + result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_7); + + // Load B with unroll 8 + B_addr += 16; + // Load A with unroll 32 + A_addr += 32; + } + + if (m != 16) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + STORE16_MASK_COMPLETE_RESULT(result_512_0, (C_addr), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_1, (C_addr + ldc), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7), tail_mask) + } else { + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc)) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + } +} + +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for M<=16, N<8, K=Any number but will be processed based on 32 +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_tn_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_tn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + 
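The tail masks built throughout these kernels set exactly the lowest m of 16 lanes, so a masked store covers an m < 16 (or m - 16 for the 32-wide kernels) remainder without writing past the end of C. A scalar model with an assumed name, tail_mask16; m must stay in 1..16, since shifting the 16-bit pattern by 16 would be undefined:

#include <assert.h>

static unsigned short tail_mask16(int m) {
    assert(m >= 1 && m <= 16);                    /* callers guarantee this */
    return (unsigned short)(0xffffu >> (16 - m)); /* m == 5 -> 0x001f       */
}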
+ int SHUFFLE_MAGIC_NO = 0x39; + BLASLONG tag_k_32x = k & (~31); + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + for (int i = 0; i < 8; i++) { + result_512[i] = _mm512_setzero_ps(); + } + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + // Load B with unroll n + for (int i = 0; i < n; i ++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + for (BLASLONG idx = 0; idx < 32;) { + // Each two rows are a group for 32-pair bf16 elements + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } + } + } + + if (tag_k_32x != k) { + // Load B with unroll n + for (int i = 0; i < n; i ++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + BLASLONG width = k - tag_k_32x; + for (BLASLONG idx = 0; idx < width;) { + // Each two rows are a group for 32-pair bf16 elements + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_tn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_tn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... 
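tag_n_Nx above is n_to rounded down to a multiple of BF16_BLOCK_STEP_N, splitting each N block into full 8-column panels plus one narrower remainder for the xNx kernels. The bit trick assumes a power-of-two step; sketched with an assumed name, round_down_pow2:

/* mirrors: tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); */
static inline long round_down_pow2(long n, long step) {
    return n & ~(step - 1);   /* valid only for a power-of-two step */
}
/* e.g. round_down_pow2(29, 8) == 24, leaving a 5-column tail */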
+ COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); // TODO how to process m + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(idx_m, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
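For orientation, everything the TN path assembles reduces to C += alpha * A^T * B with column-major storage, A passed as K x M and B as K x N. A plain scalar model under those assumptions; sbgemm_tn_ref and bf16_val are assumed names, the store macros are taken to accumulate alpha times the product into C, and the rounding details of the dpbf16 instruction are ignored:

#include <stdint.h>

typedef uint16_t bf16_t;

static float bf16_val(bf16_t x) {
    union { uint32_t u; float f; } v = { (uint32_t)x << 16 };
    return v.f;
}

/* C (M x N) += alpha * A^T * B, with A stored K x M and B stored K x N */
static void sbgemm_tn_ref(long M, long N, long K, float alpha,
                          const bf16_t *A, long lda,
                          const bf16_t *B, long ldb,
                          float *C, long ldc) {
    for (long j = 0; j < N; j++)
        for (long i = 0; i < M; i++) {
            float acc = 0.0f;
            for (long l = 0; l < K; l++)
                acc += bf16_val(A[i * lda + l]) * bf16_val(B[j * ldb + l]);
            C[j * ldc + i] += alpha * acc;
        }
}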
+ COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of TN kernels --------------------------------------- */ + +/* --------------------------------------------- TT kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i ++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i ++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for M<=16, N<8, K can be any number +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_tt_16xNxK_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_tt_16xNxK_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + result_512[0] = _mm512_setzero_ps(); + result_512[1] = _mm512_setzero_ps(); + result_512[2] = _mm512_setzero_ps(); + result_512[3] = _mm512_setzero_ps(); + result_512[4] = _mm512_setzero_ps(); + result_512[5] = _mm512_setzero_ps(); + result_512[6] = _mm512_setzero_ps(); + result_512[7] = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 2) { + // Each two rows are a group for 16-pair bf16 elements + // Load two rows into a 512 register + arrayA_512_0 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i ++) { + _MM512_BROADCASTD_EPI32(B_addr + i*2, arrayB_512[i]); + } + B_addr += 16; + + for (int i = 0; i < n; i ++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512_0, (__m512bh) arrayB_512[i]); + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_tt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_tt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... 
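With B transposed, the OTCOPY kernels used below gather a k_step x 8 panel in which the eight B values for consecutive packed columns are adjacent in memory while the k direction strides by ldb. A simplified copy, assuming this file's B(i, j) = b[i*ldb + j] access convention; pack_bt_panel_kx8 is an assumed name, and the real kernel additionally pads each packed column to k_step_round32 and interleaves adjacent bf16 pairs for the dot-product instructions:

#include <stdint.h>

typedef uint16_t bf16_t;

/* copy a k x 8 panel starting at p = &B(idx_k, idx_n) into contiguous storage */
static void pack_bt_panel_kx8(long k, const bf16_t *p, long ldb, bf16_t *dst) {
    for (long j = 0; j < 8; j++)        /* the 8 packed B columns        */
        for (long i = 0; i < k; i++)    /* k direction, stride ldb       */
            *dst++ = p[i * ldb + j];
}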
+ COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(idx_m, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
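Every kernel in this file comes in an _alpha and an _one flavor generated from one body: the source is compiled twice and ONE_ALPHA selects the variant, so the alpha == 1 case never pays the multiply. The same pattern in miniature, with axpy_kernel_alpha / axpy_kernel_one as assumed illustrative names:

#include <stddef.h>

#ifndef ONE_ALPHA               /* general-alpha build */
void axpy_kernel_alpha(size_t n, float alpha, const float *x, float *y)
#else                           /* ONE_ALPHA build     */
void axpy_kernel_one(size_t n, float alpha, const float *x, float *y)
#endif
{
#ifdef ONE_ALPHA
    (void)alpha;                /* kept for a uniform signature, unused here */
#endif
    for (size_t i = 0; i < n; i++)
#ifndef ONE_ALPHA
        y[i] += alpha * x[i];
#else
        y[i] += x[i];           /* alpha is known to be 1, multiply elided */
#endif
}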
+ COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of TT kernels --------------------------------------- */ #ifndef ONE_ALPHA // ALPHA is not ONE void sbgemm_internal_kernel_alpha(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, @@ -613,13 +1806,33 @@ void sbgemm_internal_kernel_one(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_ OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) #endif { - bfloat16 block_A[BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M]; - bfloat16 block_B[BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K]; - - // TODO: assume no trans for both A and B, to complement these scenarios later if (Order == CblasColMajor) { - SBGEMM_BLOCKING_KERNEL_2(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + if (TransA == CblasNoTrans) { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_NN(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_NT(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } + } else { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_TN(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_TT(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } + } } else { - + if (TransA == CblasNoTrans) { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_NN(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_TN(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } + } else { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_NT(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_TT(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } + } } -} \ No newline at end of file +} From 44d0032f3b8e9794d51b7807b3fb53905a2e9f1c Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 5 Aug 2021 04:43:47 +0000 Subject: [PATCH 391/681] Small Matrix: skylakex: fix build error in old compiler --- kernel/x86_64/dgemm_small_kernel_nn_skylakex.c | 4 ++-- kernel/x86_64/dgemm_small_kernel_nt_skylakex.c | 2 +- kernel/x86_64/dgemm_small_kernel_tn_skylakex.c | 4 ++-- kernel/x86_64/dgemm_small_kernel_tt_skylakex.c | 10 +++++----- kernel/x86_64/sgemm_small_kernel_nt_skylakex.c | 2 +- kernel/x86_64/sgemm_small_kernel_tt_skylakex.c | 6 +++--- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c index ff2a04beb..d9b380fff 100644 --- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -372,8 +372,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, }; - __m512i idx_lo = _mm512_loadu_epi64(permute_table); - __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); for (; i < m4; i += 4, mi += 4) { for (j = 0; j < n4; j += 4) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 
0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); diff --git a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c index 0a95a68e2..e757197ba 100644 --- a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c @@ -385,7 +385,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp for (int ii = 0; ii < 8; ii++) { index_n[ii] = ii * ldc; } - __m512i vindex_n = _mm512_loadu_epi64(index_n); + __m512i vindex_n = _mm512_loadu_si512(index_n); for (; i < m4; i += 4) { for (j = 0; j < n32; j += 32) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c index 0881f35b2..18c797283 100644 --- a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c @@ -105,8 +105,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, }; - __m512i idx_lo = _mm512_loadu_epi64(permute_table); - __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); for (i = 0; i < m4; i += 4) { for (j = 0; j < n4; j += 4) { diff --git a/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c index 8ff79d2c8..00f42aa76 100644 --- a/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c @@ -189,8 +189,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 4, 5, 0|8, 1|8, 4|8, 5|8, 2, 3, 6, 7, 2|8, 3|8, 6|8, 7|8, }; - __m512i idx_lo = _mm512_loadu_epi64(permute_table); - __m512i idx_hi = _mm512_loadu_epi64(permute_table + 8); + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); for (i = 0; i < m8; i += 8) { for (j = 0; j < n16; j += 16) { @@ -235,8 +235,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, }; - idx_lo = _mm512_loadu_epi64(permute_table2); - idx_hi = _mm512_loadu_epi64(permute_table2 + 8); + idx_lo = _mm512_loadu_si512(permute_table2); + idx_hi = _mm512_loadu_si512(permute_table2 + 8); for (j = 0; j < n32; j += 32) { DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); @@ -289,7 +289,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp for (int ii = 0; ii < 8; ii++) { index_n[ii] = ii * ldc; } - __m512i vindex_n = _mm512_loadu_epi64(index_n); + __m512i vindex_n = _mm512_loadu_si512(index_n); #if !defined(B0) __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); #endif diff --git a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c index f293bf9f9..a7d87f8c4 100644 --- a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c @@ -385,7 +385,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp for (int ii = 0; ii < 16; ii++) { index_n[ii] = ii * ldc; } - __m512i vindex_n = _mm512_loadu_epi32(index_n); + __m512i vindex_n = _mm512_loadu_si512(index_n); for (; i < m4; i += 4) { for (j = 0; j < n64; j += 64) { 
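The substitution this patch applies is safe because _mm512_loadu_si512 performs the same unaligned 512-bit load as _mm512_loadu_epi64 and _mm512_loadu_epi32 and all three return __m512i; only the latter two are missing from older GCC headers. It is equivalent to the drop-in shim below, with loadu_epi64_compat as an assumed name:

#include <immintrin.h>

/* whole-register unaligned load; the nominal element width is irrelevant */
static inline __m512i loadu_epi64_compat(const void *p) {
    return _mm512_loadu_si512(p);
}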
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); diff --git a/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c index 8da560ef7..023f58746 100644 --- a/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c @@ -215,8 +215,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, }; - __m512i idx_lo = _mm512_loadu_epi32(permute_table); - __m512i idx_hi = _mm512_loadu_epi32(permute_table + 16); + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); __mmask16 kc = 0xcccc; __mmask16 k3 = 0x3333; __mmask8 mask8 = 0xff; // force use AVX128 instead of SSE @@ -311,7 +311,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp for (int ii = 0; ii < 16; ii++) { index_n[ii] = ii * ldc; } - __m512i vindex_n = _mm512_loadu_epi32(index_n); + __m512i vindex_n = _mm512_loadu_si512(index_n); #if !defined(B0) __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); #endif From c17d6dacb23f0862f6f0318c55c097c361132663 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 5 Aug 2021 05:46:13 +0000 Subject: [PATCH 392/681] Small Matrix: skip compile in unimplemented data type --- interface/gemm.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index 775f654c3..3497d8651 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -105,8 +105,13 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #endif }; -#ifndef GEMM3M -#ifdef SMALL_MATRIX_OPT +#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) && !defined(BFLOAT16) +#define USE_SMALL_MATRIX_OPT 1 +#else +#define USE_SMALL_MATRIX_OPT 0 +#endif + +#if USE_SMALL_MATRIX_OPT #ifndef DYNAMIC_ARCH #define SMALL_KERNEL_ADDR(table, idx) ((void *)(table[idx])) #else @@ -148,7 +153,6 @@ static size_t zgemm_small_kernel_b0[] = { #define ZGEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel_b0, (idx)) #endif #endif -#endif #ifndef CBLAS @@ -462,8 +466,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); -#ifndef GEMM3M -#ifdef SMALL_MATRIX_OPT +#if USE_SMALL_MATRIX_OPT #if !defined(COMPLEX) if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){ if(*(FLOAT *)(args.beta) == 0.0){ @@ -483,7 +486,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS return; } #endif -#endif #endif buffer = (XFLOAT *)blas_memory_alloc(0); From e5ba7c3235cd5ac9613e0989621c8d22294def5f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 5 Aug 2021 11:08:18 +0200 Subject: [PATCH 393/681] Disable all x86 jobs --- .travis.yml | 302 ++++++++++++++++++++++++++-------------------------- 1 file changed, 151 insertions(+), 151 deletions(-) diff --git a/.travis.yml b/.travis.yml index 2a221e3bd..8657b64f4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -55,38 +55,38 @@ matrix: - TARGET_BOX=IBMZ_LINUX - BTYPE="BINARY=64 USE_OPENMP=0 CC=clang" - - <<: *test-ubuntu - env: - - 
TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 USE_OPENMP=1" - - - <<: *test-ubuntu - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 INTERFACE64=1" - - - <<: *test-ubuntu - compiler: clang - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 CC=clang" - - - <<: *test-ubuntu - compiler: clang - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 INTERFACE64=1 CC=clang" - - - <<: *test-ubuntu - addons: - apt: - packages: - - gcc-multilib - - gfortran-multilib - env: - - TARGET_BOX=LINUX32 - - BTYPE="BINARY=32" - +# - <<: *test-ubuntu +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 USE_OPENMP=1" +# +# - <<: *test-ubuntu +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 INTERFACE64=1" +# +# - <<: *test-ubuntu +# compiler: clang +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 CC=clang" +# +# - <<: *test-ubuntu +# compiler: clang +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 INTERFACE64=1 CC=clang" +# +# - <<: *test-ubuntu +# addons: +# apt: +# packages: +# - gcc-multilib +# - gfortran-multilib +# env: +# - TARGET_BOX=LINUX32 +# - BTYPE="BINARY=32" +# - os: linux arch: ppc64le dist: bionic @@ -121,47 +121,47 @@ matrix: # for matrix annotation only - TARGET_BOX=PPC64LE_LINUX_P9 - - os: linux - compiler: gcc - addons: - apt: - packages: - - binutils-mingw-w64-x86-64 - - gcc-mingw-w64-x86-64 - - gfortran-mingw-w64-x86-64 - before_script: *common-before - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=WIN64 - - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" - +# - os: linux +# compiler: gcc +# addons: +# apt: +# packages: +# - binutils-mingw-w64-x86-64 +# - gcc-mingw-w64-x86-64 +# - gfortran-mingw-w64-x86-64 +# before_script: *common-before +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - TARGET_BOX=WIN64 +# - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" +# # Build & test on Alpine Linux inside chroot, i.e. on system with musl libc. # These jobs needs sudo, so Travis runs them on VM-based infrastructure # which is slower than container-based infrastructure used for jobs # that don't require sudo. - - &test-alpine - os: linux - dist: trusty - sudo: true - language: minimal - before_install: - - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ - && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" - - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } - install: - - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' - before_script: *common-before - script: - # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. 
- - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" - - alpine make -C test $COMMON_FLAGS $BTYPE - - alpine make -C ctest $COMMON_FLAGS $BTYPE - - alpine make -C utest $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64" + # - &test-alpine + # os: linux + # dist: trusty + # sudo: true + # language: minimal + # before_install: + # - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ + # && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" + # - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } + # install: + # - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' + # before_script: *common-before + # script: + # # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. + # - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + # CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" + # - alpine make -C test $COMMON_FLAGS $BTYPE + # - alpine make -C ctest $COMMON_FLAGS $BTYPE + # - alpine make -C utest $COMMON_FLAGS $BTYPE + # env: + # - TARGET_BOX=LINUX64_MUSL + # - BTYPE="BINARY=64" # XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS, # but only on Travis CI, cannot reproduce it elsewhere. @@ -171,98 +171,98 @@ matrix: # - TARGET_BOX=LINUX64_MUSL # - BTYPE="BINARY=64 USE_OPENMP=1" - - <<: *test-alpine - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64 INTERFACE64=1" +# - <<: *test-alpine +# env: +# - TARGET_BOX=LINUX64_MUSL +# - BTYPE="BINARY=64 INTERFACE64=1" +# +# # Build with the same flags as Alpine do in OpenBLAS package. +# - <<: *test-alpine +# env: +# - TARGET_BOX=LINUX64_MUSL +# - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" - # Build with the same flags as Alpine do in OpenBLAS package. - - <<: *test-alpine - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" +# - &test-cmake +# os: linux +# compiler: clang +# addons: +# apt: +# packages: +# - gfortran +# - cmake +# dist: trusty +# sudo: true +# before_script: +# - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" +# script: +# - mkdir build +# - CONFIG=Release +# - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG +# - cmake --build build --config $CONFIG -- -j2 +# env: +# - CMAKE=1 +# - <<: *test-cmake +# env: +# - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1" +# - <<: *test-cmake +# compiler: gcc +# env: +# - CMAKE=1 - - &test-cmake - os: linux - compiler: clang - addons: - apt: - packages: - - gfortran - - cmake - dist: trusty - sudo: true - before_script: - - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" - script: - - mkdir build - - CONFIG=Release - - cmake -Bbuild -H. 
$CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG - - cmake --build build --config $CONFIG -- -j2 - env: - - CMAKE=1 - - <<: *test-cmake - env: - - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1" - - <<: *test-cmake - compiler: gcc - env: - - CMAKE=1 - - - &test-macos - os: osx - osx_image: xcode11.5 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" - - - <<: *test-macos - osx_image: xcode12 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10" - - - <<: *test-macos - osx_image: xcode12 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" +# - &test-macos +# os: osx +# osx_image: xcode11.5 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" +# +# - <<: *test-macos +# osx_image: xcode12 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# - brew update +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10" +# +# - <<: *test-macos +# osx_image: xcode12 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# - brew update +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" # - <<: *test-macos # osx_image: xcode10 # env: # - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" - - <<: *test-macos - osx_image: xcode11.5 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - env: +# - <<: *test-macos +# osx_image: xcode11.5 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# - brew update +# env: # - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" # - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" - - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" - - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" - - <<: *test-macos - osx_image: xcode11.5 - env: -# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" -# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" - - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot 
/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" - - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" +# - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" +# - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" +# - <<: *test-macos +# osx_image: xcode11.5 +# env: +## - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +## - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" +# - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" +# - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" - &test-graviton2 os: linux From b06880c2cdfc8a0bd5caa2c1d62f7bba3611b932 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 10 Aug 2021 22:06:04 -0500 Subject: [PATCH 394/681] POWER10: Improving dasum performance Unrolling a loop in dasum micro code to help in improving POWER10 performance. --- kernel/power/dasum.c | 4 +- kernel/power/dasum_microk_power10.c | 120 ++++++++++++++++++++++++---- 2 files changed, 106 insertions(+), 18 deletions(-) diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 7507621cf..35390dd24 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -115,14 +115,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) - if ( n >= 16 ) + if ( n >= 32) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; for (i = 0; i < align; i++) { sumf += ABS(x[i]); } } - n1 = (n-i) & -16; + n1 = (n-i) & -32; if ( n1 > 0 ) { sumf += dasum_kernel_16(n1, &x[i]); diff --git a/kernel/power/dasum_microk_power10.c b/kernel/power/dasum_microk_power10.c index d1a21b4d1..110627fa4 100644 --- a/kernel/power/dasum_microk_power10.c +++ b/kernel/power/dasum_microk_power10.c @@ -34,6 +34,19 @@ static double dasum_kernel_16 (long n, double *x) __vector double t1; __vector double t2; __vector double t3; + __vector double t4; + __vector double t5; + __vector double t6; + __vector double t7; + __vector double a0; + __vector double a1; + __vector double a2; + __vector double a3; + __vector double a4; + __vector double a5; + __vector double a6; + __vector double a7; + __asm__ ( @@ -48,14 +61,27 @@ static double dasum_kernel_16 (long n, double *x) "xxlxor 38, 38, 38 \n\t" "xxlxor 39, 39, 39 \n\t" + "xxlxor %x11, %x11, %x11 \n\t" + "xxlxor %x12, %x12, %x12 \n\t" + "xxlxor %x13, %x13, %x13 \n\t" + "xxlxor %x14, %x14, %x14 \n\t" + "xxlxor %x15, %x15, %x15 \n\t" + "xxlxor %x16, %x16, %x16 \n\t" + "xxlxor %x17, %x17, %x17 \n\t" + "xxlxor %x18, %x18, %x18 \n\t" + "lxvp 40, 0(%2) \n\t" "lxvp 42, 32(%2) \n\t" "lxvp 44, 64(%2) \n\t" "lxvp 46, 96(%2) \n\t" + "lxvp 52, 128(%2) \n\t" + "lxvp 54, 160(%2) \n\t" + "lxvp 56, 192(%2) \n\t" + "lxvp 58, 224(%2) \n\t" - "addi %2, %2, 128 
\n\t" + "addi %2, %2, 256 \n\t" - "addic. %1, %1, -16 \n\t" + "addic. %1, %1, -32 \n\t" "ble two%= \n\t" ".align 5 \n" @@ -65,33 +91,52 @@ static double dasum_kernel_16 (long n, double *x) "xvabsdp 49, 41 \n\t" "xvabsdp 50, 42 \n\t" "xvabsdp 51, 43 \n\t" - "lxvp 40, 0(%2) \n\t" - "xvabsdp %x3, 44 \n\t" "xvabsdp %x4, 45 \n\t" - "lxvp 42, 32(%2) \n\t" - - "xvabsdp %x5, 46 \n\t" "xvabsdp %x6, 47 \n\t" - "lxvp 44, 64(%2) \n\t" - "xvadddp 32, 32, 48 \n\t" "xvadddp 33, 33, 49 \n\t" - - "lxvp 46, 96(%2) \n\t" - "xvadddp 34, 34, 50 \n\t" "xvadddp 35, 35, 51 \n\t" - "addi %2, %2, 128 \n\t" + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "xvadddp 36, 36, %x3 \n\t" "xvadddp 37, 37, %x4 \n\t" - "addic. %1, %1, -16 \n\t" "xvadddp 38, 38, %x5 \n\t" "xvadddp 39, 39, %x6 \n\t" + "xvabsdp 60, 52 \n\t" + "xvabsdp 61, 53 \n\t" + "xvabsdp 62, 54 \n\t" + "xvabsdp 63, 55 \n\t" + + "xvabsdp %x7, 56 \n\t" + "xvabsdp %x8, 57 \n\t" + "xvabsdp %x9, 58 \n\t" + "xvabsdp %x10, 59 \n\t" + + "xvadddp %x11, %x11, 60 \n\t" + "xvadddp %x12, %x12, 61 \n\t" + "xvadddp %x13, %x13, 62 \n\t" + "xvadddp %x14, %x14, 63 \n\t" + + "lxvp 52, 128(%2) \n\t" + "lxvp 54, 160(%2) \n\t" + "lxvp 56, 192(%2) \n\t" + "lxvp 58, 224(%2) \n\t" + "xvadddp %x15, %x15, %x7 \n\t" + "xvadddp %x16, %x16, %x8 \n\t" + "xvadddp %x17, %x17, %x9 \n\t" + "xvadddp %x18, %x18, %x10 \n\t" + "addi %2, %2, 256 \n\t" + "addic. %1, %1, -32 \n\t" + "bgt one%= \n" "two%=: \n\t" @@ -114,6 +159,25 @@ static double dasum_kernel_16 (long n, double *x) "xvadddp 38, 38, %x5 \n\t" "xvadddp 39, 39, %x6 \n\t" + "xvabsdp 60, 52 \n\t" + "xvabsdp 61, 53 \n\t" + "xvabsdp 62, 54 \n\t" + "xvabsdp 63, 55 \n\t" + + "xvabsdp %x7, 56 \n\t" + "xvabsdp %x8, 57 \n\t" + "xvabsdp %x9, 58 \n\t" + "xvabsdp %x10, 59 \n\t" + "xvadddp %x11, %x11, 60 \n\t" + "xvadddp %x12, %x12, 61 \n\t" + "xvadddp %x13, %x13, 62 \n\t" + "xvadddp %x14, %x14, 63 \n\t" + + "xvadddp %x15, %x15, %x7 \n\t" + "xvadddp %x16, %x16, %x8 \n\t" + "xvadddp %x17, %x17, %x9 \n\t" + "xvadddp %x18, %x18, %x10 \n\t" + "xvadddp 32, 32, 33 \n\t" "xvadddp 34, 34, 35 \n\t" "xvadddp 36, 36, 37 \n\t" @@ -122,7 +186,18 @@ static double dasum_kernel_16 (long n, double *x) "xvadddp 32, 32, 34 \n\t" "xvadddp 36, 36, 38 \n\t" + "xvadddp %x11, %x11, %x12 \n\t" + "xvadddp %x13, %x13, %x14 \n\t" + "xvadddp %x15, %x15, %x16 \n\t" + "xvadddp %x17, %x17, %x18 \n\t" + + "xvadddp %x11, %x11, %x13 \n\t" + "xvadddp %x15, %x15, %x17 \n\t" + + "xvadddp %x11, %x11, %x15 \n\t" + "xvadddp 32, 32, 36 \n\t" + "xvadddp 32, 32, %x11 \n\t" XXSWAPD_S(33,32) "xsadddp %x0, 32, 33 \n" @@ -136,14 +211,27 @@ static double dasum_kernel_16 (long n, double *x) "=wa" (t0), // 3 "=wa" (t1), // 4 "=wa" (t2), // 5 - "=wa" (t3) // 6 + "=wa" (t3), // 6 + "=wa" (t4), // 7 + "=wa" (t5), // 8 + "=wa" (t6), // 9 + "=wa" (t7), // 10 + "=wa" (a0), // 11 + "=wa" (a1), // 12 + "=wa" (a2), // 13 + "=wa" (a3), // 14 + "=wa" (a4), // 15 + "=wa" (a5), // 16 + "=wa" (a6), // 17 + "=wa" (a7) // 18 : "m" (*x) : "cr0", "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", - "vs48","vs49","vs50","vs51" + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" ); return sum; From c28560129f65c212eba0093e99f4c9163856bffa Mon Sep 17 00:00:00 2001 From: cianciosa Date: Wed, 11 Aug 2021 12:00:07 -0400 Subject: [PATCH 395/681] Check the total number of arguments passed insead of if the ARGV# is defined. 
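CMake documents that referencing ARGV# beyond the number of arguments
actually passed (ARGC) has undefined behavior, and in practice an ARGV#
value left over from an enclosing function call can still test as
DEFINED. When OpenBLAS is built as a subproject of a larger CMake
project (e.g. via add_subdirectory()), that could make the
optional-argument checks in utils.cmake fire spuriously, so each check
now compares the argument position against ${ARGC} instead. A minimal
sketch of the two patterns (illustrative only, not part of the patch;
the function name "demo" is made up):

    function(demo required_arg)
      # Fragile: ARGV1 may still hold a value from an outer call, so
      # DEFINED can report true even when this call passed no second
      # argument.
      if (DEFINED ARGV1)
        message(STATUS "DEFINED saw: ${ARGV1}")
      endif ()
      # Robust: ARGC always counts the arguments of this invocation.
      if (${ARGC} GREATER 1)
        message(STATUS "ARGC saw: ${ARGV1}")
      endif ()
    endfunction()

    demo(one two)   # both messages print
    demo(one)       # only the fragile DEFINED test can misfire here
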
This fixes a problem when compiling OpenBLAS as a subproject of another
project.
---
 cmake/utils.cmake | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 6b54092ea..09bae7011 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -157,31 +157,31 @@ endfunction ()
 # STRING - compiles only the given type (e.g. DOUBLE)
 function(GenerateNamedObjects sources_in)
 
-  if (DEFINED ARGV1)
+  if (${ARGC} GREATER 1)
     set(defines_in ${ARGV1})
   endif ()
 
-  if (DEFINED ARGV2 AND NOT "${ARGV2}" STREQUAL "")
+  if (${ARGC} GREATER 2 AND NOT "${ARGV2}" STREQUAL "")
     set(name_in ${ARGV2})
     # strip off extension for kernel files that pass in the object name.
     get_filename_component(name_in ${name_in} NAME_WE)
   endif ()
 
-  if (DEFINED ARGV3)
+  if (${ARGC} GREATER 3)
     set(use_cblas ${ARGV3})
   else ()
     set(use_cblas false)
   endif ()
 
-  if (DEFINED ARGV4)
+  if (${ARGC} GREATER 4)
     set(replace_last_with ${ARGV4})
   endif ()
 
-  if (DEFINED ARGV5)
+  if (${ARGC} GREATER 5)
     set(append_with ${ARGV5})
   endif ()
 
-  if (DEFINED ARGV6)
+  if ${ARGC} GREATER 6)
     set(no_float_type ${ARGV6})
   else ()
     set(no_float_type false)
@@ -196,7 +196,7 @@ function(GenerateNamedObjects sources_in)
   set(real_only false)
   set(complex_only false)
   set(mangle_complex_sources false)
-  if (DEFINED ARGV7 AND NOT "${ARGV7}" STREQUAL "")
+  if (${ARGC} GREATER 7 AND NOT "${ARGV7}" STREQUAL "")
     if (${ARGV7} EQUAL 1)
       set(real_only true)
     elseif (${ARGV7} EQUAL 2)
@@ -342,17 +342,17 @@ endfunction ()
 function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme)
 
   set(alternate_name_in "")
-  if (DEFINED ARGV5)
+  if (${ARGC} GREATER 5)
     set(alternate_name_in ${ARGV5})
   endif ()
 
   set(no_float_type false)
-  if (DEFINED ARGV6)
+  if (${ARGC} GREATER 6)
     set(no_float_type ${ARGV6})
   endif ()
 
   set(complex_filename_scheme "")
-  if (DEFINED ARGV7)
+  if (${ARGC} GREATER 7)
     set(complex_filename_scheme ${ARGV7})
   endif ()
 

From 4c766cd11fa3f27ed1b572225ab2e937e43a2bab Mon Sep 17 00:00:00 2001
From: cianciosa
Date: Wed, 11 Aug 2021 12:08:34 -0400
Subject: [PATCH 396/681] Fix a small syntax error. A ( was accidentally
 deleted.
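
The previous commit accidentally dropped the opening parenthesis from
one of the new checks, leaving "if ${ARGC} GREATER 6)"; CMake requires
the condition of an if() command to be parenthesized, so configuration
aborted with a parse error. This restores the "(".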
--- cmake/utils.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 09bae7011..01b489f2a 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -181,7 +181,7 @@ function(GenerateNamedObjects sources_in) set(append_with ${ARGV5}) endif () - if ${ARGC} GREATER 6) + if (${ARGC} GREATER 6) set(no_float_type ${ARGV6}) else () set(no_float_type false) From a7bc8ec1f107a95a18cfcdbd5c47721abfa75cb9 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 10 Aug 2021 16:42:57 +0800 Subject: [PATCH 397/681] Delete the macro instruction "li" and use "li.d" instead Change-Id: Icff7981e2eb7df29ba5af1f8eb5be8443c67450f --- kernel/loongarch64/asum.S | 2 +- kernel/loongarch64/cnrm2.S | 2 +- kernel/loongarch64/copy.S | 2 +- kernel/loongarch64/dot.S | 2 +- kernel/loongarch64/gemv_n.S | 4 ++-- kernel/loongarch64/gemv_t.S | 2 +- kernel/loongarch64/iamax.S | 12 ++++++------ kernel/loongarch64/iamin.S | 12 ++++++------ kernel/loongarch64/izamax.S | 12 ++++++------ kernel/loongarch64/izamin.S | 12 ++++++------ kernel/loongarch64/scal.S | 2 +- kernel/loongarch64/snrm2.S | 2 +- kernel/loongarch64/swap.S | 2 +- kernel/loongarch64/zcopy.S | 2 +- kernel/loongarch64/zdot.S | 2 +- kernel/loongarch64/zgemv_n.S | 4 ++-- kernel/loongarch64/zgemv_t.S | 2 +- kernel/loongarch64/zscal.S | 2 +- 18 files changed, 40 insertions(+), 40 deletions(-) diff --git a/kernel/loongarch64/asum.S b/kernel/loongarch64/asum.S index e4c717085..7d21ce038 100644 --- a/kernel/loongarch64/asum.S +++ b/kernel/loongarch64/asum.S @@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MTC s1, $r0 MTC s2, $r0 slli.d INCX, INCX, BASE_SHIFT - li TEMP, SIZE + li.d TEMP, SIZE bge $r0, N, .L999 srai.d I, N, 3 bne INCX, TEMP, .L20 diff --git a/kernel/loongarch64/cnrm2.S b/kernel/loongarch64/cnrm2.S index c4b2555d3..9d27987e1 100644 --- a/kernel/loongarch64/cnrm2.S +++ b/kernel/loongarch64/cnrm2.S @@ -57,7 +57,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif movgr2fr.d s1, $r0 - li TEMP, 2 * SIZE + li.d TEMP, 2 * SIZE fmov.d s2, s1 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT diff --git a/kernel/loongarch64/copy.S b/kernel/loongarch64/copy.S index 28b7bce4c..3156f60b8 100644 --- a/kernel/loongarch64/copy.S +++ b/kernel/loongarch64/copy.S @@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCY, 0(INCY) #endif - li TEMP, SIZE + li.d TEMP, SIZE NOP slli.d INCX, INCX, BASE_SHIFT bge $r0, N, .L999 diff --git a/kernel/loongarch64/dot.S b/kernel/loongarch64/dot.S index 4fcd569c8..1e4c81a02 100644 --- a/kernel/loongarch64/dot.S +++ b/kernel/loongarch64/dot.S @@ -57,7 +57,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MTC s1, $r0 MTC s2, $r0 slli.d INCX, INCX, BASE_SHIFT - li TEMP, SIZE + li.d TEMP, SIZE slli.d INCY, INCY, BASE_SHIFT bge $r0, N, .L999 srai.d I, N, 3 diff --git a/kernel/loongarch64/gemv_n.S b/kernel/loongarch64/gemv_n.S index 334a2991f..9ab43ae19 100644 --- a/kernel/loongarch64/gemv_n.S +++ b/kernel/loongarch64/gemv_n.S @@ -91,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, M, .L999 slli.d INCY, INCY, BASE_SHIFT bge $r0, N, .L999 - li I, SIZE + li.d I, SIZE move YORIG, Y beq INCY, I, .L10 srai.d I, M, 2 @@ -472,7 +472,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L900: - li YORIG, SIZE + li.d YORIG, SIZE srai.d I, M, 2 beq INCY, YORIG, .L999 move XX, BUFFER diff --git a/kernel/loongarch64/gemv_t.S b/kernel/loongarch64/gemv_t.S index 19333ed4a..af4232769 100644 --- a/kernel/loongarch64/gemv_t.S +++ b/kernel/loongarch64/gemv_t.S @@ -88,7 +88,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, M, .L999 slli.d INCY, INCY, BASE_SHIFT bge $r0, N, .L999 - li I, SIZE + li.d I, SIZE move XORIG, X beq INCX, I, .L10 srai.d I, M, 2 diff --git a/kernel/loongarch64/iamax.S b/kernel/loongarch64/iamax.S index 0f9e1bc59..31b1a9e57 100644 --- a/kernel/loongarch64/iamax.S +++ b/kernel/loongarch64/iamax.S @@ -62,24 +62,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif - li x1, 0 + li.d x1, 0 bge $r0, N, .L999 slli.d INCX, INCX, BASE_SHIFT bge $r0, INCX, .L999 LD a1, X, 0 * SIZE addi.d N, N, -1 - li x1, 1 + li.d x1, 1 bge $r0, N, .L999 FABS s1, a1 add.d X, X, INCX FABS s2, a1 - li x2, 1 + li.d x2, 1 FABS s3, a1 srai.d I, N, 3 FABS s4, a1 - li x3, 1 - li TEMP, 2 - li x4, 1 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 bge $r0, I, .L15 LD a1, X, 0 * SIZE add.d X, X, INCX diff --git a/kernel/loongarch64/iamin.S b/kernel/loongarch64/iamin.S index 7751a9d03..9364b9725 100644 --- a/kernel/loongarch64/iamin.S +++ b/kernel/loongarch64/iamin.S @@ -62,24 +62,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif - li x1, 0 + li.d x1, 0 bge $r0, N, .L999 slli.d INCX, INCX, BASE_SHIFT bge $r0, INCX, .L999 LD a1, X, 0 * SIZE addi.d N, N, -1 - li x1, 1 + li.d x1, 1 bge $r0, N, .L999 FABS s1, a1 add.d X, X, INCX FABS s2, a1 - li x2, 1 + li.d x2, 1 FABS s3, a1 srai.d I, N, 3 FABS s4, a1 - li x3, 1 - li TEMP, 2 - li x4, 1 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 bge $r0, I, .L15 LD a1, X, 0 * SIZE add.d X, X, INCX diff --git a/kernel/loongarch64/izamax.S b/kernel/loongarch64/izamax.S index 6d7cb9e30..8d3ae529e 100644 --- a/kernel/loongarch64/izamax.S +++ b/kernel/loongarch64/izamax.S @@ -66,7 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif - li x1, 0 + li.d x1, 0 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT bge $r0, INCX, .L999 @@ -79,14 +79,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ADD s3, t1, t2 ADD s4, t1, t2 addi.d N, N, -1 - li x1, 1 + li.d x1, 1 bge $r0, N, .L999 add.d X, X, INCX - li x2, 1 + li.d x2, 1 srai.d I, N, 2 - li x3, 1 - li TEMP, 2 - li x4, 1 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 bge $r0, I, .L15 LD a1, X, 0 * SIZE LD a2, X, 1 * SIZE diff --git a/kernel/loongarch64/izamin.S b/kernel/loongarch64/izamin.S index 998927985..38a109c21 100644 --- a/kernel/loongarch64/izamin.S +++ b/kernel/loongarch64/izamin.S @@ -66,7 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif - li x1, 0 + li.d x1, 0 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT bge $r0, INCX, .L999 @@ -79,14 +79,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ADD s3, t1, t2 ADD s4, t1, t2 addi.d N, N, -1 - li x1, 1 + li.d x1, 1 bge $r0, N, .L999 add.d X, X, INCX - li x2, 1 + li.d x2, 1 srai.d I, N, 2 - li x3, 1 - li TEMP, 2 - li x4, 1 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 bge $r0, I, .L15 LD a1, X, 0 * SIZE LD a2, X, 1 * SIZE diff --git a/kernel/loongarch64/scal.S b/kernel/loongarch64/scal.S index 7399e57b3..566bce6cb 100644 --- a/kernel/loongarch64/scal.S +++ b/kernel/loongarch64/scal.S @@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE - li TEMP, SIZE + li.d TEMP, SIZE MTC a1, $r0 slli.d INCX, INCX, BASE_SHIFT bge $r0, N, .L999 diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S index 14b62cfe7..57c21a017 100644 --- a/kernel/loongarch64/snrm2.S +++ b/kernel/loongarch64/snrm2.S @@ -57,7 +57,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif movgr2fr.d s1, $r0 - li TEMP, SIZE + li.d TEMP, SIZE fmov.d s2, s1 bge $r0, N, .L999 slli.d INCX, INCX, BASE_SHIFT diff --git a/kernel/loongarch64/swap.S b/kernel/loongarch64/swap.S index c9d8f7fc1..4578a8d54 100644 --- a/kernel/loongarch64/swap.S +++ b/kernel/loongarch64/swap.S @@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE - li TEMP, SIZE + li.d TEMP, SIZE slli.d INCX, INCX, BASE_SHIFT bge $r0, N, .L999 slli.d INCY, INCY, BASE_SHIFT diff --git a/kernel/loongarch64/zcopy.S b/kernel/loongarch64/zcopy.S index 3fbe56074..0f480ca85 100644 --- a/kernel/loongarch64/zcopy.S +++ b/kernel/loongarch64/zcopy.S @@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCY, 0(INCY) #endif - li TEMP, 2 * SIZE + li.d TEMP, 2 * SIZE NOP slli.d INCX, INCX, ZBASE_SHIFT bge $r0, N, .L999 diff --git a/kernel/loongarch64/zdot.S b/kernel/loongarch64/zdot.S index 087c3845f..81ac19fbd 100644 --- a/kernel/loongarch64/zdot.S +++ b/kernel/loongarch64/zdot.S @@ -62,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MOV s3, s2 MOV s4, s3 slli.d INCX, INCX, ZBASE_SHIFT - li TEMP, 2 * SIZE + li.d TEMP, 2 * SIZE slli.d INCY, INCY, ZBASE_SHIFT bge $r0, N, .L999 srai.d I, N, 2 diff --git a/kernel/loongarch64/zgemv_n.S b/kernel/loongarch64/zgemv_n.S index 0cc49c789..d995ce86b 100644 --- a/kernel/loongarch64/zgemv_n.S +++ b/kernel/loongarch64/zgemv_n.S @@ -123,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, M, .L999 slli.d INCY, INCY, ZBASE_SHIFT bge $r0, N, .L999 - li I, 2 * SIZE + li.d I, 2 * SIZE move YORIG, Y beq INCY, I, .L10 srai.d I, M, 2 @@ -576,7 +576,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L900: - li YORIG, 2 * SIZE + li.d YORIG, 2 * SIZE srai.d I, M, 2 beq INCY, YORIG, .L999 move XX, BUFFER diff --git a/kernel/loongarch64/zgemv_t.S b/kernel/loongarch64/zgemv_t.S index 85a9a0c0d..841823e1c 100644 --- a/kernel/loongarch64/zgemv_t.S +++ b/kernel/loongarch64/zgemv_t.S @@ -116,7 +116,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, M, .L999 slli.d INCY, INCY, ZBASE_SHIFT bge $r0, N, .L999 - li I, 2 * SIZE + li.d I, 2 * SIZE move XORIG, X beq INCX, I, .L10 srai.d I, M, 2 diff --git a/kernel/loongarch64/zscal.S b/kernel/loongarch64/zscal.S index fe53ed713..a12e527a5 100644 --- a/kernel/loongarch64/zscal.S +++ b/kernel/loongarch64/zscal.S @@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PROLOGUE - li TEMP, 2 * SIZE + li.d TEMP, 2 * SIZE MTC a1, $r0 slli.d INCX, INCX, ZBASE_SHIFT bge $r0, N, .L999 From 989e6bbdd39fe3d49789b803c4fd6b20a3a673e5 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 13 Aug 2021 03:17:38 +0000 Subject: [PATCH 398/681] Small Matrix: reduce generic kernel source files --- kernel/CMakeLists.txt | 56 ++++----- kernel/Makefile.L3 | 112 +++++++++--------- .../generic/gemm_small_matrix_kernel_b0_nn.c | 49 -------- .../generic/gemm_small_matrix_kernel_b0_nt.c | 49 -------- .../generic/gemm_small_matrix_kernel_b0_tn.c | 49 -------- .../generic/gemm_small_matrix_kernel_b0_tt.c | 49 -------- kernel/generic/gemm_small_matrix_kernel_nn.c | 11 +- kernel/generic/gemm_small_matrix_kernel_nt.c | 9 +- kernel/generic/gemm_small_matrix_kernel_tn.c | 8 ++ kernel/generic/gemm_small_matrix_kernel_tt.c | 8 ++ .../generic/zgemm_small_matrix_kernel_b0_nn.c | 74 ------------ .../generic/zgemm_small_matrix_kernel_b0_nt.c | 77 ------------ .../generic/zgemm_small_matrix_kernel_b0_tn.c | 77 ------------ .../generic/zgemm_small_matrix_kernel_b0_tt.c | 77 ------------ kernel/generic/zgemm_small_matrix_kernel_nn.c | 11 ++ kernel/generic/zgemm_small_matrix_kernel_nt.c | 11 ++ kernel/generic/zgemm_small_matrix_kernel_tn.c | 11 ++ kernel/generic/zgemm_small_matrix_kernel_tt.c | 11 ++ 18 files changed, 161 insertions(+), 588 deletions(-) delete mode 100644 kernel/generic/gemm_small_matrix_kernel_b0_nn.c delete mode 100644 kernel/generic/gemm_small_matrix_kernel_b0_nt.c delete mode 100644 kernel/generic/gemm_small_matrix_kernel_b0_tn.c delete mode 100644 kernel/generic/gemm_small_matrix_kernel_b0_tt.c delete mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_nn.c delete mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_nt.c delete mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_tn.c delete mode 100644 kernel/generic/zgemm_small_matrix_kernel_b0_tt.c diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 769a73b91..d8a230436 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -495,30 +495,30 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) endif () if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NN) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") - set(${float_char}GEMM_SMALL_K_B0_NN ../generic/zgemm_small_matrix_kernel_b0_nn.c) + set(${float_char}GEMM_SMALL_K_B0_NN ../generic/zgemm_small_matrix_kernel_nn.c) else () - set(${float_char}GEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_b0_nn.c) + set(${float_char}GEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) endif () endif () if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NT) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") - set(${float_char}GEMM_SMALL_K_B0_NT ../generic/zgemm_small_matrix_kernel_b0_nt.c) + set(${float_char}GEMM_SMALL_K_B0_NT ../generic/zgemm_small_matrix_kernel_nt.c) else () - set(${float_char}GEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_b0_nt.c) + set(${float_char}GEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) endif () endif () if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TN) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") - set(${float_char}GEMM_SMALL_K_B0_TN ../generic/zgemm_small_matrix_kernel_b0_tn.c) + set(${float_char}GEMM_SMALL_K_B0_TN ../generic/zgemm_small_matrix_kernel_tn.c) else () - set(${float_char}GEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_b0_tn.c) + set(${float_char}GEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) endif () endif () if 
(NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TT) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") - set(${float_char}GEMM_SMALL_K_B0_TT ../generic/zgemm_small_matrix_kernel_b0_tt.c) + set(${float_char}GEMM_SMALL_K_B0_TT ../generic/zgemm_small_matrix_kernel_tt.c) else () - set(${float_char}GEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_b0_tt.c) + set(${float_char}GEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) endif () endif () @@ -541,32 +541,32 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TC" "gemm_small_kernel_tc" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CT" "gemm_small_kernel_ct" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CC" "gemm_small_kernel_cc" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NN" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NR" "gemm_small_kernel_b0_nr" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RN" "gemm_small_kernel_b0_rn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RR" "gemm_small_kernel_b0_rr" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NT" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NC" "gemm_small_kernel_b0_nc" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RT" "gemm_small_kernel_b0_rt" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RC" "gemm_small_kernel_b0_rc" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TN" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TR" "gemm_small_kernel_b0_tr" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CN" "gemm_small_kernel_b0_cn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CR" "gemm_small_kernel_b0_cr" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TT" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TC" "gemm_small_kernel_b0_tc" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CT" "gemm_small_kernel_b0_ct" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CC" "gemm_small_kernel_b0_cc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NN;B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NR;B0" "gemm_small_kernel_b0_nr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RN;B0" "gemm_small_kernel_b0_rn" false "" "" false ${float_type}) + 
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RR;B0" "gemm_small_kernel_b0_rr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NT;B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NC;B0" "gemm_small_kernel_b0_nc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RT;B0" "gemm_small_kernel_b0_rt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RC;B0" "gemm_small_kernel_b0_rc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TN;B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TR;B0" "gemm_small_kernel_b0_tr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CN;B0" "gemm_small_kernel_b0_cn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CR;B0" "gemm_small_kernel_b0_cr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TT;B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TC;B0" "gemm_small_kernel_b0_tc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CT;B0" "gemm_small_kernel_b0_ct" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CC;B0" "gemm_small_kernel_b0_cc" false "" "" false ${float_type}) else () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) endif () endif () diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index f977793a0..ef11e391c 100644 --- a/kernel/Makefile.L3 
+++ b/kernel/Makefile.L3 @@ -4334,32 +4334,32 @@ $(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ ifndef DGEMM_SMALL_K_B0_NN -DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c +DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c endif ifndef DGEMM_SMALL_K_B0_NT -DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c +DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c endif ifndef DGEMM_SMALL_K_B0_TN -DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c +DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c endif ifndef DGEMM_SMALL_K_B0_TT -DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c +DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c endif $(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ ifndef SGEMM_SMALL_M_PERMIT SGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c @@ -4397,32 +4397,32 @@ $(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ ifndef SGEMM_SMALL_K_B0_NN -SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c +SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c endif ifndef SGEMM_SMALL_K_B0_NT -SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c +SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c endif ifndef SGEMM_SMALL_K_B0_TN -SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c +SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c endif ifndef SGEMM_SMALL_K_B0_TT -SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c +SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c endif $(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ ifndef CGEMM_SMALL_M_PERMIT CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c @@ -4496,68 +4496,68 @@ $(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ ifndef CGEMM_SMALL_K_B0_NN 
-CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_b0_nn.c +CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c endif ifndef CGEMM_SMALL_K_B0_NT -CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_b0_nt.c +CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c endif ifndef CGEMM_SMALL_K_B0_TN -CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_b0_tn.c +CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c endif ifndef CGEMM_SMALL_K_B0_TT -CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_b0_tt.c +CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c endif $(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ ifndef ZGEMM_SMALL_M_PERMIT ZGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c @@ -4632,65 +4632,65 @@ $(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ ifndef ZGEMM_SMALL_K_B0_NN -ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_b0_nn.c +ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c endif ifndef ZGEMM_SMALL_K_B0_NT -ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_b0_nt.c +ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c endif ifndef ZGEMM_SMALL_K_B0_TN -ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_b0_tn.c +ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c endif ifndef ZGEMM_SMALL_K_B0_TT -ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_b0_tt.c +ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c endif $(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ diff --git a/kernel/generic/gemm_small_matrix_kernel_b0_nn.c b/kernel/generic/gemm_small_matrix_kernel_b0_nn.c deleted file mode 100644 index 3be918017..000000000 --- a/kernel/generic/gemm_small_matrix_kernel_b0_nn.c +++ /dev/null @@ -1,49 +0,0 @@ -/*************************************************************************** -Copyright (c) 2020, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" - -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) -{ - //naive implemtation - //Column major - - BLASLONG i,j,k; - FLOAT result=0.0; - - for(i=0; i Date: Fri, 13 Aug 2021 03:28:44 +0000 Subject: [PATCH 399/681] Small Matrix: skylakex: remove unnecessary b0 source files --- kernel/x86_64/KERNEL.SKYLAKEX | 16 ++++++++-------- .../x86_64/dgemm_small_kernel_b0_nn_skylakex.c | 2 -- .../x86_64/dgemm_small_kernel_b0_nt_skylakex.c | 2 -- .../x86_64/dgemm_small_kernel_b0_tn_skylakex.c | 2 -- .../x86_64/dgemm_small_kernel_b0_tt_skylakex.c | 2 -- .../x86_64/sgemm_small_kernel_b0_nn_skylakex.c | 2 -- .../x86_64/sgemm_small_kernel_b0_nt_skylakex.c | 2 -- .../x86_64/sgemm_small_kernel_b0_tn_skylakex.c | 2 -- .../x86_64/sgemm_small_kernel_b0_tt_skylakex.c | 3 --- 9 files changed, 8 insertions(+), 25 deletions(-) delete mode 100644 kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c delete mode 100644 kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c delete mode 100644 kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c delete mode 100644 kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c delete mode 100644 kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c delete mode 100644 kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c delete mode 100644 kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c delete mode 100644 kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index eb0cbaf98..6b4961bc2 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -12,13 +12,13 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SGEMM_SMALL_M_PERMIT = sgemm_small_kernel_permit_skylakex.c SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_skylakex.c -SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_b0_nn_skylakex.c +SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_nn_skylakex.c SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_skylakex.c -SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_b0_nt_skylakex.c +SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_nt_skylakex.c SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_skylakex.c -SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_b0_tn_skylakex.c +SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_tn_skylakex.c SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_skylakex.c -SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_b0_tt_skylakex.c +SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_tt_skylakex.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c @@ -29,13 +29,13 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DGEMM_SMALL_M_PERMIT = dgemm_small_kernel_permit_skylakex.c DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c -DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_b0_nn_skylakex.c +DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_skylakex.c -DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_b0_nt_skylakex.c +DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_nt_skylakex.c DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_skylakex.c -DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_b0_tn_skylakex.c +DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_skylakex.c DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_skylakex.c -DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_b0_tt_skylakex.c +DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_skylakex.c SGEMM_BETA = sgemm_beta_skylakex.c DGEMM_BETA = 
diff --git a/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c deleted file mode 100644 index a58738a25..000000000 --- a/kernel/x86_64/dgemm_small_kernel_b0_nn_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./dgemm_small_kernel_nn_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c deleted file mode 100644 index eafe2ce49..000000000 --- a/kernel/x86_64/dgemm_small_kernel_b0_nt_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./dgemm_small_kernel_nt_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c deleted file mode 100644 index 1dfa0aaf1..000000000 --- a/kernel/x86_64/dgemm_small_kernel_b0_tn_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./dgemm_small_kernel_tn_skylakex.c" diff --git a/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c deleted file mode 100644 index 93fab1836..000000000 --- a/kernel/x86_64/dgemm_small_kernel_b0_tt_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./dgemm_small_kernel_tt_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c deleted file mode 100644 index 704e964b8..000000000 --- a/kernel/x86_64/sgemm_small_kernel_b0_nn_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sgemm_small_kernel_nn_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c deleted file mode 100644 index 6d7934be1..000000000 --- a/kernel/x86_64/sgemm_small_kernel_b0_nt_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sgemm_small_kernel_nt_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c deleted file mode 100644 index 0f9745b72..000000000 --- a/kernel/x86_64/sgemm_small_kernel_b0_tn_skylakex.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sgemm_small_kernel_tn_skylakex.c" diff --git a/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c deleted file mode 100644 index 27d9e0afd..000000000 --- a/kernel/x86_64/sgemm_small_kernel_b0_tt_skylakex.c +++ /dev/null @@ -1,3 +0,0 @@ -#define B0 1 -#define TT 1 -#include "./sgemm_small_kernel_tt_skylakex.c" From 13d411677f4b0a617142b3fd4c15d7be4c442477 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 15 Aug 2021 00:17:23 +0200 Subject: [PATCH 400/681] Add more OSX build jobs to Azure CI (#3338) * Add OSX build job with Homebrew OpenMP in a CMAKE build * Check install step on OSX/gcc to make sure all include files are generated and installed as intended * Add mixed clang/gfortran build with cmake on OSX * move IOS ARMV7/ARMV8 crossbuilds from travis to azure --- azure-pipelines.yml | 56 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 889b920e3..b1bded639 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -83,6 +83,8 @@ jobs: - script: | brew update make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 PREFIX=../blasinst install + ls -lR ../blasinst - job:
OSX_GCC_Nothreads pool: @@ -104,6 +106,38 @@ jobs: brew install llvm libomp make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10 +- job: OSX_OpenMP_Clang_cmake + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + LIBRARY_PATH: /usr/local/opt/llvm/lib + steps: + - script: | + brew update + brew install llvm libomp + mkdir build + cd build + cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNOFORTRAN=1 -DNO_AVX512=1 .. + make + ctest + +- job: OSX_OpenMP_Clang_gf_cmake + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + LIBRARY_PATH: /usr/local/opt/llvm/lib + steps: + - script: | + brew update + brew install llvm libomp + mkdir build + cd build + cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNO_AVX512=1 .. + make + ctest + - job: OSX_Ifort_Clang pool: vmImage: 'macOS-10.15' @@ -146,7 +180,27 @@ jobs: brew install --cask android-ndk export ANDROID_NDK_HOME=/usr/local/share/android-ndk make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 - + +- job: OSX_IOS_ARMV8 + pool: + vmImage: 'macOS-10.15' + variables: + CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0 + steps: + - script: | + make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 + +- job: OSX_IOS_ARMV7 + pool: + vmImage: 'macOS-10.15' + variables: + CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1 + steps: + - script: | + make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 + - job: ALPINE_MUSL pool: vmImage: 'ubuntu-latest' From cdb5d2737e92d17c600903bf97ac32d1659ce324 Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Mon, 16 Aug 2021 11:22:51 +0100 Subject: [PATCH 401/681] add support for building on windows/arm64 target --- common_arm64.h | 2 +- ctest.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common_arm64.h b/common_arm64.h index 2270ffba7..029e23886 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -120,7 +120,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ .text ; .p2align 2 ; .global REALNAME ; -#ifndef __APPLE__ +#if !defined(__APPLE__) && !defined(_WIN32) .type REALNAME, %function ; #endif REALNAME: diff --git a/ctest.c b/ctest.c index 4f18918f5..2afd93f68 100644 --- a/ctest.c +++ b/ctest.c @@ -84,7 +84,7 @@ OS_AIX OS_OSF #endif -#if defined(__WIN32) || defined(__WIN64) || defined(__WINNT) +#if defined(__WIN32) || defined(__WIN64) || defined(_WIN32) || defined(_WIN64) || defined(__WINNT) OS_WINNT #endif @@ -141,7 +141,7 @@ ARCH_SPARC ARCH_IA64 #endif -#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) +#if 
defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) || defined(__aarch64__) BINARY_64 #endif From c6c2a71fb7c4ea36558c911f964557b7ac3a35c8 Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Mon, 16 Aug 2021 11:25:07 +0100 Subject: [PATCH 402/681] Fix ctest.h to build using clang on windows --- utest/ctest.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/utest/ctest.h b/utest/ctest.h index 037f7f28d..79961badf 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -65,9 +65,14 @@ struct ctest { #undef CTEST_SEGFAULT #endif -#if defined(_WIN32) && defined(_MSC_VER) +#if defined(_WIN32) +#if defined(__clang__) +#define __CTEST_NO_TIME +#undef CTEST_SEGFAULT +#elif defined(_MSC_VER) #define __CTEST_MSVC #endif +#endif //config for MSVC compiler #ifdef __CTEST_MSVC @@ -286,7 +291,7 @@ void assert_dbl_far(double exp, double real, double tol, const char* caller, int #endif #include -#ifdef __CTEST_MSVC +#ifdef _WIN32 #include <windows.h> #else #include <sys/time.h>
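The ctest.h hunks above split "building for Windows" from "building with MSVC": previously only the _WIN32-plus-_MSC_VER combination was special-cased, so clang targeting Windows (which defines _WIN32 and __clang__, but normally not _MSC_VER) fell through to POSIX-only timing and signal code. The patch gives clang its own branch (__CTEST_NO_TIME, no SEGFAULT handler) and keeps __CTEST_MSVC for MSVC proper. The detection ladder is easy to exercise in a standalone program; TOOLCHAIN is just a label invented for this demo:

    #include <stdio.h>

    #if defined(_WIN32)
    #  if defined(__clang__)
    #    define TOOLCHAIN "clang targeting Windows"
    #  elif defined(_MSC_VER)
    #    define TOOLCHAIN "MSVC"
    #  else
    #    define TOOLCHAIN "another Windows compiler (e.g. MinGW gcc)"
    #  endif
    #else
    #  define TOOLCHAIN "a non-Windows toolchain"
    #endif

    int main(void) {
            printf("compiled with %s\n", TOOLCHAIN);
            return 0;
    }

Checking __clang__ before _MSC_VER also routes clang-cl, which defines both macros, into the clang branch.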
From e9acb464318618009d13ddcc7e30dc300e878052 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 25 Aug 2021 07:07:27 +0000 Subject: [PATCH 403/681] sgemv: skylakex: bug fix for sgemv_t kernel in corner case --- kernel/x86_64/sgemv_t_4.c | 2 +- .../x86_64/sgemv_t_microk_skylakex_template.c | 23 ++++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 76236cd16..a36c8ace9 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_haswell-4.c" #elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "sgemv_t_microk_haswell-4.c" -/*#include "sgemv_t_microk_skylakex.c"*/ +#include "sgemv_t_microk_skylakex.c" #endif #if defined(STEAMROLLER) || defined(EXCAVATOR) diff --git a/kernel/x86_64/sgemv_t_microk_skylakex_template.c b/kernel/x86_64/sgemv_t_microk_skylakex_template.c index 34415054c..423413465 100644 --- a/kernel/x86_64/sgemv_t_microk_skylakex_template.c +++ b/kernel/x86_64/sgemv_t_microk_skylakex_template.c @@ -93,7 +93,7 @@ static int sgemv_kernel_t_1(BLASLONG m, float alpha, float *a, float *x, float * } if (tag_m_32x != m) { - for (BLASLONG idx_m = tag_m_64x; idx_m < tag_m_16x; idx_m+=32) { + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); @@ -145,8 +145,8 @@ static int sgemv_kernel_t_2(BLASLONG m, float alpha, float *a, float *x, float * } if (tag_m_32x != m) { for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { - m0 = _mm512_loadu_ps(&a[idx_m]); - m1 = _mm512_loadu_ps(&a[idx_m + 16]); + m0 = _mm512_loadu_ps(&a[idx_m*2]); + m1 = _mm512_loadu_ps(&a[idx_m*2 + 16]); col1_1 = _mm512_permutex2var_ps(m0, idx_base_0, m1); col1_2 = _mm512_permutex2var_ps(m0, idx_base_1, m1); _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col1_2, _mm512_mul_ps(col1_1, x1Array)), _mm512_loadu_ps(&y[idx_m]))); @@ -157,7 +157,7 @@ static int sgemv_kernel_t_2(BLASLONG m, float alpha, float *a, float *x, float * __mmask8 load_mask = *((__mmask8*) &load_mask_value); x1Array = _mm512_broadcast_f32x2(_mm_maskz_loadu_ps(load_mask, x)); for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { - m0 = _mm512_loadu_ps(&a[idx_m]); + m0 = _mm512_loadu_ps(&a[idx_m*2]); m1 = _mm512_mul_ps(_mm512_mul_ps(m0, x1Array), ALPHAVECTOR); m2 = _mm512_permutexvar_ps(_mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), m1); __m256 ret = _mm256_add_ps(_mm512_extractf32x8_ps(m2, 1), _mm512_extractf32x8_ps(m2, 0)); @@ -171,7 +171,7 @@ static int sgemv_kernel_t_2(BLASLONG m, float alpha, float *a, float *x, float * unsigned char y_mask_value = (((unsigned char)0xff) >> (8-(m-tag_m_8x))); __mmask8 y_mask = *((__mmask8*) &y_mask_value); - m0 = _mm512_maskz_loadu_ps(a_mask, &a[tag_m_8x]); + m0 = _mm512_maskz_loadu_ps(a_mask, &a[tag_m_8x*2]); m1 = _mm512_mul_ps(_mm512_mul_ps(m0, x1Array), ALPHAVECTOR); m2 = _mm512_permutexvar_ps(_mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), m1); __m256 ret = _mm256_add_ps(_mm512_extractf32x8_ps(m2, 1), _mm512_extractf32x8_ps(m2, 0)); @@ -346,7 +346,7 @@ static int sgemv_kernel_t_4(BLASLONG m, float alpha, float *a, float *x, float * c3 = _mm256_extractf32x4_ps(c256_2, 0); c4 = _mm256_extractf32x4_ps(c256_2, 1); - ret = _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, c1, c2), _mm_maskz_add_ps(0xff, c3, c4)), _mm_maskz_loadu_ps(0xff, y)); + ret = _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, c1, c2), _mm_maskz_add_ps(0xff, c3, c4)), _mm_maskz_loadu_ps(0xff, &y[idx_m])); _mm_mask_storeu_ps(&y[idx_m], 0xff, ret); } @@ -958,6 +958,7 @@ static int sgemv_kernel_t_7(BLASLONG m, float alpha, float *a, float *x, float * c256_1 = _mm512_extractf32x8_ps(tmp0, 1); c256_0 = _mm256_add_ps(c256_0, c256_1); + c256_0 = _mm256_mul_ps(c256_0, alpha256); __m128 c128_0 = _mm256_extractf32x4_ps(c256_0, 0); __m128 c128_1 = _mm256_extractf32x4_ps(c256_0, 1); @@ -1016,9 +1017,10 @@ static int sgemv_kernel_t_8(BLASLONG m, float alpha, float *a, float *x, float * __m512 m0, m1, m2, m3; __m256 r0, r1, r2, r3, r4, r5, r6, r7, tmp0, tmp1, tmp2, tmp3; __m128 c128_0, c128_1, c128_2, c128_3; - __m128 alpha128 = _mm_set1_ps(alpha); + __m256 alpha256 = _mm256_set1_ps(alpha); __m256 x256 = _mm256_loadu_ps(x); + x256 = _mm256_mul_ps(x256, alpha256); __m512 x512 = _mm512_broadcast_f32x8(x256); for(BLASLONG idx_m=0; idx_m<tag_m_8x; idx_m+=8) { From: Wangyang Guo Date: Wed, 25 Aug 2021 07:13:00 +0000 Subject: [PATCH 404/681] sgemv: skylakex: fix build warning --- kernel/x86_64/sgemv_n_4.c | 3 --- kernel/x86_64/sgemv_t_microk_skylakex_template.c | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 06de28d97..90865c4b3 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -302,9 +302,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT * xbuffer_align = x; FLOAT * ybuffer_align = y; - FLOAT * xbuffer = NULL; - FLOAT * ybuffer = NULL; - if (inc_x != 1) { xbuffer_align = buffer; for(BLASLONG i=0; i<n; i++) { diff --git a/kernel/x86_64/sgemv_t_microk_skylakex_template.c b/kernel/x86_64/sgemv_t_microk_skylakex_template.c --- a/kernel/x86_64/sgemv_t_microk_skylakex_template.c +++ b/kernel/x86_64/sgemv_t_microk_skylakex_template.c - unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((m-tag_m_8x)*2)&15)); + unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-(((m-tag_m_8x)*2)&15))); __mmask16 a_mask = *((__mmask16*) &tail_mask_value); unsigned char y_mask_value = (((unsigned char)0xff) >> (8-(m-tag_m_8x))); __mmask8 y_mask = *((__mmask8*) &y_mask_value); @@ -322,7 +322,7 @@ static int sgemv_kernel_t_4(BLASLONG m, float alpha, float *a, float *x, float * { BLASLONG tag_m_4x = m & (~3); BLASLONG tag_m_2x = m & (~1); - __m512 m0, m1, m2; + __m512 m0, m1; __m256 m256_0, m256_1, c256_1, c256_2; __m128 c1, c2, c3, c4, ret; __m128 xarray = _mm_maskz_loadu_ps(0x0f, x);
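The tail_mask change in the last hunk deserves a note: in C, binary '-' binds tighter than '&', so the old expression 16-((m-tag_m_8x)*2)&15 groups as (16-((m-tag_m_8x)*2)) & 15 rather than the visually intended 16-(((m-tag_m_8x)*2)&15). For the tail sizes this kernel can actually see (2 to 14 lanes) the two groupings happen to yield the same shift count, which is why this surfaced as a compiler warning rather than a wrong mask, but the pattern is a classic trap. A standalone illustration with a value where the groupings disagree:

    #include <stdio.h>

    int main(void) {
            int x = 16;
            printf("%d\n", 16 - x & 15);    /* parsed as (16 - x) & 15 -> 0  */
            printf("%d\n", 16 - (x & 15));  /* explicit grouping       -> 16 */
            return 0;
    }

The added parentheses make the intended grouping explicit, and widening the constant from unsigned short to unsigned int keeps the shifted operand's type unambiguous after integer promotion.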
From 7d1becc575d436039f1484259a10413aade9cda9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 28 Aug 2021 14:18:36 +0200 Subject: [PATCH 405/681] Allocate an auxiliary struct when running out of preconfigured threads --- driver/others/memory.c | 145 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 142 insertions(+), 3 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 460a3d557..377e073ee 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2636,8 +2636,25 @@ } memory[NUM_BUFFERS]; -static int memory_initialized = 0; +static volatile struct newmemstruct +{ + BLASULONG lock; + void *addr; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + int pos; +#endif + int used; +#ifndef __64BIT__ + char dummy[48]; +#else + char dummy[40]; +#endif +}; +static volatile struct newmemstruct *newmemory; + +static int memory_initialized = 0; +static int memory_overflowed = 0; /* Memory allocation routine */ /* procpos ... indicates where it comes from */ /* 0 : Level 3 functions */ @@ -2779,6 +2796,29 @@ void *blas_memory_alloc(int procpos){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif + if (memory_overflowed) { +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif + do { + RMB; +#if defined(USE_OPENMP) + if (!newmemory[position-NUM_BUFFERS].used) { + blas_lock(&newmemory[position-NUM_BUFFERS].lock); +#endif + if (!newmemory[position-NUM_BUFFERS].used) goto allocation2; + +#if defined(USE_OPENMP) + blas_unlock(&newmemory[position-NUM_BUFFERS].lock); + } +#endif + position ++; + + } while (position < 512+NUM_BUFFERS); +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif +} goto error; allocation : @@ -2883,6 +2923,90 @@ void *blas_memory_alloc(int procpos){ return (void *)memory[position].addr; error: + if (memory_overflowed) goto terminate; + printf("num_buffers exceeded, adding auxiliary array\n"); + memory_overflowed=1; + newmemory= (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); + for (int i=0;i<512;i++) { + newmemory[i].addr = (void *)0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + newmemory[i].pos = -1; +#endif + newmemory[i].used = 0; + newmemory[i].lock = 0; +} + newmemory[position-NUM_BUFFERS].used = 1; + +allocation2: + newmemory[position-NUM_BUFFERS].used = 1; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#else + blas_unlock(&newmemory[position-NUM_BUFFERS].lock); +#endif + do { +#ifdef DEBUG + printf("Allocation Start : %lx\n", base_address); +#endif + + map_address = (void *)-1; + + func = &memoryalloc[0]; + + while ((func != NULL) && (map_address == (void *) -1)) { + + map_address = (*func)((void *)base_address); + +#ifdef ALLOC_DEVICEDRIVER + if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); + } +#endif + +#ifdef ALLOC_HUGETLBFILE + if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { +#ifndef OS_WINDOWS + fprintf(stderr, "OpenBLAS Warning ...
HugeTLB(File) allocation was failed.\n"); +#endif + } +#endif + +#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) + if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; +#endif + + func ++; + } + +#ifdef DEBUG + printf(" Success -> %08lx\n", map_address); +#endif + if (((BLASLONG) map_address) == -1) base_address = 0UL; + + if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; + + } while ((BLASLONG)map_address == -1); + +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif + newmemory[position-NUM_BUFFERS].addr = map_address; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif + +//#ifdef DEBUG + printf(" Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position); +//#endif + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + + if (newmemory[position-NUM_BUFFERS].pos == -1) newmemory[position-NUM_BUFFERS].pos = mypos; + +#endif + return (void *)newmemory[position-NUM_BUFFERS].addr; + +terminate: printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); @@ -2907,13 +3031,28 @@ void blas_memory_free(void *free_area){ while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) position++; - if (position >= NUM_BUFFERS) goto error; + if (position >= NUM_BUFFERS && !memory_overflowed) goto error; #ifdef DEBUG if (memory[position].addr != free_area) goto error; printf(" Position : %d\n", position); #endif + if (memory_overflowed) { + while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area)) + position++; + // arm: ensure all writes are finished before other thread takes this memory + WMB; + newmemory[position].used = 0; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif + +//#ifdef DEBUG + printf("Unmap from overflow area succeeded.\n\n"); +//#endif + return; +} else { // arm: ensure all writes are finished before other thread takes this memory WMB; @@ -2927,7 +3066,7 @@ void blas_memory_free(void *free_area){ #endif return; - +} error: printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
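Patch 405 (and patch 406 below, which extends the same idea to the release bookkeeping) grafts a second, heap-allocated tier onto the fixed memory[NUM_BUFFERS] table: on first exhaustion a 512-entry auxiliary array is malloc'd once, and a slot index position >= NUM_BUFFERS is thereafter resolved against it as position - NUM_BUFFERS. A stripped-down sketch of the two-tier scheme, without the locking, memory barriers and mmap machinery of the real code (names and sizes are illustrative):

    #include <stdlib.h>

    #define FIXED 8              /* stands in for NUM_BUFFERS */
    #define OVERFLOW_SLOTS 512

    struct slot { void *addr; int used; };

    static struct slot fixed_pool[FIXED];
    static struct slot *overflow_pool;      /* allocated on first exhaustion */

    static void *grab_slot(size_t bytes) {
            for (int i = 0; i < FIXED; i++)
                    if (!fixed_pool[i].used) {
                            fixed_pool[i].used = 1;
                            if (!fixed_pool[i].addr) fixed_pool[i].addr = malloc(bytes);
                            return fixed_pool[i].addr;
                    }
            if (!overflow_pool)             /* the "auxiliary array" tier */
                    overflow_pool = calloc(OVERFLOW_SLOTS, sizeof(struct slot));
            for (int i = 0; i < OVERFLOW_SLOTS; i++)   /* i == position - FIXED */
                    if (!overflow_pool[i].used) {
                            overflow_pool[i].used = 1;
                            if (!overflow_pool[i].addr) overflow_pool[i].addr = malloc(bytes);
                            return overflow_pool[i].addr;
                    }
            return NULL;                    /* both tiers exhausted */
    }

    static void release_slot(void *addr) {  /* mirrors the two-tier free search */
            for (int i = 0; i < FIXED; i++)
                    if (fixed_pool[i].addr == addr) { fixed_pool[i].used = 0; return; }
            if (overflow_pool)
                    for (int i = 0; i < OVERFLOW_SLOTS; i++)
                            if (overflow_pool[i].addr == addr) { overflow_pool[i].used = 0; return; }
    }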
From b4b952eece8344fe5d7adf2352791ab81d0d1d8d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 28 Aug 2021 17:03:53 +0200 Subject: [PATCH 406/681] Add auxiliary tracking space for thread buffer frees too --- driver/others/memory.c | 68 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 377e073ee..d4fdfa465 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2060,6 +2060,7 @@ struct release_t { int hugetlb_allocated = 0; static struct release_t release_info[NUM_BUFFERS]; +static struct release_t *new_release_info; static int release_pos = 0; #if defined(OS_LINUX) && !defined(NO_WARMUP) @@ -2110,8 +2111,13 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; + } release_pos ++; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); @@ -2274,8 +2280,13 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; + { else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; + } release_pos ++; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); @@ -2307,8 +2318,13 @@ static void *alloc_malloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_malloc_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_malloc_free; + } release_pos ++; } @@ -2341,8 +2357,13 @@ static void *alloc_qalloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_qalloc_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_qalloc_free; + } release_pos ++; } @@ -2370,8 +2391,13 @@ static void *alloc_windows(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_windows_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_windows_free; + } release_pos ++; } @@ -2414,9 +2440,15 @@ static void *alloc_devicedirver(void *address){ fd, 0); if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func =
alloc_devicedirver_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = fd; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_devicedirver_free; + } release_pos ++; } @@ -2450,9 +2482,15 @@ static void *alloc_shm(void *address){ shmctl(shmid, IPC_RMID, 0); + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].attr = shmid; release_info[release_pos].func = alloc_shm_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = shmid; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_shm_free; + } release_pos ++; } @@ -2556,8 +2594,13 @@ static void *alloc_hugetlb(void *address){ #endif if (map_address != (void *)-1){ + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_hugetlb_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlb_free; + } release_pos ++; } @@ -2604,9 +2647,15 @@ static void *alloc_hugetlbfile(void *address){ fd, 0); if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_hugetlbfile_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = fd; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlbfile_free; + } release_pos ++; } @@ -2663,6 +2712,8 @@ static int memory_overflowed = 0; void *blas_memory_alloc(int procpos){ + int i; + int position; #if defined(WHEREAMI) && !defined(USE_OPENMP) int mypos = 0; @@ -2926,8 +2977,9 @@ void *blas_memory_alloc(int procpos){ if (memory_overflowed) goto terminate; printf("num_buffers exceeded, adding auxiliary array\n"); memory_overflowed=1; - newmemory= (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); - for (int i=0;i<512;i++) { + new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); + newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); + for (i = 0; i < 512; i++) { newmemory[i].addr = (void *)0; #if defined(WHEREAMI) && !defined(USE_OPENMP) newmemory[i].pos = -1; @@ -3101,7 +3153,10 @@ void blas_shutdown(void){ LOCK_COMMAND(&alloc_lock); for (pos = 0; pos < release_pos; pos ++) { + if (pos < NUM_BUFFERS) release_info[pos].func(&release_info[pos]); + else + new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]); } #ifdef SEEK_ADDRESS @@ -3118,6 +3173,15 @@ void blas_shutdown(void){ #endif memory[pos].lock = 0; } + if (memory_overflowed) + for (pos = 0; pos < 512; pos ++){ + newmemory[pos].addr = (void *)0; + newmemory[pos].used = 0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + newmemory[pos].pos = -1; +#endif + newmemory[pos].lock = 0; + } UNLOCK_COMMAND(&alloc_lock); From 2ba9a567aaaac875be19a76009853b2ee4597dbc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 28 Aug 2021 17:14:59 +0200 Subject: [PATCH 407/681] Fix typo --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index d4fdfa465..3825e83ae 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2283,7 +2283,7 @@ static void *alloc_mmap(void *address){ if (release_pos < 
NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; - { else { + } else { new_release_info[release_pos-NUM_BUFFERS].address = map_address; new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; } From 7fd12a5e69164b62dad7fbddf1581d941e5339fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Aug 2021 13:54:51 +0200 Subject: [PATCH 408/681] Add likely() hints for gcc --- driver/others/memory.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 3825e83ae..689aba942 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -73,6 +73,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +#ifndef likely +#ifdef __GNUC__ +#define likely(x) __builtin_expect(!!(x), 1) +#else +#define likely(x) (x) +#endif +#endif + #if defined(USE_TLS) && defined(SMP) #define COMPILE_TLS @@ -2111,7 +2119,7 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; } else { @@ -2280,7 +2288,7 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; } else { @@ -2318,7 +2326,7 @@ static void *alloc_malloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_malloc_free; } else { @@ -2357,7 +2365,7 @@ static void *alloc_qalloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_qalloc_free; } else { @@ -2391,7 +2399,7 @@ static void *alloc_windows(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_windows_free; } else { @@ -2440,7 +2448,7 @@ static void *alloc_devicedirver(void *address){ fd, 0); if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_devicedirver_free; @@ -2482,7 +2490,7 @@ static void *alloc_shm(void *address){ shmctl(shmid, IPC_RMID, 0); - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = shmid; release_info[release_pos].func = alloc_shm_free; @@ -2594,7 +2602,7 @@ static void *alloc_hugetlb(void *address){ #endif if (map_address != (void *)-1){ - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < 
NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_hugetlb_free; } else { @@ -2647,7 +2655,7 @@ static void *alloc_hugetlbfile(void *address){ fd, 0); if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_hugetlbfile_free; @@ -3153,7 +3161,7 @@ void blas_shutdown(void){ LOCK_COMMAND(&alloc_lock); for (pos = 0; pos < release_pos; pos ++) { - if (pos < NUM_BUFFERS) + if (likely(pos < NUM_BUFFERS)) release_info[pos].func(&release_info[pos]); else new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]); From 89fc5b8f4f1c56b50896773e667c3a215342e49c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Aug 2021 19:50:24 +0200 Subject: [PATCH 409/681] Fix unmap logic --- driver/others/memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 689aba942..1f66ef9e9 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -76,8 +76,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef likely #ifdef __GNUC__ #define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) #else #define likely(x) (x) +#define unlikely(x) (x) #endif #endif @@ -3097,7 +3099,7 @@ void blas_memory_free(void *free_area){ if (memory[position].addr != free_area) goto error; printf(" Position : %d\n", position); #endif - if (memory_overflowed) { + if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) { while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area)) position++; // arm: ensure all writes are finished before other thread takes this memory From 1d83ca4bca890536f1c7713a3432a9daf59d2c2c Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 12 Aug 2021 03:14:18 +0000 Subject: [PATCH 410/681] Small Matrix: support BFLOAT16 data type --- common_level3.h | 12 ++++ common_macro.h | 18 ++--- common_param.h | 13 ++++ common_sb.h | 12 ++++ interface/gemm.c | 6 +- kernel/Makefile.L3 | 75 ++++++++++++++++++++ kernel/generic/gemm_small_matrix_kernel_nn.c | 4 +- kernel/generic/gemm_small_matrix_kernel_nt.c | 4 +- kernel/generic/gemm_small_matrix_kernel_tn.c | 4 +- kernel/generic/gemm_small_matrix_kernel_tt.c | 4 +- kernel/setparam-ref.c | 5 ++ 11 files changed, 137 insertions(+), 20 deletions(-) diff --git a/common_level3.h b/common_level3.h index 187402a9a..5080ada10 100644 --- a/common_level3.h +++ b/common_level3.h @@ -516,6 +516,13 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xd #endif #ifdef SMALL_MATRIX_OPT +int sbgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + +int sbgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sbgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sbgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sbgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG 
ldb, float beta, float * C, BLASLONG ldc); + int sgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); @@ -530,6 +537,11 @@ int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLO int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int sgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); int sgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); int sgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); diff --git a/common_macro.h b/common_macro.h index aeb9a205b..cf2a3fd88 100644 --- a/common_macro.h +++ b/common_macro.h @@ -942,17 +942,17 @@ #define GEADD_K SGEADD_K -#define GEMM_SMALL_MATRIX_PERMIT SGEMM_SMALL_MATRIX_PERMIT +#define GEMM_SMALL_MATRIX_PERMIT SBGEMM_SMALL_MATRIX_PERMIT -#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN -#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT -#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN -#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_NN SBGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT SBGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN SBGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT SBGEMM_SMALL_KERNEL_TT -#define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN -#define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT -#define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN -#define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT +#define GEMM_SMALL_KERNEL_B0_NN SBGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT SBGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN SBGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT SBGEMM_SMALL_KERNEL_B0_TT #endif diff --git a/common_param.h b/common_param.h index 7e8bea4fe..31fba9059 100644 --- a/common_param.h +++ b/common_param.h @@ -145,6 +145,19 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); int (*sbneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sblaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +#ifdef SMALL_MATRIX_OPT + int (*sbgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float 
beta); + + int (*sbgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + + int (*sbgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +#endif #endif #if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) diff --git a/common_sb.h b/common_sb.h index 9976e812e..d21e7a563 100644 --- a/common_sb.h +++ b/common_sb.h @@ -24,6 +24,7 @@ #define SBGEMM_BETA sbgemm_beta #define SBGEMM_KERNEL sbgemm_kernel +#define SBGEMM_SMALL_MATRIX_PERMIT sbgemm_small_matrix_permit #else #define SBDOT_K gotoblas -> sbdot_k @@ -41,8 +42,19 @@ #define SBGEMM_BETA gotoblas -> sbgemm_beta #define SBGEMM_KERNEL gotoblas -> sbgemm_kernel +#define SBGEMM_SMALL_MATRIX_PERMIT gotoblas -> sbgemm_small_matrix_permit #endif +#define SBGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sbgemm_small_kernel_nn) +#define SBGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sbgemm_small_kernel_nt) +#define SBGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sbgemm_small_kernel_tn) +#define SBGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sbgemm_small_kernel_tt) + +#define SBGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sbgemm_small_kernel_b0_nn) +#define SBGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sbgemm_small_kernel_b0_nt) +#define SBGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sbgemm_small_kernel_b0_tn) +#define SBGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sbgemm_small_kernel_b0_tt) + #define SBGEMM_NN sbgemm_nn #define SBGEMM_CN sbgemm_tn #define SBGEMM_TN sbgemm_tn diff --git a/interface/gemm.c b/interface/gemm.c index 3497d8651..47e0ca0c3 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -105,7 +105,7 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #endif }; -#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) && !defined(BFLOAT16) +#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) #define USE_SMALL_MATRIX_OPT 1 #else #define USE_SMALL_MATRIX_OPT 0 @@ -131,8 +131,8 @@ static size_t gemm_small_kernel_b0[] = { GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, 0, 0, }; -#define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx)) -#define GEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG)) 
SMALL_KERNEL_ADDR(gemm_small_kernel, (idx)) +#define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx)) +#define GEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel, (idx)) #else static size_t zgemm_small_kernel[] = { diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index ef11e391c..404f774cc 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -450,6 +450,15 @@ endif ###### BLAS small matrix optimization ##### ifeq ($(SMALL_MATRIX_OPT), 1) +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += \ + sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) +endif + SBLASOBJS += \ sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ @@ -4424,6 +4433,72 @@ $(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL $(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +ifeq ($(BUILD_BFLOAT16), 1) +ifndef SBGEMM_SMALL_M_PERMIT +SBGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif + +ifndef SBGEMM_SMALL_K_NN +SBGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SBGEMM_SMALL_K_NT +SBGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SBGEMM_SMALL_K_TN +SBGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SBGEMM_SMALL_K_TT +SBGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +ifndef SBGEMM_SMALL_K_B0_NN +SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c +endif + +ifndef SBGEMM_SMALL_K_B0_NT +SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c +endif + +ifndef SBGEMM_SMALL_K_B0_TN +SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c +endif + +ifndef SBGEMM_SMALL_K_B0_TT +SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c +endif + +$(KDIR)sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DBFLOAT16 
-UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ +endif + ifndef CGEMM_SMALL_M_PERMIT CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c endif diff --git a/kernel/generic/gemm_small_matrix_kernel_nn.c b/kernel/generic/gemm_small_matrix_kernel_nn.c index 71700a1fa..b0638c7ea 100644 --- a/kernel/generic/gemm_small_matrix_kernel_nn.c +++ b/kernel/generic/gemm_small_matrix_kernel_nn.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_nt.c b/kernel/generic/gemm_small_matrix_kernel_nt.c index b287b3837..0a965db58 100644 --- a/kernel/generic/gemm_small_matrix_kernel_nt.c +++ b/kernel/generic/gemm_small_matrix_kernel_nt.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_tn.c b/kernel/generic/gemm_small_matrix_kernel_tn.c index c41ea7211..69ffc718c 100644 --- a/kernel/generic/gemm_small_matrix_kernel_tn.c +++ b/kernel/generic/gemm_small_matrix_kernel_tn.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_tt.c b/kernel/generic/gemm_small_matrix_kernel_tt.c index 734510c67..9d68de3f9 100644 --- a/kernel/generic/gemm_small_matrix_kernel_tt.c +++ b/kernel/generic/gemm_small_matrix_kernel_tt.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index f303d0dc6..19b7b5f0b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -112,6 +112,11 @@ gotoblas_t TABLE_NAME = { #else NULL,NULL, #endif +#ifdef SMALL_MATRIX_OPT + sbgemm_small_matrix_permitTS, + sbgemm_small_kernel_nnTS, sbgemm_small_kernel_ntTS, sbgemm_small_kernel_tnTS, sbgemm_small_kernel_ttTS, + sbgemm_small_kernel_b0_nnTS, sbgemm_small_kernel_b0_ntTS, sbgemm_small_kernel_b0_tnTS, sbgemm_small_kernel_b0_ttTS, +#endif #endif #if ( BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) From 7d27b182fc6cb2d1b8fc7967c40dd89727fcf875 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 12 Aug 2021 06:10:51 +0000 Subject: [PATCH 411/681] sbgemm: cooperlake: enable SBGEMM by small matrix path --- kernel/x86_64/KERNEL.COOPERLAKE | 10 +++ .../x86_64/sbgemm_block_microk_cooperlake.c | 19 +--- .../sbgemm_microk_cooperlake_template.c | 5 +- .../sbgemm_small_kernel_b0_nn_cooperlake.c | 2 + .../sbgemm_small_kernel_b0_nt_cooperlake.c | 2 + .../sbgemm_small_kernel_b0_tn_cooperlake.c | 2 + .../sbgemm_small_kernel_b0_tt_cooperlake.c | 2 + .../sbgemm_small_kernel_nn_cooperlake.c | 2 + .../sbgemm_small_kernel_nt_cooperlake.c | 2 + .../sbgemm_small_kernel_permit_cooperlake.c | 42 +++++++++ .../sbgemm_small_kernel_template_cooperlake.c | 89 +++++++++++++++++++ .../sbgemm_small_kernel_tn_cooperlake.c | 2 + .../sbgemm_small_kernel_tt_cooperlake.c | 2 + 13 files changed, 162 insertions(+), 19 deletions(-) create mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c 
From 7d27b182fc6cb2d1b8fc7967c40dd89727fcf875 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 12 Aug 2021 06:10:51 +0000 Subject: [PATCH 411/681] sbgemm: cooperlake: enable SBGEMM by small matrix path --- kernel/x86_64/KERNEL.COOPERLAKE | 10 +++ .../x86_64/sbgemm_block_microk_cooperlake.c | 19 +--- .../sbgemm_microk_cooperlake_template.c | 5 +- .../sbgemm_small_kernel_b0_nn_cooperlake.c | 2 + .../sbgemm_small_kernel_b0_nt_cooperlake.c | 2 + .../sbgemm_small_kernel_b0_tn_cooperlake.c | 2 + .../sbgemm_small_kernel_b0_tt_cooperlake.c | 2 + .../sbgemm_small_kernel_nn_cooperlake.c | 2 + .../sbgemm_small_kernel_nt_cooperlake.c | 2 + .../sbgemm_small_kernel_permit_cooperlake.c | 42 +++++++++ .../sbgemm_small_kernel_template_cooperlake.c | 89 +++++++++++++++++++ .../sbgemm_small_kernel_tn_cooperlake.c | 2 + .../sbgemm_small_kernel_tt_cooperlake.c | 2 + 13 files changed, 162 insertions(+), 19 deletions(-) create mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE index 0b2f3c0ed..151c02d5a 100644 --- a/kernel/x86_64/KERNEL.COOPERLAKE +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -1 +1,11 @@ include $(KERNELDIR)/KERNEL.SKYLAKEX + +SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c +SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c +SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_b0_nn_cooperlake.c +SBGEMM_SMALL_K_NT = sbgemm_small_kernel_nt_cooperlake.c +SBGEMM_SMALL_K_B0_NT = sbgemm_small_kernel_b0_nt_cooperlake.c +SBGEMM_SMALL_K_TN = sbgemm_small_kernel_tn_cooperlake.c +SBGEMM_SMALL_K_B0_TN = sbgemm_small_kernel_b0_tn_cooperlake.c +SBGEMM_SMALL_K_TT = sbgemm_small_kernel_tt_cooperlake.c +SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_b0_tt_cooperlake.c diff --git a/kernel/x86_64/sbgemm_block_microk_cooperlake.c b/kernel/x86_64/sbgemm_block_microk_cooperlake.c index 147c5ebdd..2c27221ac 100644 --- a/kernel/x86_64/sbgemm_block_microk_cooperlake.c +++ b/kernel/x86_64/sbgemm_block_microk_cooperlake.c @@ -1,6 +1,5 @@ -//#include "sbgemm.h" - #include <immintrin.h> + // Walk around those intrinsics that missed by compiler #define MM256_LOADU_EPI16(addr) \ _mm256_maskz_loadu_epi16(~0, (addr)) @@ -1747,7 +1746,7 @@ void COL_MAJOR_OTCOPY_KERNEL_Kx8m(BLASLONG k, BLASLONG n, bfloat16 * B, BLASLONG } // Scale matrix C when beta is not ZERO or ONE -void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc) +void sbgemm_scal_operation(BLASLONG M, BLASLONG N, float beta, float *C, BLASLONG ldc) { float * C_addr0 = C; float * C_addr1 = C + ldc; @@ -1759,12 +1758,6 @@ void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST __m512 array_512_0, array_512_1, array_512_2, array_512_3; __m512 BETAVECTOR = _mm512_set1_ps(beta); - if (Order == CblasRowMajor) { - blasint tmp = M; - M = N; - N = tmp; - } - BLASLONG tag_n_Nx = N & (~3); BLASLONG tag_n_Mx = M & (~15); unsigned short tail_mask = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); @@ -1828,7 +1821,7 @@ void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST } // Zero C matrix when Beta is 0 -void sbgemm_zero_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, float *C, OPENBLAS_CONST blasint ldc) +void sbgemm_zero_operation(BLASLONG M, BLASLONG N, float *C, BLASLONG ldc) { float * C_addr0 = C; float * C_addr1 = C + ldc; @@ -1839,12 +1832,6 @@ void sbgemm_zero_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST __m512 ZEROVECTOR = _mm512_setzero_ps(); - if (Order == CblasRowMajor) { - blasint tmp = M; - M = N; - N = tmp; - } - BLASLONG tag_n_Nx = N & (~3); BLASLONG tag_n_Mx = M & (~15); unsigned short tail_mask = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); diff --git a/kernel/x86_64/sbgemm_microk_cooperlake_template.c b/kernel/x86_64/sbgemm_microk_cooperlake_template.c index c71595813..b8ed9838e 100644 --- a/kernel/x86_64/sbgemm_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemm_microk_cooperlake_template.c @@ -1,8 +1,6 @@ -#include "sbgemm.h" #include "bf16_common_macros.h" #include <immintrin.h> -/* These macros are needed and should be
placed at the right place #define BF16_BLOCK_STEP_N 8 #define BF16_BLOCK_THRES_K 1024 #define BF16_BLOCK_THRES_M 32 @@ -14,7 +12,6 @@ #define ONE 1.e0f #define ZERO 0.e0f -*/ #undef STORE16_COMPLETE_RESULT #undef STORE16_MASK_COMPLETE_RESULT @@ -1798,6 +1795,7 @@ void sbgemm_blocking_kernel_tt_one(blasint M, blasint N, blasint K, float alpha, } /* ----------------------------------------- End of TT kernels --------------------------------------- */ +/* #ifndef ONE_ALPHA // ALPHA is not ONE void sbgemm_internal_kernel_alpha(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) @@ -1836,3 +1834,4 @@ void sbgemm_internal_kernel_one(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_ } } } +*/ diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c new file mode 100644 index 000000000..373457f84 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sbgemm_small_kernel_nn_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c new file mode 100644 index 000000000..0b840c248 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sbgemm_small_kernel_nt_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c new file mode 100644 index 000000000..67542b69c --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sbgemm_small_kernel_tn_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c new file mode 100644 index 000000000..17b5b41c5 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c @@ -0,0 +1,2 @@ +#define B0 1 +#include "./sbgemm_small_kernel_tt_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c new file mode 100644 index 000000000..ec40a5054 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_NN +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c new file mode 100644 index 000000000..1cdfd2936 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_NT +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c new file mode 100644 index 000000000..823aafbdd --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include "sbgemm_block_microk_cooperlake.c" +// Define micro kernels for ALPHA not ONE scenarios +#undef ONE_ALPHA +#include "sbgemm_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA as ONE scenarios +#define ONE_ALPHA 1 +#include "sbgemm_microk_cooperlake_template.c" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + return 1; +} diff --git a/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c new file mode 100644 index 000000000..d328b0981 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c @@ -0,0 +1,89 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +extern void sbgemm_scal_operation(BLASLONG M, BLASLONG N, float beta, float *C, BLASLONG ldc); +extern void sbgemm_zero_operation(BLASLONG M, BLASLONG N, float *C, BLASLONG ldc); + +extern void sbgemm_blocking_kernel_nn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_nn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_nt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_nt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); + +#if defined(TRANS_NN) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_nn_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_nn_alpha +#elif defined(TRANS_NT) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_nt_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_nt_alpha +#elif defined(TRANS_TN) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_tn_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_tn_alpha +#elif defined(TRANS_TT) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_tt_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_tt_alpha +#endif + +#define BF16_BLOCK_THRES_K 1024 +// If we want to adjust this to be bigger, need to change COL_MAJOR_INCOPY_KERNEL_Kx32 kernel to be bigger also +#define BF16_BLOCK_THRES_M 32 +#define BF16_BLOCK_THRES_N 1024 + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, 
BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + bfloat16 * block_A; + bfloat16 * block_B; + + block_A = (bfloat16 *) malloc(sizeof(bfloat16) * BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M); + block_B = (bfloat16 *) malloc(sizeof(bfloat16) * BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K); + +#if defined(B0) + sbgemm_zero_operation(M, N, C, ldc); +#else + sbgemm_scal_operation(M, N, beta, C, ldc); +#endif + + if (alpha == ONE) { + SBGEMM_BLOCKING_KERNEL_ONE(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else { + SBGEMM_BLOCKING_KERNEL_ALPHA(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } + free(block_A); + free(block_B); + return 0; +} diff --git a/kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c new file mode 100644 index 000000000..f1a0d0d0c --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_TN +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c new file mode 100644 index 000000000..8a2a597bc --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_TT +#include "sbgemm_small_kernel_template_cooperlake.c" From 2e44ca0136da2829e1c2e65e2cdd4a8d540491a8 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 13 Aug 2021 00:51:24 +0800 Subject: [PATCH 412/681] sbgemm: add missing cblas_sbgemm definition --- cblas.h | 2 ++ interface/gemm.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cblas.h b/cblas.h index f0220eb99..a5ad25ad7 100644 --- a/cblas.h +++ b/cblas.h @@ -400,6 +400,8 @@ void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); +void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/interface/gemm.c b/interface/gemm.c index 47e0ca0c3..6dcc54041 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -273,8 +273,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS blasint m, blasint n, blasint k, #ifndef COMPLEX FLOAT alpha, - FLOAT *a, blasint lda, - FLOAT *b, blasint ldb, + IFLOAT *a, blasint lda, + IFLOAT *b, blasint ldb, FLOAT beta, FLOAT *c, blasint ldc) { #else From f39301935c27e34acbf95757e644ba6e3ce95cef Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 13 Aug 2021 18:43:41 +0800 Subject: [PATCH 413/681] sbgemm: cooperlake: make sure hot buffer aligned to 64 --- .../sbgemm_small_kernel_template_cooperlake.c | 15 +++++++++++---- 
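A note on the two-line TRANS_* wrapper files and the double inclusion in the permit file above: this is the one-source-many-kernels idiom, where the same template is compiled repeatedly under different macro settings so that each transpose/alpha/beta combination becomes its own specialized function. A minimal, self-contained sketch of the idiom — scal_template.c, scal_alpha and scal_one are illustrative names, not OpenBLAS symbols:

/* scal_template.c -- shared body, never compiled on its own (illustrative) */
#ifdef ONE_ALPHA
#define SCAL_NAME scal_one
#else
#define SCAL_NAME scal_alpha
#endif

static void SCAL_NAME(int n, float alpha, float *x)
{
#ifndef ONE_ALPHA
    for (int i = 0; i < n; i++)
        x[i] *= alpha;
#else
    (void)n; (void)alpha; (void)x;   /* alpha == 1.0f: nothing to do */
#endif
}
#undef SCAL_NAME

/* driver.c -- stamps out both variants, exactly as the permit file does */
#undef ONE_ALPHA
#include "scal_template.c"
#define ONE_ALPHA 1
#include "scal_template.c"

int main(void)
{
    float x[2] = { 1.0f, 2.0f };
    scal_alpha(2, 3.0f, x);          /* x -> {3, 6} */
    scal_one(2, 3.0f, x);            /* compiled no-op */
    return (x[1] == 6.0f) ? 0 : 1;   /* exit 0 on success */
}

Split into the two files named in the comments, this compiles as-is; the TRANS_NN/.../TRANS_TT defines work the same way, selecting SBGEMM_BLOCKING_KERNEL_ONE or _ALPHA at preprocessing time instead of at run time.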
1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c index d328b0981..1ab7a34ab 100644 --- a/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c +++ b/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c @@ -59,6 +59,10 @@ extern void sbgemm_blocking_kernel_tt_one(blasint M, blasint N, blasint K, float #define BF16_BLOCK_THRES_M 32 #define BF16_BLOCK_THRES_N 1024 +#define MALLOC_ALIGN64(ptr, size, raw_ptr) \ + raw_ptr = malloc((size) + 63); \ + ptr = (bfloat16 *)(((uintptr_t) raw_ptr + 63) & ~(uintptr_t)63) + #if defined(B0) int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) @@ -68,9 +72,11 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT al { bfloat16 * block_A; bfloat16 * block_B; + void* raw_ptrA; + void* raw_ptrB; - block_A = (bfloat16 *) malloc(sizeof(bfloat16) * BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M); - block_B = (bfloat16 *) malloc(sizeof(bfloat16) * BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K); + MALLOC_ALIGN64(block_A, sizeof(bfloat16) * BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M, raw_ptrA); + MALLOC_ALIGN64(block_B, sizeof(bfloat16) * BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K, raw_ptrB); #if defined(B0) sbgemm_zero_operation(M, N, C, ldc); @@ -83,7 +89,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT al } else { SBGEMM_BLOCKING_KERNEL_ALPHA(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); } - free(block_A); - free(block_B); + + free(raw_ptrA); + free(raw_ptrB); return 0; } From 619588fbabaa0ee470487b9afd063541e95c486b Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Mon, 30 Aug 2021 17:48:11 +0800 Subject: [PATCH 414/681] sbgemm: remove unnecessary b0 files --- kernel/Makefile.L3 | 16 ++++++++-------- kernel/generic/gemm_small_matrix_kernel_nn.c | 4 ++-- kernel/generic/gemm_small_matrix_kernel_nt.c | 4 ++-- kernel/generic/gemm_small_matrix_kernel_tn.c | 4 ++-- kernel/generic/gemm_small_matrix_kernel_tt.c | 4 ++-- kernel/x86_64/KERNEL.COOPERLAKE | 8 ++++---- .../sbgemm_small_kernel_b0_nn_cooperlake.c | 2 -- .../sbgemm_small_kernel_b0_nt_cooperlake.c | 2 -- .../sbgemm_small_kernel_b0_tn_cooperlake.c | 2 -- .../sbgemm_small_kernel_b0_tt_cooperlake.c | 2 -- 10 files changed, 20 insertions(+), 28 deletions(-) delete mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c delete mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c delete mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c delete mode 100644 kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 404f774cc..49b7c78fb 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -4471,32 +4471,32 @@ $(KDIR)sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_ $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ ifndef SBGEMM_SMALL_K_B0_NN -SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c +SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c endif ifndef SBGEMM_SMALL_K_B0_NT -SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c +SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c endif ifndef SBGEMM_SMALL_K_B0_TN -SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c +SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c endif ifndef SBGEMM_SMALL_K_B0_TT 
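The MALLOC_ALIGN64 macro added above over-allocates by 63 bytes and rounds the address up to the next multiple of 64, keeping the raw pointer so free() still receives exactly what malloc() returned. The same arithmetic as a standalone function (malloc_align64 is an illustrative name):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Over-allocate by 63 bytes, round up to a 64-byte boundary, and hand
 * back the raw pointer for the eventual free(). */
static void *malloc_align64(size_t size, void **raw)
{
    *raw = malloc(size + 63);
    if (*raw == NULL) return NULL;
    return (void *)(((uintptr_t)*raw + 63) & ~(uintptr_t)63);
}

int main(void)
{
    void *raw;
    void *p = malloc_align64(1024, &raw);
    printf("aligned: %d\n", (int)((uintptr_t)p % 64 == 0));  /* prints 1 */
    free(raw);   /* never free(p): it may not be the start of the block */
    return 0;
}

C11 aligned_alloc() or posix_memalign() would do the same job; the hand-rolled round-up presumably stays portable across every toolchain OpenBLAS supports. 64 bytes matches both a cache line and a full ZMM-register store.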
-SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c +SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c endif $(KDIR)sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NN) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ $(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TT) - $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ endif ifndef CGEMM_SMALL_M_PERMIT diff --git a/kernel/generic/gemm_small_matrix_kernel_nn.c b/kernel/generic/gemm_small_matrix_kernel_nn.c index b0638c7ea..543e7e047 100644 --- a/kernel/generic/gemm_small_matrix_kernel_nn.c +++ b/kernel/generic/gemm_small_matrix_kernel_nn.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_nt.c b/kernel/generic/gemm_small_matrix_kernel_nt.c index 0a965db58..d4a7aec6a 100644 --- a/kernel/generic/gemm_small_matrix_kernel_nt.c +++ b/kernel/generic/gemm_small_matrix_kernel_nt.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_tn.c b/kernel/generic/gemm_small_matrix_kernel_tn.c index 69ffc718c..2747337f2 100644 --- a/kernel/generic/gemm_small_matrix_kernel_tn.c +++ b/kernel/generic/gemm_small_matrix_kernel_tn.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
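The IFLOAT/FLOAT distinction driving the signature fixes in this commit: when BFLOAT16 is defined, IFLOAT is the 16-bit input type while FLOAT remains a 32-bit float, so A and B hold bfloat16 but C and the accumulator must stay float — exactly what the IFLOAT * C -> FLOAT * C changes express. A self-contained scalar sketch of that contract, with column-major indexing as in BLAS (sbgemm_nn_ref and bf16tof32 are illustrative names):

#include <stdint.h>
#include <stdio.h>

typedef uint16_t bfloat16;   /* IFLOAT when BFLOAT16 is defined */

/* bf16 -> fp32: bf16 is simply the upper 16 bits of an IEEE-754 float */
static float bf16tof32(bfloat16 v)
{
    union { uint32_t u; float f; } c = { .u = (uint32_t)v << 16 };
    return c.f;
}

/* Naive C = alpha*A*B + beta*C with column-major A (m x k), B (k x n):
 * the inputs are bf16, but C and the accumulator stay float. */
static void sbgemm_nn_ref(int m, int n, int k, float alpha,
                          const bfloat16 *A, int lda,
                          const bfloat16 *B, int ldb,
                          float beta, float *C, int ldc)
{
    for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++) {
            float acc = 0.0f;
            for (int l = 0; l < k; l++)
                acc += bf16tof32(A[i + l * lda]) * bf16tof32(B[l + j * ldb]);
            C[i + j * ldc] = alpha * acc + beta * C[i + j * ldc];
        }
}

int main(void)
{
    /* 1x1 smoke test: 1.5 * 2.0; bf16(1.5) = 0x3FC0, bf16(2.0) = 0x4000 */
    bfloat16 a = 0x3FC0, b = 0x4000;
    float c = 0.0f;
    sbgemm_nn_ref(1, 1, 1, 1.0f, &a, 1, &b, 1, 0.0f, &c, 1);
    printf("%g\n", c);   /* prints 3 */
    return 0;
}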
#include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/generic/gemm_small_matrix_kernel_tt.c b/kernel/generic/gemm_small_matrix_kernel_tt.c index 9d68de3f9..eec926bc7 100644 --- a/kernel/generic/gemm_small_matrix_kernel_tt.c +++ b/kernel/generic/gemm_small_matrix_kernel_tt.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef B0 -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) #else -int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, IFLOAT * C, BLASLONG ldc) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) #endif { //naive implemtation diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE index 151c02d5a..6272dd73d 100644 --- a/kernel/x86_64/KERNEL.COOPERLAKE +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -2,10 +2,10 @@ include $(KERNELDIR)/KERNEL.SKYLAKEX SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c -SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_b0_nn_cooperlake.c +SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_nn_cooperlake.c SBGEMM_SMALL_K_NT = sbgemm_small_kernel_nt_cooperlake.c -SBGEMM_SMALL_K_B0_NT = sbgemm_small_kernel_b0_nt_cooperlake.c +SBGEMM_SMALL_K_B0_NT = sbgemm_small_kernel_nt_cooperlake.c SBGEMM_SMALL_K_TN = sbgemm_small_kernel_tn_cooperlake.c -SBGEMM_SMALL_K_B0_TN = sbgemm_small_kernel_b0_tn_cooperlake.c +SBGEMM_SMALL_K_B0_TN = sbgemm_small_kernel_tn_cooperlake.c SBGEMM_SMALL_K_TT = sbgemm_small_kernel_tt_cooperlake.c -SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_b0_tt_cooperlake.c +SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_tt_cooperlake.c diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c deleted file mode 100644 index 373457f84..000000000 --- a/kernel/x86_64/sbgemm_small_kernel_b0_nn_cooperlake.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sbgemm_small_kernel_nn_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c deleted file mode 100644 index 0b840c248..000000000 --- a/kernel/x86_64/sbgemm_small_kernel_b0_nt_cooperlake.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sbgemm_small_kernel_nt_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c deleted file mode 100644 index 67542b69c..000000000 --- a/kernel/x86_64/sbgemm_small_kernel_b0_tn_cooperlake.c +++ /dev/null @@ -1,2 +0,0 @@ 
-#define B0 1 -#include "./sbgemm_small_kernel_tn_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c deleted file mode 100644 index 17b5b41c5..000000000 --- a/kernel/x86_64/sbgemm_small_kernel_b0_tt_cooperlake.c +++ /dev/null @@ -1,2 +0,0 @@ -#define B0 1 -#include "./sbgemm_small_kernel_tt_cooperlake.c" From 2db1a99aca0177761f47daa71b27450923eb127e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Aug 2021 14:21:25 +0200 Subject: [PATCH 415/681] Clean up debug messages --- driver/others/memory.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 1f66ef9e9..c560c4e90 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2985,7 +2985,7 @@ void *blas_memory_alloc(int procpos){ error: if (memory_overflowed) goto terminate; - printf("num_buffers exceeded, adding auxiliary array\n"); + fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n") memory_overflowed=1; new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); @@ -3057,9 +3057,9 @@ allocation2: UNLOCK_COMMAND(&alloc_lock); #endif -//#ifdef DEBUG +#ifdef DEBUG printf(" Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position); -//#endif +#endif #if defined(WHEREAMI) && !defined(USE_OPENMP) @@ -3110,9 +3110,9 @@ void blas_memory_free(void *free_area){ UNLOCK_COMMAND(&alloc_lock); #endif -//#ifdef DEBUG +#ifdef DEBUG printf("Unmap from overflow area succeeded.\n\n"); -//#endif +#endif return; } else { // arm: ensure all writes are finished before other thread takes this memory From cd10d1c03be5ecbdf8bda6e448a6cac27f8aa1be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Aug 2021 14:38:28 +0200 Subject: [PATCH 416/681] Fix typo --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index c560c4e90..48067923e 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2985,7 +2985,7 @@ void *blas_memory_alloc(int procpos){ error: if (memory_overflowed) goto terminate; - fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n") + fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n"); memory_overflowed=1; new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); From d1ed72fa87b2c1cdefed4b34682e719a9b326a8c Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Tue, 24 Aug 2021 06:09:29 +0100 Subject: [PATCH 417/681] [win/arm64]: Explicit casting for GMEMM_DEFAULT_ALIGN to create 64-bit value Win64 uses LLP64 datamodel and unsigned long is only 32-bit. For 64-bit architecture we need 64-bit mask to correctly generate address --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 634e0ef5d..5250b2f39 100644 --- a/param.h +++ b/param.h @@ -2955,7 +2955,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
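To see why the (BLASULONG) cast in the hunk below is needed: under LLP64 (Win64) unsigned long is only 32 bits, so ~0x03fffUL is a 32-bit mask that zero-extends and wipes the upper half of any 64-bit address it is ANDed with. A demonstration, using uint32_t to mimic the LLP64 width of unsigned long on an LP64 host (the address value is purely illustrative):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t addr = 0x12345678ABCDULL;        /* some 64-bit address */

    /* LLP64 (Win64): ~0x03fffUL is computed at 32-bit width */
    uint32_t mask32 = (uint32_t)~0x03fffUL;   /* 0xffffc000 */
    uint64_t bad  = addr & mask32;            /* upper 32 bits wiped out */

    /* with the explicit 64-bit cast, as in the patch */
    uint64_t mask64 = ~(uint64_t)0x03fffULL;
    uint64_t good = addr & mask64;

    printf("bad  = %#llx\n", (unsigned long long)bad);   /* 0x56788000 */
    printf("good = %#llx\n", (unsigned long long)good);  /* 0x123456788000 */
    return 0;
}

With the cast, the complement is taken at 64-bit width and only the low 14 alignment bits are cleared.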
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASULONG)0x03fffUL #define SYMV_P 16 From 7cddbf99b1dd9f99203daf9430c5d87f4eac6b56 Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Tue, 31 Aug 2021 14:36:44 +0100 Subject: [PATCH 418/681] Make explicit conversion condition on _WIN64 flag --- param.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/param.h b/param.h index 5250b2f39..07397a66e 100644 --- a/param.h +++ b/param.h @@ -2955,7 +2955,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 +#ifdef _WIN64 +/* Use explicit casting for win64 as LLP64 datamodel is used */ #define GEMM_DEFAULT_ALIGN (BLASULONG)0x03fffUL +#else +#define GEMM_DEFAULT_ALIGN 0x03fffUL +#endif #define SYMV_P 16 From f1e33059746c1fc3a4df76f524c1d4f37f9665b4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 1 Sep 2021 21:36:50 +0200 Subject: [PATCH 419/681] Add workaround for Windows10 macro name clash --- kernel/Makefile.L3 | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 49b7c78fb..2d274d33b 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -4544,7 +4544,7 @@ $(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ $(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $@ $(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ @@ -4556,7 +4556,7 @@ $(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $@ $(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ @@ -4608,7 +4608,7 @@ $(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC -DB0 $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ @@ -4620,7 +4620,7 @@ $(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR -DB0 $< -o $@ + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ $(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ @@ -4680,7 +4680,7 @@ $(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ $(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) - 
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $@ $(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ @@ -4692,7 +4692,7 @@ $(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $@ $(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ @@ -4744,7 +4744,7 @@ $(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC -DB0 $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ @@ -4756,7 +4756,7 @@ $(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) - $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR -DB0 $< -o $@ + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ $(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ From af19cda65aef4d033ae33213013c88b0a99f9da2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 4 Sep 2021 18:26:59 +0200 Subject: [PATCH 420/681] Add "recursive" option for IBM xlf compiler (#3359) * Add correct "recursive" option for xlf (from reference-lapack issue 606) --- Makefile.power | 12 ++++++++++++ cmake/fc.cmake | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Makefile.power b/Makefile.power index 946f55232..4e7478213 100644 --- a/Makefile.power +++ b/Makefile.power @@ -12,9 +12,13 @@ endif ifeq ($(CORE), POWER10) ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +ifeq ($(F_COMPILER, IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif endif +endif ifeq ($(CORE), POWER9) ifneq ($(C_COMPILER), PGI) @@ -33,7 +37,11 @@ else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O2 -frecursive -fno-fast-math +endif ifeq ($(C_COMPILER), GCC) ifneq ($(GCCVERSIONGT4), 1) $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) @@ -57,7 +65,11 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) ifeq ($(OSNAME), AIX) +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math +endif else FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math endif diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 631664569..f7aa4c5c9 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -104,7 +104,7 @@ endif () if (${F_COMPILER} 
STREQUAL "IBM") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") - # FCOMMON_OPT += -qarch=440 + set(FCOMMON_OPT "${FCOMMON_OPT} -qrecur") if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -q64") if (INTERFACE64) From 72f3ce5f084c40006e4548ec2a0de2751f5d2dd9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 5 Sep 2021 20:35:48 +0200 Subject: [PATCH 421/681] Add NO_AVX=1 fallbacks to newer generation x86_64 for completeness (#3360) * Add NO_AVX=1 fallbacks to newer generation x86_64 for completeness * Update .travis.yml --- .travis.yml | 2 +- getarch.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 78 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8657b64f4..8a3d2e5bb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,6 @@ # XXX: Precise is already deprecated, new default is Trusty. # https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming -dist: precise +dist: focal sudo: true language: c diff --git a/getarch.c b/getarch.c index 6e43616f7..3b08cbfa9 100644 --- a/getarch.c +++ b/getarch.c @@ -313,6 +313,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -322,12 +332,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" #endif +#endif #ifdef FORCE_HASWELL #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -336,6 +357,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" +#endif #else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ @@ -354,6 +376,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -363,10 +406,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" +#endif #else -#define FORCE -#define FORCE_INTEL -#define ARCHITECTURE "X86" #define SUBARCHITECTURE "SKYLAKEX" #define ARCHCONFIG "-DSKYLAKEX " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -384,6 +425,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -393,10 +455,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" +#endif #else -#define FORCE -#define FORCE_INTEL -#define ARCHITECTURE "X86" #define SUBARCHITECTURE "COOPERLAKE" #define ARCHCONFIG "-DCOOPERLAKE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -564,6 +624,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
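The nesting added to getarch.c gives each FORCE_* target a degradation chain: NO_AVX512=1 drops a Skylake-X or Cooper Lake build to the Haswell kernels, NO_AVX2=1 drops further to Sandy Bridge, and NO_AVX=1 lands on Nehalem, the newest target that needs only SSE4.2. The skeleton of the cascade, flattened here into a single #elif chain (getarch itself nests the tests per target):

#include <stdio.h>

#if defined(NO_AVX)
#define CORENAME "NEHALEM"        /* SSE4.2 only */
#elif defined(NO_AVX2)
#define CORENAME "SANDYBRIDGE"    /* AVX, no FMA3/AVX2 */
#elif defined(NO_AVX512)
#define CORENAME "HASWELL"        /* AVX2 + FMA3 */
#else
#define CORENAME "SKYLAKEX"       /* AVX-512 */
#endif

int main(void)
{
    printf("selected core: %s\n", CORENAME);
    return 0;
}

Compiling with -DNO_AVX512, -DNO_AVX2 or -DNO_AVX walks down the chain the same way the build options do.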
#define FORCE_INTEL #define ARCHITECTURE "X86" #ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -572,6 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" +#endif #else #define SUBARCHITECTURE "ZEN" #define ARCHCONFIG "-DZEN " \ From 32fee860330379774a895a18960640120506d317 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 6 Sep 2021 23:44:20 +0200 Subject: [PATCH 422/681] Correct misplaced ifdef lines --- getarch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/getarch.c b/getarch.c index 3b08cbfa9..094feaadd 100644 --- a/getarch.c +++ b/getarch.c @@ -372,10 +372,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef FORCE_SKYLAKEX -#ifdef NO_AVX512 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX512 #ifdef NO_AVX2 #ifdef NO_AVX #define SUBARCHITECTURE "NEHALEM" @@ -421,10 +421,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef FORCE_COOPERLAKE -#ifdef NO_AVX512 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX512 #ifdef NO_AVX2 #ifdef NO_AVX #define SUBARCHITECTURE "NEHALEM" From 349fb4910b7ba2069ffe8374c14b06fcf419f7c6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 7 Sep 2021 11:19:51 +0200 Subject: [PATCH 423/681] Disable the remaining x86_64 job on Travis --- .travis.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8a3d2e5bb..3dc5fe290 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,24 +7,24 @@ language: c matrix: include: - &test-ubuntu - os: linux +# os: linux compiler: gcc addons: apt: packages: - gfortran - before_script: &common-before - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - script: - - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - - make -C test $COMMON_FLAGS $BTYPE - - make -C ctest $COMMON_FLAGS $BTYPE - - make -C utest $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64" - - - <<: *test-ubuntu +# before_script: &common-before +# - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" +# script: +# - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# - make -C test $COMMON_FLAGS $BTYPE +# - make -C ctest $COMMON_FLAGS $BTYPE +# - make -C utest $COMMON_FLAGS $BTYPE +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64" +# +# - <<: *test-ubuntu os: linux-ppc64le before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" From 8c68b6f26d1030f2bb932d8b885cb8d076a84437 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 7 Sep 2021 11:40:40 +0200 Subject: [PATCH 424/681] Update .travis.yml --- .travis.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 3dc5fe290..85a57f6e3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,8 +26,13 @@ matrix: # # 
- <<: *test-ubuntu os: linux-ppc64le - before_script: + before_script: &common-before - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" + script: + - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - make -C test $COMMON_FLAGS $BTYPE + - make -C ctest $COMMON_FLAGS $BTYPE + - make -C utest $COMMON_FLAGS $BTYPE env: # for matrix annotation only - TARGET_BOX=PPC64LE_LINUX From 4c294336e6bc1b249721c0d9f0ee210d010db9f9 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 10 Aug 2021 03:23:45 +0000 Subject: [PATCH 425/681] sbgemm: cooperlake: add dummy source files --- kernel/x86_64/KERNEL.COOPERLAKE | 11 +++++++ kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c | 32 ++++++++++++++++++ kernel/x86_64/sbgemm_ncopy_32_cooperlake.c | 33 +++++++++++++++++++ kernel/x86_64/sbgemm_ncopy_8_cooperlake.c | 33 +++++++++++++++++++ kernel/x86_64/sbgemm_tcopy_32_cooperlake.c | 33 +++++++++++++++++++ kernel/x86_64/sbgemm_tcopy_8_cooperlake.c | 33 +++++++++++++++++++ kernel/x86_64/sgemm_beta_skylakex.c | 2 +- 7 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_ncopy_32_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_ncopy_8_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_tcopy_32_cooperlake.c create mode 100644 kernel/x86_64/sbgemm_tcopy_8_cooperlake.c diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE index 6272dd73d..197907261 100644 --- a/kernel/x86_64/KERNEL.COOPERLAKE +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -9,3 +9,14 @@ SBGEMM_SMALL_K_TN = sbgemm_small_kernel_tn_cooperlake.c SBGEMM_SMALL_K_B0_TN = sbgemm_small_kernel_tn_cooperlake.c SBGEMM_SMALL_K_TT = sbgemm_small_kernel_tt_cooperlake.c SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_tt_cooperlake.c + +SBGEMM_BETA = sgemm_beta_skylakex.c +SBGEMMKERNEL = sbgemm_kernel_32x8_cooperlake.c +SBGEMMINCOPY = sbgemm_ncopy_32_cooperlake.c +SBGEMMITCOPY = sbgemm_tcopy_32_cooperlake.c +SBGEMMONCOPY = sbgemm_ncopy_8_cooperlake.c +SBGEMMOTCOPY = sbgemm_tcopy_8_cooperlake.c +SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) +SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) +SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) +SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c b/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c new file mode 100644 index 000000000..ea2600067 --- /dev/null +++ b/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c @@ -0,0 +1,32 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +{ +} diff --git a/kernel/x86_64/sbgemm_ncopy_32_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_32_cooperlake.c new file mode 100644 index 000000000..afcf6f647 --- /dev/null +++ b/kernel/x86_64/sbgemm_ncopy_32_cooperlake.c @@ -0,0 +1,33 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + +} diff --git a/kernel/x86_64/sbgemm_ncopy_8_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_8_cooperlake.c new file mode 100644 index 000000000..afcf6f647 --- /dev/null +++ b/kernel/x86_64/sbgemm_ncopy_8_cooperlake.c @@ -0,0 +1,33 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + +} diff --git a/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c new file mode 100644 index 000000000..afcf6f647 --- /dev/null +++ b/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c @@ -0,0 +1,33 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
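All four copy dummies share the signature int CNAME(m, n, a, lda, b): copy an m x n panel of a (leading dimension lda) into the contiguous buffer b. A reference version of the plain transposed copy, without the bf16 pair interleave the real Cooper Lake routines add (tcopy_ref is an illustrative name):

#include <stdint.h>

typedef uint16_t bfloat16;              /* IFLOAT in BFLOAT16 builds */

/* Reference transposed copy: pack the m x n panel of a, row by row,
 * into the contiguous buffer b.  The sbgemm_tcopy_32 implemented in the
 * next patch emits the same elements but interleaves row pairs. */
int tcopy_ref(long m, long n, const bfloat16 *a, long lda, bfloat16 *b)
{
    for (long i = 0; i < m; i++)
        for (long j = 0; j < n; j++)
            *b++ = a[i * lda + j];
    return 0;
}

int main(void)
{
    bfloat16 a[4] = { 1, 2, 3, 4 }, b[4] = { 0 };
    tcopy_ref(2, 2, a, 2, b);
    return (b[0] == 1 && b[3] == 4) ? 0 : 1;   /* exit 0 on success */
}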
+*****************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + +} diff --git a/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c new file mode 100644 index 000000000..afcf6f647 --- /dev/null +++ b/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c @@ -0,0 +1,33 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + +} diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 1c29c1168..6217acf48 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -41,7 +41,7 @@ #include int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, - FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, + IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c, BLASLONG ldc){ BLASLONG i, j; From ef8f5fecc8f532081eb63ded20da650b57e78e54 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 10 Aug 2021 06:14:45 +0000 Subject: [PATCH 426/681] sbgemm: cooperlake: implement sbgemm_tcopy_32 --- kernel/x86_64/sbgemm_tcopy_32_cooperlake.c | 108 +++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c index afcf6f647..3e37473ca 100644 --- a/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c @@ -26,8 +26,116 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
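The implementation that follows is where the bf16-specific layout comes from: the AVX512-BF16 dot-product instructions consume two adjacent bf16 values per 32-bit lane, so the copy interleaves two consecutive rows element by element (a00, a10, a01, a11, ...) instead of storing them back to back — the unpacklo/unpackhi plus permutex2var sequence below is the vector form of exactly this shuffle. In scalar form (interleave_rows is an illustrative name):

#include <stdint.h>

typedef uint16_t bfloat16;

/* Scalar equivalent of the 2-row interleave performed below: rows
 * r0 = a00 a01 a02 ... and r1 = a10 a11 a12 ... are emitted as
 * a00 a10 a01 a11 ..., so each aligned bf16 pair lands in one 32-bit
 * lane, which is what the BF16 dot-product instructions expect. */
static void interleave_rows(long n, const bfloat16 *r0,
                            const bfloat16 *r1, bfloat16 *out)
{
    for (long j = 0; j < n; j++) {
        out[2 * j + 0] = r0[j];
        out[2 * j + 1] = r1[j];
    }
}

int main(void)
{
    bfloat16 r0[4] = { 1, 2, 3, 4 }, r1[4] = { 5, 6, 7, 8 }, out[8];
    interleave_rows(4, r0, r1, out);
    /* out = 1 5 2 6 3 7 4 8 */
    return (out[1] == 5 && out[2] == 2) ? 0 : 1;
}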
 *****************************************************************************/
 
 #include <stdio.h>
+#include <immintrin.h>
 #include "common.h"
 
 int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+  BLASLONG i, j;
+
+  IFLOAT *boffset;
+
+  boffset = b;
+
+  BLASLONG n32 = n & ~31;
+  BLASLONG m4 = m & ~3;
+  BLASLONG m2 = m & ~1;
+
+  /* dword-interleave patterns for permutex2var: index 0x10|x selects
+   * element x of the second source operand */
+  uint32_t permute_table[] = {
+    0, 0x10|0, 1, 0x10|1, 2, 0x10|2, 3, 0x10|3, 4, 0x10|4, 5, 0x10|5, 6, 0x10|6, 7, 0x10|7,
+    8, 0x10|8, 9, 0x10|9, 10, 0x10|10, 11, 0x10|11, 12, 0x10|12, 13, 0x10|13, 14, 0x10|14, 15, 0x10|15,
+  };
+
+  __m512i idx_lo = _mm512_loadu_si512(permute_table);
+  __m512i idx_hi = _mm512_loadu_si512(permute_table + 16);
+
+  for (j = 0; j < n32; j += 32) {
+    for (i = 0; i < m4; i += 4) {
+      /* the bf16 fma needs a special memory layout:
+       * for a source layout like
+       *   a00, a01, a02, a03, a04, a05 ....
+       *   a10, a11, a12, a13, a14, a15 ....
+       * the rows must be copied pairwise interleaved:
+       *   a00, a10, a01, a11, a02, a12, a03, a13, ...
+       */
+      __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]);
+      __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]);
+      __m512i a2 = _mm512_loadu_si512(&a[(i + 2)*lda + j]);
+      __m512i a3 = _mm512_loadu_si512(&a[(i + 3)*lda + j]);
+
+      __m512i a00 = _mm512_unpacklo_epi16(a0, a1);
+      __m512i a01 = _mm512_unpackhi_epi16(a0, a1);
+      __m512i a10 = _mm512_unpacklo_epi16(a2, a3);
+      __m512i a11 = _mm512_unpackhi_epi16(a2, a3);
+
+      a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01);
+      a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01);
+      a2 = _mm512_permutex2var_epi32(a10, idx_lo, a11);
+      a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11);
+
+      _mm512_storeu_si512(boffset, a0);
+      _mm512_storeu_si512(boffset + 32, a1);
+      _mm512_storeu_si512(boffset + 64, a2);
+      _mm512_storeu_si512(boffset + 96, a3);
+      boffset += 128;
+    }
+    for (; i < m2; i += 2) {
+      __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]);
+      __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]);
+
+      __m512i a00 = _mm512_unpacklo_epi16(a0, a1);
+      __m512i a01 = _mm512_unpackhi_epi16(a0, a1);
+
+      a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01);
+      a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01);
+
+      _mm512_storeu_si512(boffset, a0);
+      _mm512_storeu_si512(boffset + 32, a1);
+      boffset += 64;
+    }
+    for (; i < m; i++) {
+      /* only one row remains: plain copy */
+      __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]);
+      _mm512_storeu_si512(boffset, a0);
+      boffset += 32;
+    }
+  }
+  if (j < n) {
+    uint32_t remains = n - j;
+    __mmask32 r_mask = (1UL << remains) - 1;   /* load mask for the ragged columns */
+    if (remains > 16) {
+      __mmask16 w_mask = (1UL << (remains - 16)) - 1;
+      for (i = 0; i < m2; i += 2) {
+        __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
+        __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]);
+
+        __m512i a00 = _mm512_unpacklo_epi16(a0, a1);
+        __m512i a01 = _mm512_unpackhi_epi16(a0, a1);
+
+        a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01);
+        a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01);
+
+        _mm512_storeu_si512(boffset, a0);
+        _mm512_mask_storeu_epi32(boffset + 32, w_mask, a1);
+        boffset += 2 * remains;
+      }
+    } else {
+      __mmask16 w_mask = (1UL << remains) - 1;
+      for (i = 0; i < m2; i += 2) {
+        __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
+        __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]);
+
+        __m512i a00 = _mm512_unpacklo_epi16(a0, a1);
+        __m512i a01 = _mm512_unpackhi_epi16(a0, a1);
+
+        a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01);
+        _mm512_mask_storeu_epi32(boffset, w_mask, a0);
+        boffset += 2 * remains;
+      }
+    }
+    for (; i < m; i++) {
+      __m512i a0 =
_mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
+			_mm512_mask_storeu_epi16(boffset, r_mask, a0);
+			boffset += remains;
+		}
+	}
 }

From 2ec9f3a8aa67e7b36612bc8faf34397e2a968b27 Mon Sep 17 00:00:00 2001
From: Wangyang Guo
Date: Thu, 12 Aug 2021 01:46:49 +0000
Subject: [PATCH 427/681] sbgemm: cooperlake: change kernel size to 16x4

---
 kernel/x86_64/KERNEL.COOPERLAKE               |  10 +-
 kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 126 +++++++++++
 kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c |  32 ---
 ...perlake.c => sbgemm_ncopy_16_cooperlake.c} |   0
 kernel/x86_64/sbgemm_ncopy_4_cooperlake.c     | 207 ++++++++++++++++++
 ...perlake.c => sbgemm_tcopy_16_cooperlake.c} |  73 +++---
 ...operlake.c => sbgemm_tcopy_4_cooperlake.c} |   0
 kernel/x86_64/sbgemm_tcopy_8_cooperlake.c     |  33 ---
 8 files changed, 385 insertions(+), 96 deletions(-)
 create mode 100644 kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c
 delete mode 100644 kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c
 rename kernel/x86_64/{sbgemm_ncopy_32_cooperlake.c => sbgemm_ncopy_16_cooperlake.c} (100%)
 create mode 100644 kernel/x86_64/sbgemm_ncopy_4_cooperlake.c
 rename kernel/x86_64/{sbgemm_tcopy_32_cooperlake.c => sbgemm_tcopy_16_cooperlake.c} (71%)
 rename kernel/x86_64/{sbgemm_ncopy_8_cooperlake.c => sbgemm_tcopy_4_cooperlake.c} (100%)
 delete mode 100644 kernel/x86_64/sbgemm_tcopy_8_cooperlake.c

diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE
index 197907261..dba94aea8 100644
--- a/kernel/x86_64/KERNEL.COOPERLAKE
+++ b/kernel/x86_64/KERNEL.COOPERLAKE
@@ -11,11 +11,11 @@ SBGEMM_SMALL_K_TT = sbgemm_small_kernel_tt_cooperlake.c
 SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_tt_cooperlake.c
 
 SBGEMM_BETA = sgemm_beta_skylakex.c
-SBGEMMKERNEL = sbgemm_kernel_32x8_cooperlake.c
-SBGEMMINCOPY = sbgemm_ncopy_32_cooperlake.c
-SBGEMMITCOPY = sbgemm_tcopy_32_cooperlake.c
-SBGEMMONCOPY = sbgemm_ncopy_8_cooperlake.c
-SBGEMMOTCOPY = sbgemm_tcopy_8_cooperlake.c
+SBGEMMKERNEL = sbgemm_kernel_16x4_cooperlake.c
+SBGEMMINCOPY = sbgemm_ncopy_16_cooperlake.c
+SBGEMMITCOPY = sbgemm_tcopy_16_cooperlake.c
+SBGEMMONCOPY = sbgemm_ncopy_4_cooperlake.c
+SBGEMMOTCOPY = sbgemm_tcopy_4_cooperlake.c
 SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
 SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c
new file mode 100644
index 000000000..05ba015d2
--- /dev/null
+++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c
@@ -0,0 +1,126 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#define VMOVLDUP(addr, zmm) asm("vmovsldup (%1), %0": "=v"(zmm): "r"(addr)) +#define VMOVHDUP(addr, zmm) asm("vmovshdup (%1), %0": "=v"(zmm): "r"(addr)) +#define BROADCAST64(base, step, n, offset, zmm) \ + if (n == 0) asm("vbroadcastsd %2(%1), %0": "=v"(zmm): "r"(base), "n"(offset*2)); \ + else asm("vbroadcastsd %4(%1, %2, %3), %0": "=v"(zmm): "r"(base), "r"(step), "n"(n*2), "n"(offset*2)) + +#define DECLARE_A_PAIR(A) \ + __m512i A_lo_##A; __m512i A_hi_##A; + +#define LOAD_A_PAIR(A) \ + VMOVLDUP(ptr_a##A, A_lo_##A); \ + VMOVHDUP(ptr_a##A, A_hi_##A); + +#define LOAD_A_PAIR_TAIL(A) { \ + __m256i ymm = _mm256_loadu_si256(ptr_a##A); \ + __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ +} + +#define DECLARE_B_PAIR() \ + __m512i B_lo; __m512i B_hi; + +#define BROADCAST_B_PAIR(Bx, By) \ + BROADCAST64(ptr_b##Bx, n_blksize, By, 0, B_lo); \ + BROADCAST64(ptr_b##Bx, n_blksize, By, 2, B_hi); + +#define BROADCAST_B_PAIR_TAIL(Bx, By) {\ + __m128i xmm = (__m128i) _mm_load_sd(ptr_b##Bx + n_blksize * By); \ + xmm = _mm_cvtepu16_epi32(xmm); \ + B_lo = _mm512_broadcastd_epi32(xmm); \ + B_hi = _mm512_broadcastd_epi32((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ +} + +#define DECLARE_RESULT_4X(A, Bx, By) \ + __m512 result_00_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_01_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_10_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_11_##A##Bx##By = _mm512_setzero_ps(); + +#define FMA(a, b, r) r = _mm512_dpbf16_ps(r, (__m512bh)a, (__m512bh)b) + +#define MATMUL_4X(A, Bx, By) \ + FMA(A_lo_##A, B_lo, result_00_##A##Bx##By); \ + FMA(A_hi_##A, B_lo, result_01_##A##Bx##By); \ + FMA(A_lo_##A, B_hi, result_10_##A##Bx##By); \ + FMA(A_hi_##A, B_hi, result_11_##A##Bx##By); + +#define STORE_4X(A, Bx, By) + + + +int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +{ + IFLOAT *ptr_a = A, *ptr_b = B, *ptr_c = C; + IFLOAT *ptr_b0, *ptr_b1; + IFLOAT *ptr_a0, *ptr_a1; + BLASLONG n_count = n; + BLASLONG m_count, k_count; + BLASLONG n_blksize = 4 * k; + + for (; n_count > 23; n_count -= 24) { + m_count = m; + ptr_b0 = ptr_b; + ptr_b1 = ptr_b0 + n_blksize * 3; + for (; m_count > 15; m_count -= 16) { + DECLARE_A_PAIR(0); DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); 
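/* review note: MATMUL_4X expands to _mm512_dpbf16_ps (vdpbf16ps), where every
 * fp32 lane accumulates the dot product of one adjacent bf16 pair, roughly
 *   acc += (float)a.lo * (float)b.lo + (float)a.hi * (float)b.hi;
 * so a single instruction retires two k iterations. vmovsldup/vmovshdup
 * duplicate the even/odd dword elements of the packed A panel (each dword is
 * one m row's k-pair), and each 64-bit B broadcast carries two n columns,
 * which is why one MATMUL_4X feeds four C accumulators at once. */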
MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b0 += 24 * 2; + ptr_b1 += 24 * 2; + ptr_a0 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); + ptr_b0 += 24; + ptr_b1 += 24; + ptr_a0 += 16; + } + } + } +} diff --git a/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c b/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c deleted file mode 100644 index ea2600067..000000000 --- a/kernel/x86_64/sbgemm_kernel_32x8_cooperlake.c +++ /dev/null @@ -1,32 +0,0 @@ -/*************************************************************************** -Copyright (c) 2021, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) -{ -} diff --git a/kernel/x86_64/sbgemm_ncopy_32_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c similarity index 100% rename from kernel/x86_64/sbgemm_ncopy_32_cooperlake.c rename to kernel/x86_64/sbgemm_ncopy_16_cooperlake.c diff --git a/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c new file mode 100644 index 000000000..523e3b48f --- /dev/null +++ b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c @@ -0,0 +1,207 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + +#define REORDER_4x32(r0, r1, r2, r3) {\ + __m512i t0, t1, t2, t3; \ + t0 = _mm512_unpacklo_epi32(r0, r1); \ + t1 = _mm512_unpackhi_epi32(r0, r1); \ + t2 = _mm512_unpacklo_epi32(r2, r3); \ + t3 = _mm512_unpackhi_epi32(r2, r3); \ + r0 = _mm512_unpacklo_epi64(t0, t2); \ + r1 = _mm512_unpackhi_epi64(t0, t2); \ + r2 = _mm512_unpacklo_epi64(t1, t3); \ + r3 = _mm512_unpackhi_epi64(t1, t3); \ + t0 = _mm512_permutex2var_epi32(r0, idx_lo_128, r1); \ + t1 = _mm512_permutex2var_epi32(r0, idx_hi_128, r1); \ + t2 = _mm512_permutex2var_epi32(r2, idx_lo_128, r3); \ + t3 = _mm512_permutex2var_epi32(r2, idx_hi_128, r3); \ + r0 = _mm512_permutex2var_epi32(t0, idx_lo_256, t2); \ + r1 = _mm512_permutex2var_epi32(t1, idx_lo_256, t3); \ + r2 = _mm512_permutex2var_epi32(t0, idx_hi_256, t2); \ + r3 = _mm512_permutex2var_epi32(t1, idx_hi_256, t3); \ +} + +#define REORDER_4x8(r0, r1, r2, r3) {\ + __m128i t0, t1, t2, t3; \ + t0 = _mm_unpacklo_epi32(r0, r1); \ + t1 = _mm_unpackhi_epi32(r0, r1); \ + t2 = _mm_unpacklo_epi32(r2, r3); \ + t3 = _mm_unpackhi_epi32(r2, r3); \ + r0 = _mm_unpacklo_epi64(t0, t2); \ + r1 = _mm_unpackhi_epi64(t0, t2); \ + r2 = _mm_unpacklo_epi64(t1, t3); \ + r3 = _mm_unpackhi_epi64(t1, t3); \ +} + +#define GET_TAIL(tail, remain_m) \ + switch((remain_m + 1)/2) { \ + case 1: tail = r0; break; \ + case 2: tail = r1; break; \ + case 3: tail = r2; break; \ + case 4: tail = r3; break; \ + } + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + IFLOAT *aoffset; + IFLOAT *aoffset0, *aoffset1, *aoffset2, *aoffset3; + + IFLOAT *boffset; + + aoffset = a; + boffset = b; + + BLASLONG m32 = n & ~31; + BLASLONG m8 = n & ~7; + BLASLONG n4 = n & ~3; + + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + }; + __m512i idx_lo_128 = 
_mm512_loadu_si512(permute_table); + __m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32); + __m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48); + + for (j = 0; j < n4; j += 4) { + aoffset0 = aoffset; + aoffset1 = aoffset0 + lda; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset += 4 * lda; + + for (i = 0; i < m32; i += 32) { + __m512i r0, r1, r2, r3; + r0 = _mm512_loadu_si512(aoffset0 + i); + r1 = _mm512_loadu_si512(aoffset1 + i); + r2 = _mm512_loadu_si512(aoffset2 + i); + r3 = _mm512_loadu_si512(aoffset3 + i); + REORDER_4x32(r0, r1, r2, r3); + _mm512_storeu_si512(boffset + 32*0, r0); + _mm512_storeu_si512(boffset + 32*1, r1); + _mm512_storeu_si512(boffset + 32*2, r2); + _mm512_storeu_si512(boffset + 32*3, r3); + boffset += 32 * 4; + } + for (; i < m8; i += 8) { + __m128i r0 = _mm_loadu_si128(aoffset0 + i); + __m128i r1 = _mm_loadu_si128(aoffset1 + i); + __m128i r2 = _mm_loadu_si128(aoffset2 + i); + __m128i r3 = _mm_loadu_si128(aoffset3 + i); + REORDER_4x8(r0, r1, r2, r3); + _mm_storeu_si128(boffset + 8*0, r0); + _mm_storeu_si128(boffset + 8*1, r1); + _mm_storeu_si128(boffset + 8*2, r2); + _mm_storeu_si128(boffset + 8*3, r3); + boffset += 8 * 4; + } + if (i < m) { + int remain_m = m - i; + __mmask8 r_mask = (1UL << remain_m) - 1; + __m128i r0 = _mm_maskz_loadu_epi16(r_mask, aoffset0 + i); + __m128i r1 = _mm_maskz_loadu_epi16(r_mask, aoffset1 + i); + __m128i r2 = _mm_maskz_loadu_epi16(r_mask, aoffset2 + i); + __m128i r3 = _mm_maskz_loadu_epi16(r_mask, aoffset3 + i); + REORDER_4x8(r0, r1, r2, r3); + + // store should skip the tail odd line + int num_store = remain_m/2; + switch(num_store) { + case 3: _mm_storeu_si128(boffset + 8*2, r0); + case 2: _mm_storeu_si128(boffset + 8*1, r0); + case 1: _mm_storeu_si128(boffset + 8*0, r0); + } + boffset += 8 * num_store; + + if (m & 0x1) { // handling the tail + __m128i tail; + GET_TAIL(tail, remain_m); + /* tail vector is fill with zero like: + * a, 0, b, 0, c, 0, d, 0 + * need to extract lo words of data and store + */ + tail = _mm_cvtepi32_epi16(tail); + _mm_store_sd(boffset, (__m128d) tail); // only lower 4 bfloat valid + boffset += 4; + } + } + } + if (j < n) { + int remain_n = n - j; + __mmask8 nmask = (1UL << remain_n) - 1; + aoffset0 = aoffset; + aoffset1 = aoffset0 + lda; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + __m128i r0, r1, r2, r3; + for (i = 0; i < m8; i += 8) { + switch (remain_n) { + case 3: r2 = _mm_loadu_si128(aoffset2 + i); + case 2: r1 = _mm_loadu_si128(aoffset1 + i); + case 1: r0 = _mm_loadu_si128(aoffset0 + i); + } + REORDER_4x8(r0, r1, r2, r3); + _mm_mask_storeu_epi16(boffset + remain_n * 0, nmask, r0); + _mm_mask_storeu_epi16(boffset + remain_n * 1, nmask, r1); + _mm_mask_storeu_epi16(boffset + remain_n * 2, nmask, r2); + _mm_mask_storeu_epi16(boffset + remain_n * 3, nmask, r3); + boffset += 4 * remain_n; + } + if (i < m) { + int remain_m = m - i; + __mmask8 mmask = (1UL << remain_m) - 1; + switch (remain_n) { + case 3: r2 = _mm_maskz_loadu_epi16(mmask, aoffset2 + i); + case 2: r1 = _mm_maskz_loadu_epi16(mmask, aoffset1 + i); + case 1: r0 = _mm_maskz_loadu_epi16(mmask, aoffset0 + i); + } + REORDER_4x8(r0, r1, r2, r3); + + int num_store = remain_m/2; + switch (num_store) { + case 3: _mm_mask_storeu_epi16(boffset + remain_n * 2, nmask, r2); + case 2: _mm_mask_storeu_epi16(boffset + remain_n * 1, nmask, r1); + case 1: _mm_mask_storeu_epi16(boffset + remain_n * 0, nmask, r0); + } + boffset += 2 * num_store * remain_n; 
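/* review note: a worked example of the odd-m corner handled just below,
 * assuming m = 5 and i = 0 here (so remain_m = 5): num_store = 5/2 = 2 row
 * pairs were stored above, GET_TAIL selects r2 ((5+1)/2 = 3), whose even
 * 16-bit lanes hold row 4 and whose odd lanes are zero from the masked load -
 * the a, 0, b, 0, c, 0, d, 0 pattern noted in the earlier tail comment; the
 * _mm_cvtepi32_epi16 below keeps only the low half of each dword, recovering
 * the packed a, b, c, d values. */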
+ + if (m & 0x1) { + __m128i tail; + GET_TAIL(tail, remain_m); + tail = _mm_cvtepi32_epi16(tail); + _mm_mask_storeu_epi16(boffset, nmask, tail); + } + } + } +} diff --git a/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c similarity index 71% rename from kernel/x86_64/sbgemm_tcopy_32_cooperlake.c rename to kernel/x86_64/sbgemm_tcopy_16_cooperlake.c index 3e37473ca..16bf48f0b 100644 --- a/kernel/x86_64/sbgemm_tcopy_32_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c @@ -32,23 +32,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - IFLOAT *boffset; + IFLOAT *boffset0, *boffset1; - boffset = b; + boffset0 = b; BLASLONG n32 = n & ~31; BLASLONG m4 = m & ~3; BLASLONG m2 = m & ~1; uint32_t permute_table = { - 0, 0x10|0, 1, 0x10|1, 2, 0x10|2, 3, 0x10|3, 4, 0x10|4, 5, 0x10|5, 6, 0x10|6, 7, 0x10, 7, - 8, 0x10|8, 9, 0x10|9, 10, 0x10|10, 11, 0x10|11, 12, 0x10|12, 13, 0x10|13, 14, 0x10|14, 15, 0x10|15, + 0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17, + 0x08, 0x18, 0x09, 0x19, 0x0a, 0x1a, 0x0b, 0x1b, 0x0c, 0x1c, 0x0d, 0x1d, 0x0e, 0x1e, 0x0f, 0x1f, }; __m512i idx_lo = _mm512_loadu_si512(permute_table); __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); for (j = 0; j < n32; j += 32) { + /* process 2x16 n at the same time */ + boffset1 = boffset0 + m * 16; for (i = 0; i < m4; i += 4) { /* bf16 fma need special memory layout: * for memory layout like below: @@ -72,11 +74,12 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ a2 = _mm512_permutex2var_epi32(a10, idx_lo, a11); a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11); - _mm512_storeu_si512(boffset, a0); - _mm512_storeu_si512(boffset + 32, a1); - _mm512_storeu_si512(boffset + 64, a2); - _mm512_storeu_si512(boffset + 96, a3); - boffset += 128; + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + _mm512_storeu_si512(boffset0 + 32, a2); + _mm512_storeu_si512(boffset1 + 32, a3); + boffset0 += 64; + boffset1 += 64; } for (; i < m2; i += 2) { __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); @@ -88,22 +91,29 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); - _mm512_storeu_si512(boffset, a0); - _mm512_storeu_si512(boffset + 32, a1); - boffset += 64; + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + boffset0 += 32; + boffset1 += 32; } for (; i < m; i++) { /* just copy the only remains row */ - __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); - _mm512_storeu_si512(boffset, a0); - boffset += 32; + __m256i a0 = _mm256_loadu_si256(&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_loadu_si256(&a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256(boffset0, a0); + _mm256_storeu_si256(boffset1, a1); + boffset0 += 16; + boffset1 += 16; } + boffset0 = boffset1; } if (j < n) { uint32_t remains = n - j; __mmask32 r_mask = (1UL << remains) - 1; if (remains > 16) { - __mmask16 w_mask = (1UL << (remains - 16)) - 1; + boffset1 = boffset0 + m * 16; + uint32_t tail1 = remains - 16; + __mmask16 w_mask1 = (1UL << tail1) - 1; for (i = 0; i < m2; i += 2) { __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); @@ -114,9 +124,19 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT 
*b){ a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); - _mm512_storeu_si512(boffset, a0); - _mm512_mask_storeu_epi32(boffset + 32, w_mask, a1); - boffset += 2 * remains; + _mm512_storeu_si512(boffset0, a0); + _mm512_mask_storeu_epi32(boffset1, w_mask1, a1); + + boffset0 += 32; + boffset1 += 2 * tail1; + } + for (; i < m; i++) { + __m256i a0 = _mm256_loadu_si256(&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_maskz_loadu_epi16(w_mask1, &a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256(boffset0, a0); + _mm256_mask_storeu_epi16(boffset1, w_mask1, a1); + boffset0 += 16; + boffset1 += tail1; } } else { __mmask16 w_mask = (1UL << remains ) - 1; @@ -128,14 +148,15 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ __m512i a01 = _mm512_unpackhi_epi16(a0, a1); a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); - _mm512_mask_storeu_epi32(boffset, w_mask, a0); - boffset += 2 * remains; + + _mm512_mask_storeu_epi32(boffset0, w_mask, a0); + boffset0 += 2 * remains; + } + for (; i < m; i++) { + __m256i a0 = _mm256_maskz_loadu_epi16(w_mask, &a[(i + 0)*lda + j]); + _mm256_mask_storeu_epi16(boffset0, w_mask, a0); + boffset0 += remains; } - } - for (; i < m; i++) { - __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); - _mm512_mask_storeu_epi16(boffset, r_mask, a0); - boffset += remains; } } } diff --git a/kernel/x86_64/sbgemm_ncopy_8_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c similarity index 100% rename from kernel/x86_64/sbgemm_ncopy_8_cooperlake.c rename to kernel/x86_64/sbgemm_tcopy_4_cooperlake.c diff --git a/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c deleted file mode 100644 index afcf6f647..000000000 --- a/kernel/x86_64/sbgemm_tcopy_8_cooperlake.c +++ /dev/null @@ -1,33 +0,0 @@ -/*************************************************************************** -Copyright (c) 2021, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include -#include "common.h" - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - -} From 9df0953cde0833644155eb6f22d241fc773504a8 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Mon, 16 Aug 2021 19:39:24 +0800 Subject: [PATCH 428/681] sbgemm: cooperlake: kernel works for NN --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 375 +++++++++++++++++- kernel/x86_64/sbgemm_ncopy_4_cooperlake.c | 51 +-- kernel/x86_64/sbgemm_tcopy_16_cooperlake.c | 259 ++++++------ 3 files changed, 515 insertions(+), 170 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index 05ba015d2..d604235c9 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -31,8 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VMOVLDUP(addr, zmm) asm("vmovsldup (%1), %0": "=v"(zmm): "r"(addr)) #define VMOVHDUP(addr, zmm) asm("vmovshdup (%1), %0": "=v"(zmm): "r"(addr)) #define BROADCAST64(base, step, n, offset, zmm) \ - if (n == 0) asm("vbroadcastsd %2(%1), %0": "=v"(zmm): "r"(base), "n"(offset*2)); \ - else asm("vbroadcastsd %4(%1, %2, %3), %0": "=v"(zmm): "r"(base), "r"(step), "n"(n*2), "n"(offset*2)) + if (n == 0) asm("vbroadcastsd %c2(%1), %0": "=v"(zmm): "r"(base), "n"(offset*2)); \ + else asm("vbroadcastsd %c4(%1, %2, %c3), %0": "=v"(zmm): "r"(base), "r"(step), "n"(n*2), "n"(offset*2)) #define DECLARE_A_PAIR(A) \ __m512i A_lo_##A; __m512i A_hi_##A; @@ -41,8 +41,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. VMOVLDUP(ptr_a##A, A_lo_##A); \ VMOVHDUP(ptr_a##A, A_hi_##A); +#define MASK_LOAD_A_PAIR(A) { \ + __m512 tmp = _mm512_maskz_loadu_ps(mmask, ptr_a##A); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(tmp); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(tmp); \ +} + #define LOAD_A_PAIR_TAIL(A) { \ - __m256i ymm = _mm256_loadu_si256(ptr_a##A); \ + __m256i ymm = _mm256_loadu_si256((void *)ptr_a##A); \ + __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ +} + +#define MASK_LOAD_A_PAIR_TAIL(A) { \ + __m256i ymm = _mm256_maskz_loadu_epi16(mmask, ptr_a##A); \ __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ @@ -53,13 +66,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
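/* review note: the %2 -> %c2 / %4 -> %c4 change in BROADCAST64 above uses
 * GCC's "c" operand modifier, which prints a constant operand without the
 * immediate punctuation so it can serve as an address displacement. A minimal
 * sketch of the difference, assuming p is a valid pointer:
 *   asm("vbroadcastsd %1(%0), %%zmm0" :: "r"(p), "n"(8));   // emits "$8(%rax)": rejected by the assembler
 *   asm("vbroadcastsd %c1(%0), %%zmm0" :: "r"(p), "n"(8));  // emits "8(%rax)": assembles
 * The B_hi offset below also moves from 2 to 4 so the high broadcast lands on
 * the next 64-bit (two-column) group of the packed B panel. */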
#define BROADCAST_B_PAIR(Bx, By) \ BROADCAST64(ptr_b##Bx, n_blksize, By, 0, B_lo); \ - BROADCAST64(ptr_b##Bx, n_blksize, By, 2, B_hi); + BROADCAST64(ptr_b##Bx, n_blksize, By, 4, B_hi); + +#define MASK_BROADCAST_B_PAIR(Bx, x) {\ + __m128 xmm = _mm_maskz_loadu_ps(nmask, ptr_b##Bx); \ + B_lo = (__m512i) _mm512_broadcastsd_pd((__m128d) xmm); \ + B_hi = (__m512i) _mm512_broadcastsd_pd(_mm_permute_pd((__m128d) xmm, 0x1)); \ +} #define BROADCAST_B_PAIR_TAIL(Bx, By) {\ - __m128i xmm = (__m128i) _mm_load_sd(ptr_b##Bx + n_blksize * By); \ + __m128i xmm = (__m128i) _mm_load_sd((double *)(ptr_b##Bx + n_blksize * By)); \ + xmm = _mm_cvtepu16_epi32(xmm); \ + B_lo = _mm512_broadcast_i32x2(xmm); \ + B_hi = _mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ +} + +#define MASK_BROADCAST_B_PAIR_TAIL(Bx, By) {\ + __m128i xmm = _mm_maskz_loadu_epi16(nmask, ptr_b##Bx + n_blksize * By); \ xmm = _mm_cvtepu16_epi32(xmm); \ - B_lo = _mm512_broadcastd_epi32(xmm); \ - B_hi = _mm512_broadcastd_epi32((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ + B_lo = _mm512_broadcast_i32x2(xmm); \ + B_hi = _mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ } #define DECLARE_RESULT_4X(A, Bx, By) \ @@ -76,25 +102,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FMA(A_lo_##A, B_hi, result_10_##A##Bx##By); \ FMA(A_hi_##A, B_hi, result_11_##A##Bx##By); -#define STORE_4X(A, Bx, By) +#define _STORE_C_2nx16(addr, val0, val1) \ + asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \ + asm("vfmadd213ps (%1, %3, 4), %2, %0": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc)); \ + asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); \ + asm("vmovups %0, (%1, %2, 4)": : "v"(val1), "r"(addr), "r"(ldc)) + +#define _MASK_STORE_C_2nx16(addr, val0, val1) \ + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \ + asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "k"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); \ + asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "k"(mmask)) + +#define _REORDER_C_2X(result_0, result_1) { \ + __m512 tmp0, tmp1; \ + tmp0 = _mm512_unpacklo_ps(result_0, result_1); \ + tmp1 = _mm512_unpackhi_ps(result_0, result_1); \ + result_0 = (__m512) _mm512_unpacklo_pd((__m512d) tmp0, (__m512d) tmp1); \ + result_1 = (__m512) _mm512_unpackhi_pd((__m512d) tmp0, (__m512d) tmp1); \ +} + +#define _STORE_2X(ptr_c, result_0, result_1) {\ + _REORDER_C_2X(result_0, result_1) \ + _STORE_C_2nx16(ptr_c, result_0, result_1); \ + ptr_c += ldc * 2; \ +} + +#define _MASK_STORE_2X(ptr_c, result_0, result_1) {\ + _REORDER_C_2X(result_0, result_1) \ + _MASK_STORE_C_2nx16(ptr_c, result_0, result_1); \ + ptr_c += ldc * 2; \ +} + +#define STORE_4X(A, Bx, By) { \ + _STORE_2X(ptr_c##A, result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \ +} + +#define MASK_STORE_4X(A, Bx, By) { \ + _MASK_STORE_2X(ptr_c##A, result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _MASK_STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \ +} + +#define _STORE_C_16(addr, val0) \ + asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \ + asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); +#define _MASK_STORE_C_16(addr, val0) \ + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \ + asm("vmovups %0, (%1) 
%{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); + +#define N_STORE_4X(A, Bx, By) { \ + _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \ + switch(n_count) { \ + case 3: _STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \ + case 2: _STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \ + case 1: _STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \ + } \ + ptr_c##A += ldc * n_count; \ +} + +#define N_MASK_STORE_4X(A, Bx, By) { \ + _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \ + switch(n_count) { \ + case 3: _MASK_STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \ + case 2: _MASK_STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \ + case 1: _MASK_STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \ + } \ + ptr_c##A += ldc * n_count; \ +} int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) { - IFLOAT *ptr_a = A, *ptr_b = B, *ptr_c = C; + IFLOAT *ptr_a = A, *ptr_b = B; IFLOAT *ptr_b0, *ptr_b1; IFLOAT *ptr_a0, *ptr_a1; + FLOAT *ptr_c = C; + FLOAT *ptr_c0, *ptr_c1; BLASLONG n_count = n; BLASLONG m_count, k_count; BLASLONG n_blksize = 4 * k; + BLASLONG cn_offset = 0; + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); for (; n_count > 23; n_count -= 24) { + IFLOAT *ptr_b00 = ptr_b; + IFLOAT *ptr_b10 = ptr_b + n_blksize * 3; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; m_count = m; - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + n_blksize * 3; for (; m_count > 15; m_count -= 16) { - DECLARE_A_PAIR(0); DECLARE_B_PAIR(); + ptr_b0 = ptr_b00; + ptr_b1 = ptr_b10; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); for (k_count = k; k_count > 1; k_count -=2) { @@ -105,8 +209,8 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 24 * 2; - ptr_b1 += 24 * 2; + ptr_b0 += 4 * 2; + ptr_b1 += 4 * 2; ptr_a0 += 16 * 2; } if (k_count > 0) { @@ -117,10 +221,249 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 24; - ptr_b1 += 24; + ptr_b0 += 4; + ptr_b1 += 4; ptr_a0 += 16; } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); + STORE_4X(0, 1, 0); STORE_4X(0, 1, 1); STORE_4X(0, 1, 2); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + ptr_b1 = ptr_b10; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b0 += 4 * 2; + ptr_b1 += 4 * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) { + 
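/* review note: this tail runs when k is odd. vdpbf16ps always consumes k in
 * pairs, so the *_TAIL loaders widen the final bf16 row with
 * _mm512_cvtepu16_epi32 / _mm_cvtepu16_epi32: each dword lane then holds
 * (value, 0), and the dot product adds value*b + 0*0 - the missing second k
 * iteration is zero-padded instead of being read past the end of the panel. */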
MASK_LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); + ptr_b0 += 4; + ptr_b1 += 4; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); + MASK_STORE_4X(0, 1, 0); MASK_STORE_4X(0, 1, 1); MASK_STORE_4X(0, 1, 2); + ptr_c += m_count; + } + ptr_b += 24 * k; + cn_offset += 24; + } + for (; n_count > 11; n_count -= 12) { + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_a1 = ptr_a + 16 * k; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 31; m_count -= 32) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); DECLARE_A_PAIR(1); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(1, 0, 0); DECLARE_RESULT_4X(1, 0, 1); DECLARE_RESULT_4X(1, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); LOAD_A_PAIR(1); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); + ptr_b0 += 4 * 2; + ptr_a0 += 16 * 2; + ptr_a1 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); LOAD_A_PAIR_TAIL(1); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); + ptr_b0 += 4; + ptr_a0 += 16; + ptr_a1 += 16; + } + ptr_c0 = ptr_c; + ptr_c1 = ptr_c + 16; + STORE_4X(0, 0, 0); STORE_4X(1, 0, 0); + STORE_4X(0, 0, 1); STORE_4X(1, 0, 1); + STORE_4X(0, 0, 2); STORE_4X(1, 0, 2); + ptr_c += 16 * 2; + } + if (m > 31) { + ptr_a0 = ptr_a1; + } + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + ptr_a0 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + ptr_a0 += 16; + } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); + ptr_c += m_count; + } + 
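/* review note: B is packed in 4-column panels of n_blksize = 4*k bf16
 * elements, so finishing an n stripe advances ptr_b by stripe_width * k
 * elements: 24*k in the loop above, 12*k here, 4*k further down. For example,
 * with k = 768 (the SBGEMM_DEFAULT_Q chosen two patches later), a 12-wide
 * stripe spans 12*768*2 bytes = 18 KiB of packed B. */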
ptr_b += 12 * k; + cn_offset += 12; + } + for (; n_count > 3; n_count -= 4) { + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4 * 2; + ptr_a0 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4; + ptr_a0 += 16; + } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4 * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); + ptr_c += m_count; + } + ptr_b += 4 * k; + cn_offset += 4; + } + if (n_count > 0) { + __mmask8 nmask = (1UL << n_count) - 1; + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count * 2; + ptr_a0 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count; + ptr_a0 += 16; + } + ptr_c0 = ptr_c; + N_STORE_4X(0, 0, 0); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + N_MASK_STORE_4X(0, 0, 0); + ptr_c += m_count; } } + return 0; } diff --git a/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c index 523e3b48f..eefbd7355 100644 --- a/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c +++ b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c @@ -79,8 +79,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset = a; boffset = b; - BLASLONG m32 = n & ~31; - BLASLONG m8 = n & ~7; + BLASLONG m32 = m & ~31; + BLASLONG m8 = m & ~7; BLASLONG n4 = n & ~3; int permute_table[] = { @@ -115,15 +115,15 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset += 32 * 4; } for (; i < m8; i += 8) { - __m128i r0 = _mm_loadu_si128(aoffset0 + i); - __m128i r1 = _mm_loadu_si128(aoffset1 + i); - __m128i r2 = _mm_loadu_si128(aoffset2 + i); - __m128i r3 = _mm_loadu_si128(aoffset3 + i); + __m128i r0 = _mm_loadu_si128((void *)(aoffset0 + i)); + __m128i r1 = _mm_loadu_si128((void *)(aoffset1 + i)); + __m128i r2 = _mm_loadu_si128((void *)(aoffset2 + i)); + __m128i r3 = _mm_loadu_si128((void *)(aoffset3 + i)); REORDER_4x8(r0, r1, r2, r3); - _mm_storeu_si128(boffset + 8*0, r0); - 
_mm_storeu_si128(boffset + 8*1, r1); - _mm_storeu_si128(boffset + 8*2, r2); - _mm_storeu_si128(boffset + 8*3, r3); + _mm_storeu_si128((void *)(boffset + 8*0), r0); + _mm_storeu_si128((void *)(boffset + 8*1), r1); + _mm_storeu_si128((void *)(boffset + 8*2), r2); + _mm_storeu_si128((void *)(boffset + 8*3), r3); boffset += 8 * 4; } if (i < m) { @@ -138,9 +138,9 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ // store should skip the tail odd line int num_store = remain_m/2; switch(num_store) { - case 3: _mm_storeu_si128(boffset + 8*2, r0); - case 2: _mm_storeu_si128(boffset + 8*1, r0); - case 1: _mm_storeu_si128(boffset + 8*0, r0); + case 3: _mm_storeu_si128((void *)(boffset + 8*2), r2); + case 2: _mm_storeu_si128((void *)(boffset + 8*1), r1); + case 1: _mm_storeu_si128((void *)(boffset + 8*0), r0); } boffset += 8 * num_store; @@ -152,7 +152,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ * need to extract lo words of data and store */ tail = _mm_cvtepi32_epi16(tail); - _mm_store_sd(boffset, (__m128d) tail); // only lower 4 bfloat valid + _mm_store_sd((double *)boffset, (__m128d) tail); // only lower 4 bfloat valid boffset += 4; } } @@ -167,16 +167,16 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ __m128i r0, r1, r2, r3; for (i = 0; i < m8; i += 8) { switch (remain_n) { - case 3: r2 = _mm_loadu_si128(aoffset2 + i); - case 2: r1 = _mm_loadu_si128(aoffset1 + i); - case 1: r0 = _mm_loadu_si128(aoffset0 + i); + case 3: r2 = _mm_loadu_si128((void *)(aoffset2 + i)); + case 2: r1 = _mm_loadu_si128((void *)(aoffset1 + i)); + case 1: r0 = _mm_loadu_si128((void *)(aoffset0 + i)); } REORDER_4x8(r0, r1, r2, r3); - _mm_mask_storeu_epi16(boffset + remain_n * 0, nmask, r0); - _mm_mask_storeu_epi16(boffset + remain_n * 1, nmask, r1); - _mm_mask_storeu_epi16(boffset + remain_n * 2, nmask, r2); - _mm_mask_storeu_epi16(boffset + remain_n * 3, nmask, r3); - boffset += 4 * remain_n; + _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0); + _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1); + _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2); + _mm_mask_storeu_epi32(boffset + remain_n * 6, nmask, r3); + boffset += 8 * remain_n; } if (i < m) { int remain_m = m - i; @@ -190,9 +190,9 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ int num_store = remain_m/2; switch (num_store) { - case 3: _mm_mask_storeu_epi16(boffset + remain_n * 2, nmask, r2); - case 2: _mm_mask_storeu_epi16(boffset + remain_n * 1, nmask, r1); - case 1: _mm_mask_storeu_epi16(boffset + remain_n * 0, nmask, r0); + case 3: _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2); + case 2: _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1); + case 1: _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0); } boffset += 2 * num_store * remain_n; @@ -204,4 +204,5 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ } } } + return 0; } diff --git a/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c index 16bf48f0b..ce4458d2c 100644 --- a/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c @@ -29,134 +29,135 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
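/* review note: the hunk below replaces the whole function body. Besides
 * re-indenting, it fixes the earlier "uint32_t permute_table = {...}"
 * declaration (the [] was missing), and switches the permutation pattern from
 * single-dword interleave (0x00, 0x10, 0x01, 0x11, ...) to four-dword groups
 * (0x00-0x03 from the first source, 0x10-0x13 from the second, ...), matching
 * the panel layout the new 16x4 kernel expects. */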
#include #include "common.h" + int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - BLASLONG i, j; - - IFLOAT *boffset0, *boffset1; - - boffset0 = b; - - BLASLONG n32 = n & ~31; - BLASLONG m4 = m & ~3; - BLASLONG m2 = m & ~1; - - uint32_t permute_table = { - 0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13, 0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17, - 0x08, 0x18, 0x09, 0x19, 0x0a, 0x1a, 0x0b, 0x1b, 0x0c, 0x1c, 0x0d, 0x1d, 0x0e, 0x1e, 0x0f, 0x1f, - }; - - __m512i idx_lo = _mm512_loadu_si512(permute_table); - __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); - - for (j = 0; j < n32; j += 32) { - /* process 2x16 n at the same time */ - boffset1 = boffset0 + m * 16; - for (i = 0; i < m4; i += 4) { - /* bf16 fma need special memory layout: - * for memory layout like below: - * a00, a01, a02, a03, a04, a05 .... - * a10, a11, a12, a13, a14, a15 .... - * need to copy as: - * a00, a10, a01, a11, a02, a12, a03, a13, ... - */ - __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); - __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); - __m512i a2 = _mm512_loadu_si512(&a[(i + 2)*lda + j]); - __m512i a3 = _mm512_loadu_si512(&a[(i + 3)*lda + j]); - - __m512i a00 = _mm512_unpacklo_epi16(a0, a1); - __m512i a01 = _mm512_unpackhi_epi16(a0, a1); - __m512i a10 = _mm512_unpacklo_epi16(a2, a3); - __m512i a11 = _mm512_unpackhi_epi16(a2, a3); - - a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); - a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); - a2 = _mm512_permutex2var_epi32(a10, idx_lo, a11); - a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11); - - _mm512_storeu_si512(boffset0, a0); - _mm512_storeu_si512(boffset1, a1); - _mm512_storeu_si512(boffset0 + 32, a2); - _mm512_storeu_si512(boffset1 + 32, a3); - boffset0 += 64; - boffset1 += 64; - } - for (; i < m2; i += 2) { - __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); - __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); - - __m512i a00 = _mm512_unpacklo_epi16(a0, a1); - __m512i a01 = _mm512_unpackhi_epi16(a0, a1); - - a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); - a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); - - _mm512_storeu_si512(boffset0, a0); - _mm512_storeu_si512(boffset1, a1); - boffset0 += 32; - boffset1 += 32; - } - for (; i < m; i++) { - /* just copy the only remains row */ - __m256i a0 = _mm256_loadu_si256(&a[(i + 0)*lda + j]); - __m256i a1 = _mm256_loadu_si256(&a[(i + 0)*lda + j + 16]); - _mm256_storeu_si256(boffset0, a0); - _mm256_storeu_si256(boffset1, a1); - boffset0 += 16; - boffset1 += 16; - } - boffset0 = boffset1; - } - if (j < n) { - uint32_t remains = n - j; - __mmask32 r_mask = (1UL << remains) - 1; - if (remains > 16) { - boffset1 = boffset0 + m * 16; - uint32_t tail1 = remains - 16; - __mmask16 w_mask1 = (1UL << tail1) - 1; - for (i = 0; i < m2; i += 2) { - __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); - __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); - - __m512i a00 = _mm512_unpacklo_epi16(a0, a1); - __m512i a01 = _mm512_unpackhi_epi16(a0, a1); - - a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); - a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); - - _mm512_storeu_si512(boffset0, a0); - _mm512_mask_storeu_epi32(boffset1, w_mask1, a1); - - boffset0 += 32; - boffset1 += 2 * tail1; - } - for (; i < m; i++) { - __m256i a0 = _mm256_loadu_si256(&a[(i + 0)*lda + j]); - __m256i a1 = _mm256_maskz_loadu_epi16(w_mask1, &a[(i + 0)*lda + j + 16]); - _mm256_storeu_si256(boffset0, a0); - _mm256_mask_storeu_epi16(boffset1, w_mask1, a1); - boffset0 += 16; - 
boffset1 += tail1; - } - } else { - __mmask16 w_mask = (1UL << remains ) - 1; - for (i = 0; i < m2; i += 2) { - __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); - __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); - - __m512i a00 = _mm512_unpacklo_epi16(a0, a1); - __m512i a01 = _mm512_unpackhi_epi16(a0, a1); - - a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); - - _mm512_mask_storeu_epi32(boffset0, w_mask, a0); - boffset0 += 2 * remains; - } - for (; i < m; i++) { - __m256i a0 = _mm256_maskz_loadu_epi16(w_mask, &a[(i + 0)*lda + j]); - _mm256_mask_storeu_epi16(boffset0, w_mask, a0); - boffset0 += remains; - } - } - } + BLASLONG i, j; + + IFLOAT *boffset0, *boffset1; + + boffset0 = b; + + BLASLONG n32 = n & ~31; + BLASLONG m4 = m & ~3; + BLASLONG m2 = m & ~1; + + uint32_t permute_table[] = { + 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, + 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f, + }; + + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); + + for (j = 0; j < n32; j += 32) { + /* process 2x16 n at the same time */ + boffset1 = boffset0 + m * 16; + for (i = 0; i < m4; i += 4) { + /* bf16 fma need special memory layout: + * for memory layout like below: + * a00, a01, a02, a03, a04, a05 .... + * a10, a11, a12, a13, a14, a15 .... + * need to copy as: + * a00, a10, a01, a11, a02, a12, a03, a13, ... + */ + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); + __m512i a2 = _mm512_loadu_si512(&a[(i + 2)*lda + j]); + __m512i a3 = _mm512_loadu_si512(&a[(i + 3)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + __m512i a10 = _mm512_unpacklo_epi16(a2, a3); + __m512i a11 = _mm512_unpackhi_epi16(a2, a3); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + a2 = _mm512_permutex2var_epi32(a10, idx_lo, a11); + a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11); + + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + _mm512_storeu_si512(boffset0 + 32, a2); + _mm512_storeu_si512(boffset1 + 32, a3); + boffset0 += 64; + boffset1 += 64; + } + for (; i < m2; i += 2) { + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + boffset0 += 32; + boffset1 += 32; + } + for (; i < m; i++) { + /* just copy the only remains row */ + __m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256((void *)boffset0, a0); + _mm256_storeu_si256((void *)boffset1, a1); + boffset0 += 16; + boffset1 += 16; + } + boffset0 = boffset1; + } + if (j < n) { + uint32_t remains = n - j; + __mmask32 r_mask = (1UL << remains) - 1; + if (remains > 16) { + boffset1 = boffset0 + m * 16; + uint32_t tail1 = remains - 16; + __mmask16 w_mask1 = (1UL << tail1) - 1; + for (i = 0; i < m2; i += 2) { + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + + __m512i 
a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + + _mm512_storeu_si512(boffset0, a0); + _mm512_mask_storeu_epi32(boffset1, w_mask1, a1); + + boffset0 += 32; + boffset1 += 2 * tail1; + } + for (; i < m; i++) { + __m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_maskz_loadu_epi16(w_mask1, (void *)&a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256((void *)boffset0, a0); + _mm256_mask_storeu_epi16((void *)boffset1, w_mask1, a1); + boffset0 += 16; + boffset1 += tail1; + } + } else { + __mmask16 w_mask = (1UL << remains ) - 1; + for (i = 0; i < m2; i += 2) { + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + + _mm512_mask_storeu_epi32(boffset0, w_mask, a0); + boffset0 += 2 * remains; + } + for (; i < m; i++) { + __m256i a0 = _mm256_maskz_loadu_epi16(w_mask, &a[(i + 0)*lda + j]); + _mm256_mask_storeu_epi16(boffset0, w_mask, a0); + boffset0 += remains; + } + } + } } From 8356a604f0bab4844827a1b622aa5c481157bd4b Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 17 Aug 2021 19:35:40 +0800 Subject: [PATCH 429/681] sbgemm: cooperlake: tuning for block params --- driver/others/parameter.c | 1 + param.h | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 36da13369..d7dbddc7c 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -524,6 +524,7 @@ void blas_set_parameter(void){ xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M; #endif + sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; diff --git a/param.h b/param.h index 07397a66e..48770fa7a 100644 --- a/param.h +++ b/param.h @@ -1771,6 +1771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
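/* review note: rough footprint of the P/Q values added below: the packed A
 * block is P*Q bf16 values, i.e. 384*768*2 bytes = 576 KiB, presumably sized
 * to stay resident in a Cooper Lake core's L2 (1 MiB class) while the
 * 4-column B panels stream past it; R is left as the runtime value sbgemm_r,
 * which parameter.c (above) now derives from the remaining BUFFER_SIZE the
 * same way as sgemm_r. */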
#endif #define USE_SGEMM_KERNEL_DIRECT 1 +#undef SBGEMM_DEFAULT_UNROLL_N +#undef SBGEMM_DEFAULT_UNROLL_M +#undef SBGEMM_DEFAULT_P +#undef SBGEMM_DEFAULT_R +#undef SBGEMM_DEFAULT_Q +#define SBGEMM_DEFAULT_UNROLL_N 4 +#define SBGEMM_DEFAULT_UNROLL_M 16 +#define SBGEMM_DEFAULT_P 384 +#define SBGEMM_DEFAULT_Q 768 +#define SBGEMM_DEFAULT_R sbgemm_r + #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 From cece3541ab739f94add22fda840276033d0feb97 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 17 Aug 2021 21:13:29 +0800 Subject: [PATCH 430/681] sbgemm: cooperlake: fix bug in m64n12 --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index d604235c9..c257a3f60 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -306,9 +306,8 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * STORE_4X(0, 0, 1); STORE_4X(1, 0, 1); STORE_4X(0, 0, 2); STORE_4X(1, 0, 2); ptr_c += 16 * 2; - } - if (m > 31) { ptr_a0 = ptr_a1; + ptr_a1 = ptr_a0 + 16 * k; } for (; m_count > 15; m_count -= 16) { ptr_b0 = ptr_b00; From 45fdf951b64aa9145996727ecda901f00a2eda3c Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 17 Aug 2021 22:08:24 +0800 Subject: [PATCH 431/681] sbgemm: cooperlake: reorder ptr increase for performance --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index c257a3f60..4c1f50650 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -203,27 +203,27 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); for (k_count = k; k_count > 1; k_count -=2) { LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 4 * 2; ptr_b1 += 4 * 2; - ptr_a0 += 16 * 2; } if (k_count > 0) { LOAD_A_PAIR_TAIL(0); + ptr_a0 += 16; BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 4; ptr_b1 += 4; - ptr_a0 += 16; } ptr_c0 = ptr_c; STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); @@ -240,27 +240,27 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); for (k_count = k; k_count > 1; k_count -=2) { MASK_LOAD_A_PAIR(0); + ptr_a0 += m_count * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 4 * 2; ptr_b1 += 4 * 2; - ptr_a0 += m_count * 2; } 
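/* review note: the pattern of this commit is to bump each pointer immediately
 * after its last use instead of batching the increments at the bottom of the
 * loop - plausibly so the address arithmetic overlaps the vdpbf16ps dependency
 * chain and the next iteration's loads can issue earlier; the commit message
 * itself only states "reorder ptr increase for performance". */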
if (k_count > 0) { MASK_LOAD_A_PAIR_TAIL(0); + ptr_a0 += m_count; BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); - ptr_b0 += 4; ptr_b1 += 4; - ptr_a0 += m_count; } ptr_c0 = ptr_c; MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); @@ -284,21 +284,21 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_RESULT_4X(1, 0, 0); DECLARE_RESULT_4X(1, 0, 1); DECLARE_RESULT_4X(1, 0, 2); for (k_count = k; k_count > 1; k_count -=2) { LOAD_A_PAIR(0); LOAD_A_PAIR(1); + ptr_a0 += 16 * 2; + ptr_a1 += 16 * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); ptr_b0 += 4 * 2; - ptr_a0 += 16 * 2; - ptr_a1 += 16 * 2; } if (k_count > 0) { LOAD_A_PAIR_TAIL(0); LOAD_A_PAIR_TAIL(1); + ptr_a0 += 16; + ptr_a1 += 16; BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); ptr_b0 += 4; - ptr_a0 += 16; - ptr_a1 += 16; } ptr_c0 = ptr_c; ptr_c1 = ptr_c + 16; @@ -316,19 +316,19 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); for (k_count = k; k_count > 1; k_count -=2) { LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); ptr_b0 += 4 * 2; - ptr_a0 += 16 * 2; } if (k_count > 0) { LOAD_A_PAIR_TAIL(0); + ptr_a0 += 16; BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); ptr_b0 += 4; - ptr_a0 += 16; } ptr_c0 = ptr_c; STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); @@ -342,19 +342,19 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); for (k_count = k; k_count > 1; k_count -=2) { MASK_LOAD_A_PAIR(0); + ptr_a0 += m_count * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); ptr_b0 += 4 * 2; - ptr_a0 += m_count * 2; } if (k_count > 0) { MASK_LOAD_A_PAIR_TAIL(0); + ptr_a0 += m_count; BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); ptr_b0 += 4; - ptr_a0 += m_count; } ptr_c0 = ptr_c; MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); From 7a2d1601ec84c146b01eeb227d65b51c7855d1ef Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 17 Aug 2021 23:21:19 +0800 Subject: [PATCH 432/681] sbgemm: cooperlake: unroll core loop by 2 --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index 4c1f50650..0280b441e 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ 
b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -201,7 +201,31 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * DECLARE_B_PAIR(); DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); - for (k_count = k; k_count > 1; k_count -=2) { + k_count = k; + for (; k_count > 3; k_count -=4) { + LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + + LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + } + for (; k_count > 1; k_count -=2) { LOAD_A_PAIR(0); ptr_a0 += 16 * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); From bb1c4fa5bdf93724075ed400e3ff5bbdabd0b31a Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 18 Aug 2021 21:17:08 +0800 Subject: [PATCH 433/681] sbgemm: cooperlake: prefetch A & B --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index 0280b441e..7af51b6d8 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -64,6 +64,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
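/* Editor annotation, not part of the upstream patch: the PREFETCH_B macro
 * added below uses inline asm so that the prefetch distance and the
 * By * n_blksize panel offset fold into a single addressing mode. A
 * plain-intrinsics sketch of the By == 0 case, assuming only <immintrin.h>:
 *
 *   _mm_prefetch((const char *)ptr_b0 + PREFETCH_B_STEP * 2, _MM_HINT_T0);
 *
 * i.e. pull the packed-B bytes one prefetch step (counted in 2-byte bf16
 * elements) ahead into L1. */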
#define DECLARE_B_PAIR() \ __m512i B_lo; __m512i B_hi; +#define PREFETCH_B_STEP 32 +#define PREFETCH_B(Bx, By) \ + if (By == 0) asm("prefetcht0 %c1(%0)": : "r"(ptr_b##Bx), "n"(PREFETCH_B_STEP * 2)); \ + else asm("prefetcht0 %c3(%0, %1, %c2)": : "r"(ptr_b##Bx), "r"(n_blksize), "n"(By*2), "n"(PREFETCH_B_STEP * 2)) + #define BROADCAST_B_PAIR(Bx, By) \ BROADCAST64(ptr_b##Bx, n_blksize, By, 0, B_lo); \ BROADCAST64(ptr_b##Bx, n_blksize, By, 4, B_hi); @@ -204,17 +209,19 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * k_count = k; for (; k_count > 3; k_count -=4) { LOAD_A_PAIR(0); + _mm_prefetch(ptr_a0 + 128, _MM_HINT_T0); ptr_a0 += 16 * 2; - BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); - BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); - BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + BROADCAST_B_PAIR(0, 0); PREFETCH_B(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); PREFETCH_B(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); PREFETCH_B(0, 2); MATMUL_4X(0, 0, 2); ptr_b0 += 4 * 2; - BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); - BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); - BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + BROADCAST_B_PAIR(1, 0); PREFETCH_B(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); PREFETCH_B(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); PREFETCH_B(1, 2); MATMUL_4X(0, 1, 2); ptr_b1 += 4 * 2; LOAD_A_PAIR(0); + _mm_prefetch(ptr_a0 + 128, _MM_HINT_T0); ptr_a0 += 16 * 2; BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); From 5fcacad32bb71fd6c6e04e078eeaf59120a9ba72 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 19 Aug 2021 00:08:06 +0800 Subject: [PATCH 434/681] sbgemm: cooperlake: implement tcopy_4 --- kernel/x86_64/sbgemm_tcopy_16_cooperlake.c | 1 + kernel/x86_64/sbgemm_tcopy_4_cooperlake.c | 86 ++++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c index ce4458d2c..88725f343 100644 --- a/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c @@ -160,4 +160,5 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ } } } + return 0; } diff --git a/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c index afcf6f647..74f30d44a 100644 --- a/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c @@ -26,8 +26,94 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
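/* Editor annotation, not part of the upstream patch: tcopy_4 packs the source
 * into 4-wide panels, pairing two m-consecutive bf16 values into each 32-bit
 * lane (m runs along k at the sbgemm call sites), which is the operand layout
 * the vdpbf16ps kernels consume. A scalar sketch of the layout built below
 * for one full 4-column panel, boundary handling omitted:
 *
 *   for (i = 0; i < m; i += 2)
 *     for (int c = 0; c < 4; c++) {
 *       *b++ = a[(i + 0) * lda + j + c];   // element k
 *       *b++ = a[(i + 1) * lda + j + c];   // element k+1, interleaved
 *     }
 *
 * The SSE unpacklo/unpackhi pairs below produce this interleaving, writing
 * two such panels (boffset0, boffset1) per 8-column strip. */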
*****************************************************************************/ #include <stdio.h> +#include <immintrin.h> #include "common.h" int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + IFLOAT *boffset0, *boffset1; + + boffset0 = b; + + BLASLONG n8 = n & ~7; + BLASLONG m4 = m & ~3; + BLASLONG m2 = m & ~1; + + for (j = 0; j < n8; j += 8) { + boffset1 = boffset0 + m * 4; + for (i = 0; i < m4; i +=4) { + __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); + __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); + __m128i a2 = _mm_loadu_si128((void *)&a[(i + 2)*lda + j]); + __m128i a3 = _mm_loadu_si128((void *)&a[(i + 3)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = _mm_unpackhi_epi16(a0, a1); + __m128i a10 = _mm_unpacklo_epi16(a2, a3); + __m128i a11 = _mm_unpackhi_epi16(a2, a3); + _mm_storeu_si128((void *)(boffset0 + 0), a00); + _mm_storeu_si128((void *)(boffset0 + 8), a10); + _mm_storeu_si128((void *)(boffset1 + 0), a01); + _mm_storeu_si128((void *)(boffset1 + 8), a11); + boffset0 += 16; + boffset1 += 16; + } + for (; i < m2; i+= 2) { + __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); + __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = _mm_unpackhi_epi16(a0, a1); + _mm_storeu_si128((void *)(boffset0 + 0), a00); + _mm_storeu_si128((void *)(boffset1 + 0), a01); + boffset0 += 8; + boffset1 += 8; + } + for (; i < m; i++) { + __m128d a0 = _mm_loadu_pd((void *)&a[(i + 0)*lda + j]); + _mm_store_sd((void *)boffset0, a0); + _mm_store_sd((void *)boffset1, _mm_permute_pd(a0, 0x1)); + boffset0 += 4; + boffset1 += 4; + } + boffset0 = boffset1; + } + if (j < n) { + uint32_t remains = n - j; + __mmask8 r_mask = (1UL << remains) - 1; + if (remains > 4) { + boffset1 = boffset0 + m * 4; + uint32_t tail1 = remains - 4; + __mmask8 w_mask1 = (1UL << tail1) - 1; + for (i = 0; i < m2; i += 2) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = _mm_unpackhi_epi16(a0, a1); + _mm_storeu_si128((void *)boffset0, a00); + _mm_mask_storeu_epi32((void *)boffset1, w_mask1, a01); + boffset0 += 8; + boffset1 += 2 * tail1; + } + for (; i < m; i++) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + _mm_store_sd((void *)boffset0, (__m128d) a0); + _mm_mask_storeu_epi16((void *)boffset1, w_mask1, (__m128i) _mm_permute_pd((__m128d) a0, 0x1)); + boffset0 += 4; + boffset1 += tail1; + } + } else { + for (i = 0; i < m2; i += 2) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + _mm_mask_storeu_epi32((void *)boffset0, r_mask, a00); + boffset0 += 2 * remains; + } + for (; i < m; i++) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + _mm_mask_storeu_epi16((void *)boffset0, r_mask, a0); + } + } + } + return 0; } From beccb83b167b50e3742aa113aab51e57d0e9baa2 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 19 Aug 2021 19:46:08 +0800 Subject: [PATCH 435/681] sbgemm: cooperlake: add n24 kernel for tcopy_4 --- kernel/x86_64/sbgemm_tcopy_4_cooperlake.c | 101 +++++++++++++++++++++- 1 file changed, 99 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c index 74f30d44a..e9edd4571 100644 ---
a/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c +++ b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c @@ -29,6 +29,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" +#define STORE_VEC(Bx, By, vec) \ + if (By == 0) asm("vmovdqu16 %0, (%1)": : "v"(vec), "r"(boffset##Bx)); \ + else asm("vmovdqu16 %0, (%1, %2, %c3)": : "v"(vec), "r"(boffset##Bx), "r"(blk_size), "n"(By * 2)); + int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; @@ -36,13 +40,106 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset0 = b; + BLASLONG n24 = n - (n % 24); BLASLONG n8 = n & ~7; + BLASLONG m8 = m & ~7; BLASLONG m4 = m & ~3; BLASLONG m2 = m & ~1; - for (j = 0; j < n8; j += 8) { + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + }; + + j = 0; + if (n > 23) { + /* n = 24 is the max width in current blocking setting */ + __m512i idx_lo_128 = _mm512_loadu_si512(permute_table); + __m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32); + __m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48); + __mmask32 mask24 = (1UL << 24) - 1; + BLASLONG blk_size = m * 4; + BLASLONG stride = blk_size * 3; + + for (; j < n24; j += 24) { + boffset1 = boffset0 + stride; + for (i = 0; i < m8; i += 8) { + __m512i r0, r1, r2, r3, r4, r5, r6, r7; + __m512i t0, t1, t2, t3, t4, t5, t6, t7; + r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]); + r1 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]); + r2 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 2)*lda + j]); + r3 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 3)*lda + j]); + r4 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 4)*lda + j]); + r5 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 5)*lda + j]); + r6 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 6)*lda + j]); + r7 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 7)*lda + j]); + + t0 = _mm512_unpacklo_epi16(r0, r1); + t1 = _mm512_unpackhi_epi16(r0, r1); + t2 = _mm512_unpacklo_epi16(r2, r3); + t3 = _mm512_unpackhi_epi16(r2, r3); + t4 = _mm512_unpacklo_epi16(r4, r5); + t5 = _mm512_unpackhi_epi16(r4, r5); + t6 = _mm512_unpacklo_epi16(r6, r7); + t7 = _mm512_unpackhi_epi16(r6, r7); + + r0 = _mm512_permutex2var_epi32(t0, idx_lo_128, t2); + r1 = _mm512_permutex2var_epi32(t1, idx_lo_128, t3); + r2 = _mm512_permutex2var_epi32(t4, idx_lo_128, t6); + r3 = _mm512_permutex2var_epi32(t5, idx_lo_128, t7); + r4 = _mm512_permutex2var_epi32(t0, idx_hi_128, t2); + r5 = _mm512_permutex2var_epi32(t1, idx_hi_128, t3); + r6 = _mm512_permutex2var_epi32(t4, idx_hi_128, t6); + r7 = _mm512_permutex2var_epi32(t5, idx_hi_128, t7); + + t0 = _mm512_permutex2var_epi32(r0, idx_lo_256, r2); + t1 = _mm512_permutex2var_epi32(r1, idx_lo_256, r3); + t2 = _mm512_permutex2var_epi32(r4, idx_lo_256, r6); + t3 = _mm512_permutex2var_epi32(r5, idx_lo_256, r7); + t4 = _mm512_permutex2var_epi32(r0, idx_hi_256, r2); + t5 = _mm512_permutex2var_epi32(r1, idx_hi_256, r3); + + STORE_VEC(0, 0, t0); STORE_VEC(0, 1, t1); STORE_VEC(0, 2, t2); + STORE_VEC(1, 0, t3); STORE_VEC(1, 1, t4); STORE_VEC(1, 2, t5); + boffset0 += 32; + boffset1 += 32; + } + for (; i < m2; i += 2) { + __m512i r0, r1, 
t0, t1; + r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]); + r1 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]); + t0 = _mm512_unpacklo_epi16(r0, r1); + t1 = _mm512_unpackhi_epi16(r0, r1); + STORE_VEC(0, 0, _mm512_extracti32x4_epi32(t0, 0)); + STORE_VEC(0, 1, _mm512_extracti32x4_epi32(t1, 0)); + STORE_VEC(0, 2, _mm512_extracti32x4_epi32(t0, 1)); + STORE_VEC(1, 0, _mm512_extracti32x4_epi32(t1, 1)); + STORE_VEC(1, 1, _mm512_extracti32x4_epi32(t0, 2)); + STORE_VEC(1, 2, _mm512_extracti32x4_epi32(t1, 2)); + boffset0 += 8; + boffset1 += 8; + } + for (; i < m; i++) { + *(uint64_t *)(boffset0 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 0]; + *(uint64_t *)(boffset0 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 4]; + *(uint64_t *)(boffset0 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 8]; + *(uint64_t *)(boffset1 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 12]; + *(uint64_t *)(boffset1 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 16]; + *(uint64_t *)(boffset1 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 20]; + boffset0 += 4; + boffset1 += 4; + } + boffset0 += stride * 2; + } + } + + for (; j < n8; j += 8) { boffset1 = boffset0 + m * 4; - for (i = 0; i < m4; i +=4) { + for (i = 0; i < m4; i += 4) { __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); __m128i a2 = _mm_loadu_si128((void *)&a[(i + 2)*lda + j]); From 682d66555d050dd31a48e5337815b5e1422d8f80 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 20 Aug 2021 22:01:00 +0800 Subject: [PATCH 436/681] sbgemm: cooperlake: implement ncopy_16 --- kernel/x86_64/sbgemm_ncopy_16_cooperlake.c | 320 +++++++++++++++++++++ 1 file changed, 320 insertions(+) diff --git a/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c index afcf6f647..95ed82d7c 100644 --- a/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c +++ b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c @@ -26,8 +26,328 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
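/* Editor annotation, not part of the upstream patch: ncopy_16 packs 16-row
 * panels of untransposed A, again pairing k-consecutive bf16 elements per
 * 32-bit lane. The 8x32 transpose in REORDER_8x32 below works in three
 * stages: 32-bit unpacks within 128-bit lanes, vshufps (imm 0x4E) plus the
 * kc/k3 mask blends to swap 64-bit halves, and vpermt2d through
 * idx_lo/idx_hi to exchange 128-bit blocks across the two registers. */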
*****************************************************************************/ #include <stdio.h> +#include <immintrin.h> #include "common.h" +#define _MM512_SHUFFLE_i32(result, in1, in2, imm8) \ + asm("vshufps %3, %2, %1, %0": "=v"(result): "v"(in1), "v"(in2), "N"(imm8)) + +#define REORDER_8x32(t0, t1, t2, t3, t4, t5, t6, t7) { \ + __m512i v; \ + t0 = _mm512_unpacklo_epi32(r0, r1); \ + t1 = _mm512_unpackhi_epi32(r0, r1); \ + t2 = _mm512_unpacklo_epi32(r2, r3); \ + t3 = _mm512_unpackhi_epi32(r2, r3); \ + t4 = _mm512_unpacklo_epi32(r4, r5); \ + t5 = _mm512_unpackhi_epi32(r4, r5); \ + t6 = _mm512_unpacklo_epi32(r6, r7); \ + t7 = _mm512_unpackhi_epi32(r6, r7); \ + _MM512_SHUFFLE_i32(v, t0, t2, 0x4E); \ + r0 = _mm512_mask_blend_epi32(kc, t0, v); \ + r1 = _mm512_mask_blend_epi32(k3, t2, v); \ + _MM512_SHUFFLE_i32(v, t1, t3, 0x4E); \ + r2 = _mm512_mask_blend_epi32(kc, t1, v); \ + r3 = _mm512_mask_blend_epi32(k3, t3, v); \ + _MM512_SHUFFLE_i32(v, t4, t6, 0x4E); \ + r4 = _mm512_mask_blend_epi32(kc, t4, v); \ + r5 = _mm512_mask_blend_epi32(k3, t6, v); \ + _MM512_SHUFFLE_i32(v, t5, t7, 0x4E); \ + r6 = _mm512_mask_blend_epi32(kc, t5, v); \ + r7 = _mm512_mask_blend_epi32(k3, t7, v); \ + t0 = _mm512_permutex2var_epi32(r0, idx_lo, r4); \ + t1 = _mm512_permutex2var_epi32(r1, idx_lo, r5); \ + t2 = _mm512_permutex2var_epi32(r2, idx_lo, r6); \ + t3 = _mm512_permutex2var_epi32(r3, idx_lo, r7); \ + t4 = _mm512_permutex2var_epi32(r0, idx_hi, r4); \ + t5 = _mm512_permutex2var_epi32(r1, idx_hi, r5); \ + t6 = _mm512_permutex2var_epi32(r2, idx_hi, r6); \ + t7 = _mm512_permutex2var_epi32(r3, idx_hi, r7); \ +} + +#define STORE_512_LO(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + _mm512_storeu_si512(boffset0 + x*32, v); + +#define STORE_512_HI(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ + _mm512_storeu_si512(boffset0 + (x + 8)*32, v); + +#define MASK_STORE_512_LO(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + _mm512_mask_storeu_epi32(boffset0 + 2*x*remain_n, nmask, v); + +#define MASK_STORE_512_HI(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ + _mm512_mask_storeu_epi32(boffset0 + 2*(x + 8)*remain_n, nmask, v); + +#define STORE_512(x, y) {\ + __m512i v; \ + if (x == 0) { STORE_512_LO(y); } \ + else { STORE_512_HI(y); } \ +} + +#define MASK_STORE_512(x, y) {\ + __m512i v; \ + if (x == 0) { MASK_STORE_512_LO(y); } \ + else { MASK_STORE_512_HI(y); } \ +} + +#define SET_TAIL(y, x) {\ + if (y == 0) tail = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + else tail = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ +} + +#define GET_TAIL() \ + switch (n_store + 1) { \ + case 16: SET_TAIL(1, 7); break; \ + case 15: SET_TAIL(1, 6); break; \ + case 14: SET_TAIL(1, 5); break; \ + case 13: SET_TAIL(1, 4); break; \ + case 12: SET_TAIL(1, 3); break; \ + case 11: SET_TAIL(1, 2); break; \ + case 10: SET_TAIL(1, 1); break; \ + case 9: SET_TAIL(1, 0); break; \ + case 8: SET_TAIL(0, 7); break; \ + case 7: SET_TAIL(0, 6); break; \ + case 6: SET_TAIL(0, 5); break; \ + case 5: SET_TAIL(0, 4); break; \ + case 4: SET_TAIL(0, 3); break; \ + case 3: SET_TAIL(0, 2); break; \ + case 2: SET_TAIL(0, 1); break; \ + case 1: SET_TAIL(0, 0); break; \ + } + + int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *boffset0; + IFLOAT *aoffset; + IFLOAT *aoffset00, *aoffset01, *aoffset02, *aoffset03, *aoffset04, *aoffset05, *aoffset06, *aoffset07; + IFLOAT *aoffset10, *aoffset11, *aoffset12, *aoffset13, *aoffset14, *aoffset15, *aoffset16, *aoffset17; +
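/* Editor annotation: one pointer per source row; rows 0-7 (aoffset00..07)
 * are transposed into t00..t07 and rows 8-15 (aoffset10..17) into t10..t17,
 * and the 64-bit vpermt2q in STORE_512 joins the matching halves of each
 * t0x/t1x pair into one packed 512-bit store. */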
aoffset = a; + boffset0 = b; + + BLASLONG n16 = n & ~15; + BLASLONG m32 = m & ~31; + + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + }; + u_int64_t permute_table2[] = { + 0x00, 0x01, 0x02, 0x03, 8|0x0, 8|0x1, 8|0x2, 8|0x3, + 0x04, 0x05, 0x06, 0x07, 8|0x4, 8|0x5, 8|0x6, 8|0x7, + }; + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo2 = _mm512_loadu_si512(permute_table2); + __m512i idx_hi2 = _mm512_loadu_si512(permute_table2 + 8); + __mmask16 kc = 0xcccc; + __mmask16 k3 = 0x3333; + __m512i r0, r1, r2, r3, r4, r5, r6, r7; + __m512i t00, t01, t02, t03, t04, t05, t06, t07; + __m512i t10, t11, t12, t13, t14, t15, t16, t17; + + for (j = 0; j < n16; j += 16) { + aoffset00 = aoffset; + aoffset01 = aoffset00 + lda; + aoffset02 = aoffset01 + lda; + aoffset03 = aoffset02 + lda; + aoffset04 = aoffset03 + lda; + aoffset05 = aoffset04 + lda; + aoffset06 = aoffset05 + lda; + aoffset07 = aoffset06 + lda; + aoffset10 = aoffset07 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset17 = aoffset16 + lda; + aoffset += 16 * lda; + for (i = 0; i < m32; i += 32) { + r0 = _mm512_loadu_si512(aoffset00 + i); + r1 = _mm512_loadu_si512(aoffset01 + i); + r2 = _mm512_loadu_si512(aoffset02 + i); + r3 = _mm512_loadu_si512(aoffset03 + i); + r4 = _mm512_loadu_si512(aoffset04 + i); + r5 = _mm512_loadu_si512(aoffset05 + i); + r6 = _mm512_loadu_si512(aoffset06 + i); + r7 = _mm512_loadu_si512(aoffset07 + i); + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + r0 = _mm512_loadu_si512(aoffset10 + i); + r1 = _mm512_loadu_si512(aoffset11 + i); + r2 = _mm512_loadu_si512(aoffset12 + i); + r3 = _mm512_loadu_si512(aoffset13 + i); + r4 = _mm512_loadu_si512(aoffset14 + i); + r5 = _mm512_loadu_si512(aoffset15 + i); + r6 = _mm512_loadu_si512(aoffset16 + i); + r7 = _mm512_loadu_si512(aoffset17 + i); + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + STORE_512(0, 0); STORE_512(0, 1); STORE_512(0, 2); STORE_512(0, 3); + STORE_512(0, 4); STORE_512(0, 5); STORE_512(0, 6); STORE_512(0, 7); + STORE_512(1, 0); STORE_512(1, 1); STORE_512(1, 2); STORE_512(1, 3); + STORE_512(1, 4); STORE_512(1, 5); STORE_512(1, 6); STORE_512(1, 7); + boffset0 += 16 * 32; + } + if (i < m) { + int remain_m = m - i; + __mmask32 mmask = (1UL << remain_m) - 1; + r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i); + r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i); + r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i); + r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i); + r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i); + r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i); + r6 = _mm512_maskz_loadu_epi16(mmask, aoffset06 + i); + r7 = _mm512_maskz_loadu_epi16(mmask, aoffset07 + i); + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i); + r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i); + r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i); + r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i); + r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i); + r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i); + r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i); + r7 = _mm512_maskz_loadu_epi16(mmask, aoffset17 + i); + 
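/* Editor annotation: in this remain_m tail, the maskz loads above zero-fill
 * rows past m so the full 8x32 transpose is reused unchanged; the switch
 * below then stores only n_store = remain_m / 2 complete row-pair vectors,
 * and GET_TAIL plus cvtepi32_epi16 writes the final unpaired row when m is
 * odd. */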
REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + int n_store = remain_m/2; + switch (n_store) { + case 15: STORE_512(1, 6); + case 14: STORE_512(1, 5); + case 13: STORE_512(1, 4); + case 12: STORE_512(1, 3); + case 11: STORE_512(1, 2); + case 10: STORE_512(1, 1); + case 9: STORE_512(1, 0); + case 8: STORE_512(0, 7); + case 7: STORE_512(0, 6); + case 6: STORE_512(0, 5); + case 5: STORE_512(0, 4); + case 4: STORE_512(0, 3); + case 3: STORE_512(0, 2); + case 2: STORE_512(0, 1); + case 1: STORE_512(0, 0); + } + boffset0 += n_store * 32; + if (m & 0x1) { + __m512i tail; + GET_TAIL(); + _mm256_storeu_si256((void *)boffset0, _mm512_cvtepi32_epi16(tail)); + boffset0 += 16; + } + } + } + if (j < n) { + int remain_n = n - j; + __mmask16 nmask = (1UL << remain_n) - 1; + int load0, load1; + if (remain_n > 8) { + load0 = 8; + load1 = remain_n - 8; + } else { + load0 = remain_n; + load1 = 0; + } + aoffset00 = aoffset; + aoffset01 = aoffset00 + lda; + aoffset02 = aoffset01 + lda; + aoffset03 = aoffset02 + lda; + aoffset04 = aoffset03 + lda; + aoffset05 = aoffset04 + lda; + aoffset06 = aoffset05 + lda; + aoffset07 = aoffset06 + lda; + aoffset10 = aoffset07 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset17 = aoffset16 + lda; + aoffset += 16 * lda; + for (i = 0; i < m32; i += 32) { + switch (load0) { + case 8: r7 = _mm512_loadu_si512(aoffset07 + i); + case 7: r6 = _mm512_loadu_si512(aoffset06 + i); + case 6: r5 = _mm512_loadu_si512(aoffset05 + i); + case 5: r4 = _mm512_loadu_si512(aoffset04 + i); + case 4: r3 = _mm512_loadu_si512(aoffset03 + i); + case 3: r2 = _mm512_loadu_si512(aoffset02 + i); + case 2: r1 = _mm512_loadu_si512(aoffset01 + i); + case 1: r0 = _mm512_loadu_si512(aoffset00 + i); + } + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + switch (load1) { + case 8: r7 = _mm512_loadu_si512(aoffset17 + i); + case 7: r6 = _mm512_loadu_si512(aoffset16 + i); + case 6: r5 = _mm512_loadu_si512(aoffset15 + i); + case 5: r4 = _mm512_loadu_si512(aoffset14 + i); + case 4: r3 = _mm512_loadu_si512(aoffset13 + i); + case 3: r2 = _mm512_loadu_si512(aoffset12 + i); + case 2: r1 = _mm512_loadu_si512(aoffset11 + i); + case 1: r0 = _mm512_loadu_si512(aoffset10 + i); + } + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + MASK_STORE_512(0, 0); MASK_STORE_512(0, 1); MASK_STORE_512(0, 2); MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); MASK_STORE_512(0, 5); MASK_STORE_512(0, 6); MASK_STORE_512(0, 7); + MASK_STORE_512(1, 0); MASK_STORE_512(1, 1); MASK_STORE_512(1, 2); MASK_STORE_512(1, 3); + MASK_STORE_512(1, 4); MASK_STORE_512(1, 5); MASK_STORE_512(1, 6); MASK_STORE_512(1, 7); + boffset0 += remain_n * 32; + } + if (i < m) { + int remain_m = m - i; + __mmask32 mmask = (1UL << remain_m) - 1; + switch (load0) { + case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset07 + i); + case 7: r6 = _mm512_maskz_loadu_epi16(mmask, aoffset06 + i); + case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i); + case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i); + case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i); + case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i); + case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i); + case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i); + } + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + switch (load1) { + case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset17 + i); + case 7: 
r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i); + case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i); + case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i); + case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i); + case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i); + case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i); + case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i); + } + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + int n_store = remain_m/2; + switch (n_store) { + case 15: MASK_STORE_512(1, 6); + case 14: MASK_STORE_512(1, 5); + case 13: MASK_STORE_512(1, 4); + case 12: MASK_STORE_512(1, 3); + case 11: MASK_STORE_512(1, 2); + case 10: MASK_STORE_512(1, 1); + case 9: MASK_STORE_512(1, 0); + case 8: MASK_STORE_512(0, 7); + case 7: MASK_STORE_512(0, 6); + case 6: MASK_STORE_512(0, 5); + case 5: MASK_STORE_512(0, 4); + case 4: MASK_STORE_512(0, 3); + case 3: MASK_STORE_512(0, 2); + case 2: MASK_STORE_512(0, 1); + case 1: MASK_STORE_512(0, 0); + } + boffset0 += n_store * remain_n * 2; + if (m & 0x1) { + __m512i tail; + GET_TAIL(); + _mm256_mask_storeu_epi16((void *)boffset0, nmask, _mm512_cvtepi32_epi16(tail)); + } + } + } + return 0; } From 59a1114d03b59794ae46eb6ae60b9a3b4b842709 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 7 Sep 2021 18:12:40 +0800 Subject: [PATCH 437/681] sbgemm: cooperlake: tuning for small matrix --- kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c index 823aafbdd..70becd9fa 100644 --- a/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c +++ b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c @@ -38,5 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
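/* Editor annotation, not part of the upstream patch: this permit hook gates
 * OpenBLAS's copy-free small-matrix path. The 256*256*256 MNK cutoff keeps
 * it to sizes where packing overhead dominates the run time, and N is
 * restricted to 8, 16 and 32, the widths the in-diff comment reports the
 * small kernel handles well. */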
int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) { - return 1; + double MNK = (double) M * (double) N * (double) K; + if (MNK > 256.0*256.0*256.0) // disable for big size matrix + return 0; + /* small matrix kernel works well for N = 8, 16, 32 */ + if (N == 8 || N == 16 || N == 32) + return 1; + return 0; } From 4289cf048dc1b5b735f65a3183f2c903c8f090bc Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 7 Sep 2021 18:34:26 +0800 Subject: [PATCH 438/681] sbgemm: avoid falling into SGEMM_KERNEL_DIRECT --- interface/gemm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/gemm.c b/interface/gemm.c index 6dcc54041..71cc77a1b 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -326,7 +326,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; -#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT) +#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) #ifdef DYNAMIC_ARCH if (support_avx512() ) #endif From 045ed5c91df1e4d330ff1a3e93a721f98552692b Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 7 Sep 2021 23:37:08 +0800 Subject: [PATCH 439/681] sbgemm: fix build error when BFLOAT16 is disabled --- driver/others/parameter.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/driver/others/parameter.c b/driver/others/parameter.c index d7dbddc7c..791e5dc27 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -524,7 +524,9 @@ void blas_set_parameter(void){ xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M; #endif +#ifdef BUILD_BFLOAT16 sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; +#endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; @@ -630,7 +632,9 @@ void blas_set_parameter(void){ xgemm_p = 16 * (size + 1); #endif +#ifdef BUILD_BFLOAT16 sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; +#endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; From b858e65476b0ece1ccd082c62dd23d5ff1cb44b0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 Sep 2021 10:51:59 +0200 Subject: [PATCH 440/681] migrate from deprecated ubuntu-16.04 vmImage --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index b1bded639..5d4a1ecd3 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -19,7 +19,7 @@ jobs: # of gcc / glibc - job: manylinux1_gcc pool: - vmImage: 'ubuntu-16.04' + vmImage: 'ubuntu-latest' steps: - script: | echo "FROM quay.io/pypa/manylinux1_x86_64 @@ -35,7 +35,7 @@ jobs: displayName: Run manylinux1 docker build - job: Intel_SDE_skx pool: - vmImage:
'ubuntu-16.04' + vmImage: 'ubuntu-latest' steps: - script: | # at the time of writing the available Azure Ubuntu vm image From 7f4aa106f27d11cfa7e394238f222cca4f93d1bd Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 8 Sep 2021 07:04:13 -0500 Subject: [PATCH 441/681] Fixing syntax error in makefile Fixing syntax issue in Makefile.power added by recent commit af19cda65aef4d033ae33213013c88b0a99f9da2 --- Makefile.power | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.power b/Makefile.power index 4e7478213..28a0bae08 100644 --- a/Makefile.power +++ b/Makefile.power @@ -12,7 +12,7 @@ endif ifeq ($(CORE), POWER10) ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math -ifeq ($(F_COMPILER, IBM) +ifeq ($(F_COMPILER), IBM) FCOMMON_OPT += -O2 -qrecur -qnosave else FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math From d17238599b573350b166973619039e67fba12fdd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 13:38:28 +0200 Subject: [PATCH 442/681] Add casts --- kernel/x86_64/dasum_microk_haswell-2.c | 16 ++++++++-------- kernel/x86_64/sasum_microk_haswell-2.c | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c index 4fc73ddd4..fd9da7ebe 100644 --- a/kernel/x86_64/dasum_microk_haswell-2.c +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -38,10 +38,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); for (i = 0; i < tail_index_AVX2; i += 16) { - accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask); - accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); - accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask); + accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask); + accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 4]), abs_mask); + accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask); + accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+12]), abs_mask); } accum_0 = accum_0 + accum_1 + accum_2 + accum_3; @@ -63,10 +63,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { - accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); - accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); - accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2); } accum_20 = accum_20 + accum_21 + accum_22 + accum_23; diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c index 8e6cb9a47..2eb5b9538 100644 --- a/kernel/x86_64/sasum_microk_haswell-2.c +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -38,10 +38,10 @@ static FLOAT 
sasum_kernel(BLASLONG n, FLOAT *x1) __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); for (i = 0; i < tail_index_AVX2; i += 32) { - accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); - accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask); - accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask); + accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask); + accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask); + accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+16]), abs_mask); + accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+24]), abs_mask); } accum_0 = accum_0 + accum_1 + accum_2 + accum_3; @@ -62,8 +62,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { - accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); } accum_20 += accum_21; From 20581bf303776f831c788ced24f179d720ec5c39 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 14:36:27 +0200 Subject: [PATCH 443/681] Remove unused variable --- interface/zsyr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/zsyr.c b/interface/zsyr.c index 71d4dbf29..c70bd819e 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -119,7 +119,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { FLOAT *buffer; - int trans, uplo; + int uplo; blasint info; FLOAT * ALPHA = &alpha; FLOAT alpha_r = ALPHA[0]; From ef2471203068b64d648b1495c9399bc18e802788 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 14:37:44 +0200 Subject: [PATCH 444/681] Move a conditionally used variable --- kernel/generic/dot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/generic/dot.c b/kernel/generic/dot.c index 5abbb735c..84568ee0b 100644 --- a/kernel/generic/dot.c +++ b/kernel/generic/dot.c @@ -47,7 +47,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -4; #if V_SIMD && !defined(DSDOT) const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); @@ -84,6 +83,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } dot = v_sum_f32(vsum0); #elif defined(DSDOT) + int n1 = n & -4; for (; i < n1; i += 4) { dot += (double) y[i] * (double) x[i] + (double) y[i+1] * (double) x[i+1] + (double) y[i+2] * (double) x[i+2] + (double) y[i+3] * (double) x[i+3] ; } @@ -92,6 +92,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #else + int n1 = n & -4; for (; i < n1; i += 4) { dot += y[i] * x[i] From 7d873a329f477c676b39719d4f83a87a506cc0b9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 14:38:47 +0200 Subject: [PATCH 445/681] Add ifdefs around conditionally used functions --- kernel/x86_64/sgemv_n_4.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 90865c4b3..0d8cada75 100644 --- a/kernel/x86_64/sgemv_n_4.c +++
b/kernel/x86_64/sgemv_n_4.c @@ -115,6 +115,8 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT #endif +#ifndef HAVE_SGEMV_N_SKYLAKE_KERNEL + #ifndef HAVE_KERNEL_4x2 static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -246,6 +248,8 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a #endif +#endif + static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) From 1085775bc68c7de6e4a93c0d920b5564c8e84706 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 15:05:55 +0200 Subject: [PATCH 446/681] really remove the unused variable --- interface/zsyr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/interface/zsyr.c b/interface/zsyr.c index c70bd819e..54fb8a4e9 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -130,7 +130,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO PRINT_DEBUG_CNAME; - trans = -1; uplo = -1; info = 0; From 0925dfe2c9a287f1fadfd20ea718e89b722c4de0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Sep 2021 15:30:19 +0200 Subject: [PATCH 447/681] One instance of kernel_4x1 is used even on SKX --- kernel/x86_64/sgemv_n_4.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 0d8cada75..e0778006f 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -172,6 +172,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT } +#endif #endif #ifndef HAVE_KERNEL_4x1 @@ -248,8 +249,6 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a #endif -#endif - static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) From 5e4f1e3677df7ca74fd9d3dd264de8ca095f0553 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:09:46 +0200 Subject: [PATCH 448/681] Remove BFLOAT16 from the task list of GenerateNamedObject --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0330b2ce7..ef7457135 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -132,7 +132,7 @@ endif () if (BUILD_BFLOAT16) message(STATUS "Building Half Precision") - list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing + # list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing endif () if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") From 1c0a8a714a5b00b1773c8a91b9cd155007b10480 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:10:58 +0200 Subject: [PATCH 449/681] Add defaults for SBGEMV kernels --- cmake/kernel.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 0c102bae5..09ca5eb57 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -134,6 +134,8 @@ if (BUILD_BFLOAT16) set(SHSWAPKERNEL ../arm/swap.c) set(TOBF16KERNEL ../x86_64/tobf16.c) set(BF16TOKERNEL ../x86_64/bf16to.c) + set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) endif () endmacro () From e02df9fc55d96388951901420d6be9ff9e404228 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:12:27 +0200 Subject: [PATCH 450/681] Propagate BUILD_BFLOAT16 to CFLAGS --- cmake/system.cmake | 3 
+++ 1 file changed, 3 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 7d2672998..f56ded966 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -469,6 +469,9 @@ endif() if (BUILD_COMPLEX16) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") endif() +if (BUILD_BFLOAT16) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_BFLOAT16") +endif() if(NOT MSVC) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") endif() From 5f6a6092537f156d14e11bd5cd6f6b15c3f861ca Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:13:57 +0200 Subject: [PATCH 451/681] Add sbgemv --- driver/level2/CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 61367e596..3e9964ab1 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -81,6 +81,7 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false ${float_type}) endif () +# special defines for complex if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") foreach (u_source ${U_SOURCES}) @@ -197,6 +198,13 @@ foreach (float_type ${FLOAT_TYPES}) endif () endforeach () +if (BUILD_BFLOAT16) + if (USE_THREAD) + GenerateNamedObjects("sbgemv_thread.c" "" "gemv_thread_n" false "" "" false "BFLOAT16") + GenerateNamedObjects("sbgemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "BFLOAT16") + endif () +endif () + if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) if (USE_THREAD) GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE") From 2f8220d757e9db0d4b748232cbdb2582ff64f611 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:14:43 +0200 Subject: [PATCH 452/681] Add sbgemm --- driver/level3/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 077862abc..75b25d039 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -12,6 +12,12 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0) endif () + if (BUILD_BFLOAT16) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") + if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") + endif () + endif () endforeach () if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) From c35739db5ee784ba5a210441b0f30962a2f36b01 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:15:57 +0200 Subject: [PATCH 453/681] Add separate entries for BFLOAT16 functions and fix missing cblas_xerbla --- interface/CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 5346ecadd..ccb5fce3f 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -82,6 +82,7 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + GenerateNamedObjects("xerbla.c" "" "xerbla" ${CBLAS_FLAG} "" "" true) #sdsdot, dsdot if (BUILD_SINGLE OR BUILD_DOUBLE) GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true 
"SINGLE") @@ -104,6 +105,15 @@ endif () GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG}) GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG}) +if (BUILD_BFLOAT16) + GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("bf16to.c" "SINGLE_PREC" "sbf16tos" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("bf16to.c" "DOUBLE_PREC" "dbf16tod" ${CBLAS_FLAG} "" "" true "BFLOAT16") +endif () # complex-specific sources foreach (float_type ${FLOAT_TYPES}) From ddf106f769637cbfa09ee3c3dbe3bfe4cb04ef56 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 16:17:18 +0200 Subject: [PATCH 454/681] Add dedicated entries for BFLOAT16 kernels --- kernel/CMakeLists.txt | 105 ++++++++++++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 25 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index d8a230436..9ffbd944f 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -91,6 +91,15 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE") + # sbdot + if (BUILD_BFLOAT16) + GenerateNamedObjects("${KERNELDIR}/${SBDOTKERNEL}" "SBDOT" "dot_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "SINGLE" "f16tos_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "DOUBLE" "bf16tod_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "SINGLE" "stobf16_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "DOUBLE" "dtobf16_k" false "" "" false "BFLOAT16") + endif() + if ((BUILD_COMPLEX OR BUILD_DOUBLE) AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE") @@ -149,9 +158,6 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - set (float_char "SB") - endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type}) @@ -185,6 +191,10 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") endif () + if (BUILD_BFLOAT16) + GenerateNamedObjects("${KERNELDIR}/${SBGEMVNKERNEL}" "" "gemv_n" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMVTKERNEL}" "" "gemv_t" 
false "" "" false "BFLOAT16") + endif () # Makefile.L3 set(USE_TRMM false) string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) @@ -209,15 +219,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) endif() - foreach (float_type SINGLE DOUBLE BFLOAT16) + foreach (float_type SINGLE DOUBLE) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - if (NOT ${BUILD_BFLOAT16}) - continue () - else () - set (float_char "SB") - endif () - endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) endforeach() if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) @@ -253,11 +256,24 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE") endif () + if (BUILD_BFLOAT16) + if (SBGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMINCOPY}" "" "${SBGEMMINCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMITCOPY}" "" "${SBGEMMITCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMONCOPY}" "" "${SBGEMMONCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMOTCOPY}" "" "${SBGEMMOTCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16") + endif () foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - set (float_char "SB") - endif () if (${float_char}GEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) endif () @@ -568,6 +584,44 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) endif () + if (BUILD_BFLOAT16) + if (NOT DEFINED SBGEMM_SMALL_M_PERMIT) + set(SBGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_NN) + set(SBGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_NT) + set(SBGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_TN) + set(SBGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_TT) + set(SBGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_NN) + set(SBGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_NT) + set(SBGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_TN) + set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_TT) + set($SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + 
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") + endif () endif () if (NOT DEFINED ${float_char}OMATCOPY_CN) @@ -702,6 +756,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) #geadd GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) endforeach () + if (BUILD_DOUBLE AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE") @@ -840,22 +895,22 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" ${TSUFFIX} false "SINGLE") if (SGEMMINCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") endif () - if (SGEMMITCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") - endif () - if (SGEMMONCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") - endif () - if (SGEMMOTCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") + if (SGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") endif () GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") endif () - - if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) + + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE") GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE") endif () From ce036a2fc0a593a780a7ecd12933afd93e265e85 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 
21:41:53 +0200 Subject: [PATCH 455/681] Add casts --- kernel/x86_64/dasum_microk_skylakex-2.c | 8 ++++---- kernel/x86_64/sasum_microk_skylakex-2.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c index aea8c02d9..83bc078b3 100644 --- a/kernel/x86_64/dasum_microk_skylakex-2.c +++ b/kernel/x86_64/dasum_microk_skylakex-2.c @@ -58,10 +58,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { - accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); - accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); - accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2); } accum_20 = accum_20 + accum_21 + accum_22 + accum_23; diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c index c8c69d1e0..fbc91b558 100644 --- a/kernel/x86_64/sasum_microk_skylakex-2.c +++ b/kernel/x86_64/sasum_microk_skylakex-2.c @@ -53,8 +53,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { - accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); } accum_20 += accum_21; From dd09f0173e90f98ec382ef5ce1ddf4d1eb7c67e8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Sep 2021 21:52:26 +0200 Subject: [PATCH 456/681] Remove extraneous qualifiers from struct definition --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 48067923e..0185fa683 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2695,7 +2695,7 @@ static volatile struct { } memory[NUM_BUFFERS]; -static volatile struct newmemstruct +struct newmemstruct { BLASULONG lock; void *addr; From b751edf6248e1897d1966d4693b2be980b89f518 Mon Sep 17 00:00:00 2001 From: Rafael Cardoso Fernandes Sousa Date: Wed, 15 Sep 2021 13:36:07 -0500 Subject: [PATCH 457/681] Fix unused variable warnings on Power --- kernel/power/drot.c | 4 +--- kernel/power/idamax.c | 2 +- kernel/power/trsm_kernel_LN_power10.c | 1 - kernel/power/trsm_kernel_LT_power10.c | 1 - kernel/power/zgemv_n_4.c | 1 - kernel/power/zgemv_n_power10.c | 1 - 6 files changed, 2 insertions(+), 8 deletions(-) diff --git a/kernel/power/drot.c b/kernel/power/drot.c index 3229878e4..30c7411cc 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -110,8 +110,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT { BLASLONG i=0; BLASLONG ix=0,iy=0; - FLOAT *x1=x; - FLOAT *y1=y; FLOAT temp; if ( n <= 0 ) return(0); @@ -139,7 +137,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG 
inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG n1 = n & -16; if ( n1 > 0 ) { - drot_kernel_16(n1, x1, y1, c, s); + drot_kernel_16(n1, x, y, c, s); i=n1; } #endif diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 5016f67dd..f1ef00066 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -330,10 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (inc_x == 1) { - BLASLONG n1 = n & -32; #if defined(_CALL_ELF) && (_CALL_ELF == 2) #if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -32; if (n1 > 0) { max = diamax_kernel_32(n1, x, &maxf); diff --git a/kernel/power/trsm_kernel_LN_power10.c b/kernel/power/trsm_kernel_LN_power10.c index 5ca1603a6..246c3a236 100644 --- a/kernel/power/trsm_kernel_LN_power10.c +++ b/kernel/power/trsm_kernel_LN_power10.c @@ -389,7 +389,6 @@ static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, vector FLOAT *Vc6 = (vector FLOAT *) c6; vector FLOAT *Vc7 = (vector FLOAT *) c7; vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; - int j; b[120] = (c0[15] *= a[255]); b[121] = (c1[15] *= a[255]); diff --git a/kernel/power/trsm_kernel_LT_power10.c b/kernel/power/trsm_kernel_LT_power10.c index 14ff12fe4..51f3a4e61 100644 --- a/kernel/power/trsm_kernel_LT_power10.c +++ b/kernel/power/trsm_kernel_LT_power10.c @@ -390,7 +390,6 @@ static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, vector FLOAT *Vc6 = (vector FLOAT *) c6; vector FLOAT *Vc7 = (vector FLOAT *) c7; vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; - int j; b[0] = (c0[0] *= a[0]); b[1] = (c1[0] *= a[0]); diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c index 1f7199c89..366c21681 100644 --- a/kernel/power/zgemv_n_4.c +++ b/kernel/power/zgemv_n_4.c @@ -607,7 +607,6 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { BLASLONG i; - BLASLONG j; FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; diff --git a/kernel/power/zgemv_n_power10.c b/kernel/power/zgemv_n_power10.c index f5bb8d70e..a545b00d8 100644 --- a/kernel/power/zgemv_n_power10.c +++ b/kernel/power/zgemv_n_power10.c @@ -738,7 +738,6 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { BLASLONG i; - BLASLONG j; FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; From 99aa10b3ff8870f4718fc842ce80871247cb93af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Sep 2021 22:10:43 +0200 Subject: [PATCH 458/681] Initialize abs_mask1 with itself to silence a gcc warning actual initialization is via the _mm_cmpeq_epi8, which I've seen claimed to be the fastest way to set an xmm register to all 1s --- kernel/x86_64/casum_microk_skylakex-2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c index d51929f9f..b398aa6e1 100644 --- a/kernel/x86_64/casum_microk_skylakex-2.c +++ b/kernel/x86_64/casum_microk_skylakex-2.c @@ -15,7 +15,7 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x) if (n2 < 64) { __m128 accum_10, accum_11, accum_12, accum_13; - __m128 abs_mask1; + __m128 abs_mask1 = abs_mask1; accum_10 = _mm_setzero_ps(); accum_11 =
_mm_setzero_ps(); From 8dfa61a61c0b6d1f9a742e3dc2ae455bb3703cc8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Sep 2021 22:11:35 +0200 Subject: [PATCH 459/681] Initialize abs_mask1 with itself to silence a gcc warning --- kernel/x86_64/zasum_microk_skylakex-2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c index b44c53801..e257a5456 100644 --- a/kernel/x86_64/zasum_microk_skylakex-2.c +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -16,7 +16,7 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) if (n2 < 32) { __m128d accum_10, accum_11, accum_12, accum_13; - __m128d abs_mask1; + __m128d abs_mask1 = abs_mask1; accum_10 = _mm_setzero_pd(); accum_11 = _mm_setzero_pd(); From 0e8b4adf22981f3bd8f80e7e1f9e58edec54a598 Mon Sep 17 00:00:00 2001 From: Rafael Cardoso Fernandes Sousa Date: Wed, 15 Sep 2021 22:18:48 +0000 Subject: [PATCH 460/681] Remove unused commented code (#if directive) --- driver/others/dynamic_power.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index d9c15b312..2847ea9ae 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -6,10 +6,6 @@ extern gotoblas_t gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) extern gotoblas_t gotoblas_POWER9; #endif -//#if (!defined __GNUC__) || ( __GNUC__ >= 11) \ -// || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) -//#define HAVE_P10_SUPPORT 1 -//#endif #ifdef HAVE_P10_SUPPORT extern gotoblas_t gotoblas_POWER10; #endif From 7d4a2215799772a4d81a3d3e3b8d7faa515c68b1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:18:25 +0200 Subject: [PATCH 461/681] Remove unused TEMP2 and reshuffle to leave x18 unused (reserved on OSX) --- kernel/arm64/dgemm_tcopy_8.S | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/dgemm_tcopy_8.S b/kernel/arm64/dgemm_tcopy_8.S index 9ab51ff57..7e5bf6080 100644 --- a/kernel/arm64/dgemm_tcopy_8.S +++ b/kernel/arm64/dgemm_tcopy_8.S @@ -50,11 +50,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define B03 x16 #define B04 x17 -#define I x18 -#define J x19 +#define I x19 +#define J x20 -#define TEMP1 x20 -#define TEMP2 x21 +#define TEMP1 x21 #define A_PREFETCH 2560 #define B_PREFETCH 256 From 0a4ac4b5850b5dee9f285637f06a4594f2e10dc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:19:51 +0200 Subject: [PATCH 462/681] Use x21 for I to leave x18 unused (reserved on OSX) --- kernel/arm64/sgemm_tcopy_16.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S index 46198b3a2..431f1ae2a 100644 --- a/kernel/arm64/sgemm_tcopy_16.S +++ b/kernel/arm64/sgemm_tcopy_16.S @@ -30,7 +30,7 @@ All rights reserved. 
#define B00 x22 -#define I x18 +#define I x21 #define J x19 #define TEMP1 x20 From 7d751774465637c25ef45d8c0f2a2361553e3df4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:24:11 +0200 Subject: [PATCH 463/681] Move temp to x21 to leave x18 unused (reserved on OSX) --- kernel/arm64/dtrmm_kernel_8x4.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S index 0ac5a5f24..3d953266c 100644 --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alpha x17 -#define temp x18 +//#define temp x18 #define tempOffset x19 #define tempK x20 +#define temp x21 #define alpha0 d10 #define alphaV0 v10.d[0] From 380940271b7647cc82000b4f34d681a3259d222f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:28:19 +0200 Subject: [PATCH 464/681] Move temp to x21 to leave x18 unused (reserved on OSX) --- kernel/arm64/strmm_kernel_16x4.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S index 985a0a9a6..a44326aeb 100644 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alpha w17 -#define temp x18 +//#define temp x18 #define tempOffset x19 #define tempK x20 +#define temp x21 #define alpha0 s10 #define alphaV0 v10.s[0] From 590fbff06e818c3135a0b80cfae5a471da7f4e09 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:42:17 +0200 Subject: [PATCH 465/681] move alpha to x19/x20 to leave x18 unused for OSX --- kernel/arm64/zgemm_kernel_4x4.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S index f8e877f3c..a65c4f581 100644 --- a/kernel/arm64/zgemm_kernel_4x4.S +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow2 x14 #define pCRow3 x15 #define pA x16 -#define alphaR x17 -#define alphaI x18 +#define alphaR x19 +#define alphaI x20 #define alpha0_R d10 #define alphaV0_R v10.d[0] From 90cc944625ce0405145bdde03af0bf4e19e3f1ce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Sep 2021 09:53:18 +0200 Subject: [PATCH 466/681] Move alphaI to x22 to leave x18 unused (reserved on OSX) --- kernel/arm64/ztrmm_kernel_4x4.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 462acfe2b..cd053b896 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define pCRow3 x15 #define pA x16 #define alphaR x17 -#define alphaI x18 +#define alphaI x22 #define temp x19 #define tempOffset x20 #define tempK x21 From 5c537a5de07909f66c64cd8128c4a44df6ac8ba4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Sep 2021 14:54:35 +0200 Subject: [PATCH 467/681] Update README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 88a5a5035..6ce85e08e 100644 --- a/README.md +++ b/README.md @@ -128,6 +128,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. - **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. +- **Intel Cooper Lake**: as Skylake-X with improved BFLOAT16 support. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. @@ -153,6 +154,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS - **Cortex-A53**: same as ARMV8 (different cpu specifications) +- **Cortex-A55**: same as ARMV8 (different cpu specifications) - **Cortex A57**: Optimized Level-3 and Level-2 functions - **Cortex A72**: same as A57 ( different cpu specifications) - **Cortex A73**: same as A57 (different cpu specifications) @@ -178,10 +180,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th #### RISC-V -- **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. +- **C910V**: Optimized Level-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. ```sh make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran ``` + (also known to work on C906) ### Support for multiple targets in a single library From b7bb2e36b8b8197bf4ae794b0982dde0336e17bc Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Sun, 26 Sep 2021 12:17:21 +0300 Subject: [PATCH 468/681] Makefile.system: adjust mipsel/mips64el ARCH variables When building for MIPS{64} little-endian variants, the included makefiles should be the same as for the big-endian. There are already some adjustments being done for some ARCH names. This change adds the ones for the `mipsel` and `mips64el` names, so that the Makefile.mips{64} files get included. 
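To make the mechanism concrete: Makefile.system later does `include $(TOPDIR)/Makefile.$(ARCH)`, and only the big-endian spellings of the MIPS makefiles exist. A minimal make sketch of the idea (illustration only, not the literal upstream logic; the `uname -m` default is hypothetical):

    ARCH ?= $(shell uname -m)   # e.g. "mips64el" on little-endian MIPS64
    ifeq ($(ARCH), mips64el)
    override ARCH = mips64      # reuse the big-endian makefile
    endif
    include Makefile.$(ARCH)    # would fail for the unmapped name
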
This comes as a result of: https://github.com/openwrt/packages/issues/16649 Signed-off-by: Alexandru Ardelean --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 20db80d07..150dbef50 100644 --- a/Makefile.system +++ b/Makefile.system @@ -33,6 +33,10 @@ else ifeq ($(ARCH), armv7) override ARCH=arm else ifeq ($(ARCH), aarch64) override ARCH=arm64 +else ifeq ($(ARCH), mipsel) +override ARCH=mips +else ifeq ($(ARCH), mips64el) +override ARCH=mips64 else ifeq ($(ARCH), zarch) override ARCH=zarch endif From ee5ca8a328bae3da45a15452e9772c67165fabe0 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 28 Sep 2021 18:22:15 +0800 Subject: [PATCH 469/681] x86_64: BFLOAT16: fix build warning --- kernel/x86_64/bf16_common_macros.h | 36 ++++---- kernel/x86_64/sbdot_microk_cooperlake.c | 14 +-- .../x86_64/sbgemm_block_microk_cooperlake.c | 2 +- .../sbgemv_n_microk_cooperlake_template.c | 11 ++- .../sbgemv_t_microk_cooperlake_template.c | 91 +++++++++++++------ 5 files changed, 100 insertions(+), 54 deletions(-) diff --git a/kernel/x86_64/bf16_common_macros.h b/kernel/x86_64/bf16_common_macros.h index 78db7abb2..cdb4beff6 100644 --- a/kernel/x86_64/bf16_common_macros.h +++ b/kernel/x86_64/bf16_common_macros.h @@ -56,25 +56,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n) \ - regArray##_0 = _mm256_loadu_si256(&a[(idx_m+0)*lda + idx_n]); \ - regArray##_1 = _mm256_loadu_si256(&a[(idx_m+1)*lda + idx_n]); \ - regArray##_2 = _mm256_loadu_si256(&a[(idx_m+2)*lda + idx_n]); \ - regArray##_3 = _mm256_loadu_si256(&a[(idx_m+3)*lda + idx_n]); \ - regArray##_4 = _mm256_loadu_si256(&a[(idx_m+4)*lda + idx_n]); \ - regArray##_5 = _mm256_loadu_si256(&a[(idx_m+5)*lda + idx_n]); \ - regArray##_6 = _mm256_loadu_si256(&a[(idx_m+6)*lda + idx_n]); \ - regArray##_7 = _mm256_loadu_si256(&a[(idx_m+7)*lda + idx_n]); + regArray##_0 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+0)*lda + idx_n])); \ + regArray##_1 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+1)*lda + idx_n])); \ + regArray##_2 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+2)*lda + idx_n])); \ + regArray##_3 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+3)*lda + idx_n])); \ + regArray##_4 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+4)*lda + idx_n])); \ + regArray##_5 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+5)*lda + idx_n])); \ + regArray##_6 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+6)*lda + idx_n])); \ + regArray##_7 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+7)*lda + idx_n])); #define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n) \ - regArray##_0 = _mm_loadu_si128(&a[(idx_m+0)*lda + idx_n]); \ - regArray##_1 = _mm_loadu_si128(&a[(idx_m+1)*lda + idx_n]); \ - regArray##_2 = _mm_loadu_si128(&a[(idx_m+2)*lda + idx_n]); \ - regArray##_3 = _mm_loadu_si128(&a[(idx_m+3)*lda + idx_n]); \ - regArray##_4 = _mm_loadu_si128(&a[(idx_m+4)*lda + idx_n]); \ - regArray##_5 = _mm_loadu_si128(&a[(idx_m+5)*lda + idx_n]); \ - regArray##_6 = _mm_loadu_si128(&a[(idx_m+6)*lda + idx_n]); \ - regArray##_7 = _mm_loadu_si128(&a[(idx_m+7)*lda + idx_n]); + regArray##_0 = _mm_loadu_si128((__m128i *)(&a[(idx_m+0)*lda + idx_n])); \ + regArray##_1 = _mm_loadu_si128((__m128i *)(&a[(idx_m+1)*lda + idx_n])); \ + regArray##_2 = _mm_loadu_si128((__m128i *)(&a[(idx_m+2)*lda + idx_n])); \ + regArray##_3 = _mm_loadu_si128((__m128i *)(&a[(idx_m+3)*lda + idx_n])); \ + regArray##_4 = _mm_loadu_si128((__m128i *)(&a[(idx_m+4)*lda + idx_n])); \ + regArray##_5 = 
_mm_loadu_si128((__m128i *)(&a[(idx_m+5)*lda + idx_n])); \ + regArray##_6 = _mm_loadu_si128((__m128i *)(&a[(idx_m+6)*lda + idx_n])); \ + regArray##_7 = _mm_loadu_si128((__m128i *)(&a[(idx_m+7)*lda + idx_n])); #define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \ @@ -153,11 +153,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \ - reg = _mm256_loadu_si256(x + idx_n); + reg = _mm256_loadu_si256((__m256i *)(x + idx_n)); #define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \ - reg = _mm_loadu_si128(x + idx_n); + reg = _mm_loadu_si128((__m128i *)(x + idx_n)); #define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \ diff --git a/kernel/x86_64/sbdot_microk_cooperlake.c b/kernel/x86_64/sbdot_microk_cooperlake.c index 067726cb1..2aefe46ff 100644 --- a/kernel/x86_64/sbdot_microk_cooperlake.c +++ b/kernel/x86_64/sbdot_microk_cooperlake.c @@ -79,21 +79,21 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) __m256 accum256_1 = _mm256_setzero_ps(); int tail_index_32 = n&(~31); for (int j = 0; j < tail_index_32; j += 32) { - accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[j+ 0]), (__m256bh) _mm256_loadu_si256(&y[j+ 0])); - accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256(&x[j+16]), (__m256bh) _mm256_loadu_si256(&y[j+16])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+ 0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+ 0])); + accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+16]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+16])); } accum256 = _mm256_add_ps(accum256, accum256_1); /* Processing the remaining <32 chunk with 16-elements processing */ if ((n&16) != 0) { - accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[tail_index_32]), (__m256bh) _mm256_loadu_si256(&y[tail_index_32])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[tail_index_32]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[tail_index_32])); } accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); /* Processing the remaining <16 chunk with 8-elements processing */ if ((n&8) != 0) { int tail_index_16 = n&(~15); - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16])); } /* Processing the remaining <8 chunk with masked 8-elements processing */ @@ -108,13 +108,13 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) } else if (n > 15) { /* n range from 16 to 31 */ /* Processing <32 chunk with 16-elements processing */ __m256 accum256 = _mm256_setzero_ps(); - accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[0]), (__m256bh) _mm256_loadu_si256(&y[0])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[0])); accum128 += _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); /* Processing the remaining <16 chunk with 8-elements processing */ if ((n&8) != 0) { int tail_index_16 = n&(~15); - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16])); + accum128 = _mm_dpbf16_ps(accum128, 
(__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16])); } /* Processing the remaining <8 chunk with masked 8-elements processing */ @@ -128,7 +128,7 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) } } else if (n > 7) { /* n range from 8 to 15 */ /* Processing <16 chunk with 8-elements processing */ - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[0]), (__m128bh) _mm_loadu_si128(&y[0])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[0]), (__m128bh) _mm_loadu_si128((__m128i *)&y[0])); /* Processing the remaining <8 chunk with masked 8-elements processing */ if ((n&7) != 0) { diff --git a/kernel/x86_64/sbgemm_block_microk_cooperlake.c b/kernel/x86_64/sbgemm_block_microk_cooperlake.c index 2c27221ac..b8c41f4f7 100644 --- a/kernel/x86_64/sbgemm_block_microk_cooperlake.c +++ b/kernel/x86_64/sbgemm_block_microk_cooperlake.c @@ -1246,7 +1246,7 @@ void COL_MAJOR_ITCOPY_KERNEL_Kx16(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat // K=Any number but will be processed based on 32, M<=16 void COL_MAJOR_ITCOPY_KERNEL_Kx16m(BLASLONG m, BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) { - bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * src_addr0; bfloat16 * dst_addr0, * dst_addr1; BLASLONG tag_k_32x = k & (~31); diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c index 46e6d0ff9..4711e9720 100644 --- a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c @@ -30,6 +30,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Include common macros for BF16 based operations with IA intrinsics #include "bf16_common_macros.h" +#undef STORE16_COMPLETE_RESULT +#undef STORE16_MASK_COMPLETE_RESULT +#undef STORE8_COMPLETE_RESULT +#undef STORE8_MASK_COMPLETE_RESULT +#undef STORE4_COMPLETE_RESULT +#undef STORE4_MASK_COMPLETE_RESULT + #ifndef ZERO_BETA // Beta is non-zero #ifndef ONE_BETA // BETA is not ONE @@ -103,7 +110,9 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3; @@ -202,7 +211,7 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31))); __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); - unsigned short store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + unsigned int store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); __mmask32 store_tail_mask = *((__mmask32*) &store_tail_mask_value); accum512_0 = _mm512_setzero_ps(); diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c index 51e681add..8a3a022fb 100644 --- a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c @@ -29,6 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// Include common macros for BF16 based operations with IA intrinsics #include "bf16_common_macros.h" +#undef STORE16_COMPLETE_RESULT +#undef STORE16_MASK_COMPLETE_RESULT +#undef STORE8_COMPLETE_RESULT +#undef STORE8_MASK_COMPLETE_RESULT +#undef STORE4_COMPLETE_RESULT +#undef STORE4_MASK_COMPLETE_RESULT + #ifndef ZERO_BETA // Beta is non-zero #ifndef ONE_BETA // BETA is not ONE @@ -231,7 +238,9 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif unsigned char load_mask_value = (((unsigned char)0xff) >> 6); @@ -280,7 +289,7 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, } else if (tail_num == 8) { __m256 result256 = _mm256_setzero_ps(); - __m256i matrixArray256 = _mm256_loadu_si256(&a[(tag_m_32x)*2]); // Load 8 rows with n=2 + __m256i matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*2]); // Load 8 rows with n=2 __m256i xArray256 = _mm512_castsi512_si256(xArray); result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); @@ -323,7 +332,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); @@ -395,9 +406,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, result256_0 = _mm256_setzero_ps(); result256_1 = _mm256_setzero_ps(); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element - matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element - matrixArray256_2 = _mm256_loadu_si256(&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row @@ -423,8 +434,8 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, if (tail_num > 10) { unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-10-1)*3+1))); __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element - matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element matrixArray256_2 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+10)*3 + 2)]); // Load m-tag_m_32x-10 rows matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the 
first 2 elements for each row @@ -439,7 +450,7 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, } else if (tail_num > 5) { unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-5-1)*3+2))); __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element matrixArray256_1 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+5)*3+1)]); // Load m-tag_m_32x-5 rows matrixArray256_2 = _mm256_setzero_si256(); @@ -499,7 +510,9 @@ static int sbgemv_kernel_16x4(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_1 = _mm512_set1_epi32(1); @@ -591,7 +604,9 @@ static int sbgemv_kernel_30x5(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512 result_0, result_1; @@ -782,7 +797,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_1 = _mm512_set1_epi32(1); @@ -866,9 +883,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, result256_0 = _mm256_setzero_ps(); - matrixArray_0 = _mm256_loadu_si256(&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element - matrixArray_1 = _mm256_loadu_si256(&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element - matrixArray_2 = _mm256_loadu_si256(&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element + matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element + matrixArray_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element + matrixArray_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element // Process the 0|1 elements // Select the 0|1 elements for each row @@ -957,7 +974,9 @@ static int sbgemv_kernel_16x7(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_2 = _mm512_set1_epi32(2); @@ -1110,7 +1129,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, { BLASLONG tag_m_16x = m & (~15); - __m128i x128 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| if (tag_m_16x > 0) { __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; @@ -1122,7 +1141,9 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_2 = _mm512_set1_epi32(2); @@ -1214,7 +1235,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m128 result128, tmp128; for (BLASLONG i = tag_m_16x; i < m; i++) { result128 = _mm_setzero_ps(); - matrixArray128 = 
_mm_loadu_si128(&a[(i)*8]); // Load 1 rows with n=8 + matrixArray128 = _mm_loadu_si128((__m128i *)&a[(i)*8]); // Load 1 rows with n=8 result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); tmp128 = _mm_shuffle_ps(result128, result128, 14); result128 = _mm_add_ps(result128, tmp128); @@ -1258,7 +1279,7 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, unsigned char x_load_mask_value = (((unsigned char)0xff) >> 7); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|0 |0 | 0| 0| 0| 0| 0| if (tag_m_14x > 0) { @@ -1271,7 +1292,9 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m256i M256_EPI16_2 = _mm256_set1_epi16(2); @@ -1390,7 +1413,7 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xf) >> 3); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi32(x_load_mask, (x+8)); // |x8|x9|0 | 0| 0| 0| 0| 0| if (tag_m_12x > 0) { @@ -1403,7 +1426,9 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m256i M256_EPI32_1 = _mm256_set1_epi32(1); @@ -1522,7 +1547,7 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1| x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10| 0| 0| 0| 0| 0| if (tag_m_15x > 0) { @@ -1535,7 +1560,9 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; @@ -1690,7 +1717,7 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xff) >> 4); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2| x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1| x2| x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10|x11| 0| 0| 0| 0| if (tag_m_15x > 0) { @@ -1703,7 +1730,9 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, 
idx_stage1_base_5; @@ -1873,16 +1902,15 @@ static int sbgemv_kernel_16x13(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); - unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 6); - __mmask32 load_mask = *((__mmask32*) &load_mask_value); - // Prepare X with 2-step interleave way xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); BF16_INTERLEAVE_1x32(xArray) @@ -2045,7 +2073,9 @@ static int sbgemv_kernel_16x14(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); @@ -2207,16 +2237,15 @@ static int sbgemv_kernel_16x15(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); - unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2); - __mmask32 load_mask = *((__mmask32*) &load_mask_value); - // Prepare X with 2-step interleave way xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); BF16_INTERLEAVE_1x32(xArray) @@ -2364,7 +2393,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x { BLASLONG tag_m_16x = m & (~15); - __m256i x256 = _mm256_loadu_si256(x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| + __m256i x256 = _mm256_loadu_si256((__m256i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| if (tag_m_16x > 0) { __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ @@ -2377,7 +2406,9 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); @@ -2484,7 +2515,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m128 accum128, tmp128; for (BLASLONG i = tag_m_16x; i < m; i++) { accum256 = _mm256_setzero_ps(); - matrixArray256 = _mm256_loadu_si256(&a[(i)*16]); // Load 1 rows with n=16 + matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(i)*16]); // Load 1 rows with n=16 accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); @@ -2535,7 +2566,9 @@ static int sbgemv_kernel_8x16p_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ @@ -2647,8 +2680,6 @@ static int 
sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b BLASLONG tag_n_32x = n & (~31); BLASLONG tag_n_128x = n & (~127); - __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ - accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; __m512 accum512_bridge[8]; __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3; __m256 accum256_0; @@ -2658,7 +2689,9 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; @@ -2825,7 +2858,9 @@ static int sbgemv_kernel_8x32_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; @@ -2961,7 +2996,9 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m512 ALPHAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(alpha)); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(beta)); +#endif #endif __m256 accum256_0, accum256_1, accum256_2, accum256_3, accum256_4, accum256_5, accum256_6, accum256_7, \ @@ -3012,7 +3049,7 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m128 accum128, tmp128; for (BLASLONG i = tag_m_8x; i < m; i++) { accum256_0 = _mm256_setzero_ps(); - matrixArray_0 = _mm256_loadu_si256(&a[(i)*lda]); // Load 1 rows with n=16 + matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(i)*lda]); // Load 1 rows with n=16 accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256); accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); From 2d33e12a119f0cf97e5c41ff4f6499e9229d9bd5 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Thu, 30 Sep 2021 03:14:15 -0400 Subject: [PATCH 470/681] Make sure that Netlib LAPACK respects FFLAGS OpenBLAS allows users to specify `FFLAGS` and then uses `override` to append additional options. However, without such an override in lapack's make.inc, lapack would use the external FFLAGS, rather than the ones being computed by OpenBLAS. For example the `DEBUG=1` flag would not apply to LAPACK code. This is all a bit messy but forced by the integration with netlib lapack. Note that `CFLAGS` already has this override for the same reason. It is possible that other variables here should have a similar override, but I think for most of the other ones, OpenBLAS's build system does not append to the flags passed in by the user. 
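To spell out the GNU make behavior relied on here: a plain `FFLAGS = ...` assignment in an included make.inc is silently discarded when the user runs `make FFLAGS=...`, because command-line variable definitions take precedence over ordinary makefile assignments, while the `override` directive reverses that precedence. A toy sketch (hypothetical flag values, not the generated make.inc):

    # invoked as: make FFLAGS=-O0
    # FFLAGS = -O2 -frecursive          # plain assignment: the command-line -O0 would win
    override FFLAGS = -O2 -frecursive   # override: the computed flags survive
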
--- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 555d1c467..49fd57ff2 100644 --- a/Makefile +++ b/Makefile @@ -269,7 +269,7 @@ prof_lapack : lapack_prebuild lapack_prebuild : ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc - -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc From fe3c778c51de8ca541b3ad158e783d6826128312 Mon Sep 17 00:00:00 2001 From: kavanabhat Date: Thu, 30 Sep 2021 06:06:27 -0500 Subject: [PATCH 471/681] AIX changes for P10 with GNU Compiler --- kernel/power/KERNEL.POWER10 | 5 +- kernel/power/caxpy_microk_power10.c | 29 ++- kernel/power/ccopy_microk_power10.c | 45 +++- kernel/power/cdot.c | 4 +- kernel/power/cdot_microk_power10.c | 8 + kernel/power/cgemm_kernel_power10.S | 36 ++- kernel/power/cgemm_macros_power10.S | 306 ++++++++++++++++++++++++++ kernel/power/cscal_microk_power10.c | 4 + kernel/power/cswap.c | 4 +- kernel/power/dasum.c | 7 +- kernel/power/dgemv_n_microk_power10.c | 86 +++++++- kernel/power/dgemv_t_power10.c | 36 ++- kernel/power/drot.c | 6 +- kernel/power/dscal.c | 8 +- kernel/power/dswap.c | 6 +- kernel/power/sasum.c | 6 +- kernel/power/srot.c | 6 +- kernel/power/sscal.c | 8 +- kernel/power/sswap.c | 6 +- kernel/power/zaxpy_microk_power10.c | 8 + kernel/power/zgemm_kernel_power10.S | 4 +- kernel/power/zgemm_macros_power10.S | 301 +++++++++++++++++++++---- kernel/power/zgemv_t_4.c | 2 +- kernel/power/zscal.c | 6 +- kernel/power/zscal_microk_power10.c | 37 +++- kernel/power/zswap.c | 4 +- param.h | 5 - 27 files changed, 852 insertions(+), 131 deletions(-) diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 873653f1e..50866c974 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -1,6 +1,3 @@ -ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) -include $(KERNELDIR)/KERNEL.POWER8 -else #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c @@ -44,6 +41,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_power10.S +#CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c @@ -219,4 +217,3 @@ QCABS_KERNEL = ../generic/cabs.c CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c -endif diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c index 56a5ab47a..902eba82c 100644 --- a/kernel/power/caxpy_microk_power10.c +++ b/kernel/power/caxpy_microk_power10.c @@ -36,9 +36,12 @@ static void caxpy_kernel_8 (long n, float *x, float *y, #endif const float *mvecp = mvec; /* We have to load reverse mask for big endian. 
*/ - /* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */ - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; +#else __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; +#endif + long ytmp; __asm__ @@ -112,6 +115,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%4) \n\t" + "stxv 49, 16(%4) \n\t" + "stxv 50, 32(%4) \n\t" + "stxv 51, 48(%4) \n\t" + "stxv 34, 64(%4) \n\t" + "stxv 35, 80(%4) \n\t" + "stxv 38, 96(%4) \n\t" + "stxv 39, 112(%4) \n\t" +#else "stxv 49, 0(%4) \n\t" "stxv 48, 16(%4) \n\t" "stxv 51, 32(%4) \n\t" @@ -120,6 +133,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "stxv 34, 80(%4) \n\t" "stxv 39, 96(%4) \n\t" "stxv 38, 112(%4) \n\t" +#endif "addi %4, %4, 128 \n\t" "xxperm 52, 40, %x10 \n\t" // exchange real and imag part @@ -163,6 +177,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%4) \n\t" + "stxv 49, 16(%4) \n\t" + "stxv 50, 32(%4) \n\t" + "stxv 51, 48(%4) \n\t" + "stxv 34, 64(%4) \n\t" + "stxv 35, 80(%4) \n\t" + "stxv 38, 96(%4) \n\t" + "stxv 39, 112(%4) \n\t" +#else "stxv 49, 0(%4) \n\t" "stxv 48, 16(%4) \n\t" "stxv 51, 32(%4) \n\t" @@ -171,6 +195,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "stxv 34, 80(%4) \n\t" "stxv 39, 96(%4) \n\t" "stxv 38, 112(%4) \n\t" +#endif "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" : diff --git a/kernel/power/ccopy_microk_power10.c b/kernel/power/ccopy_microk_power10.c index 6c80f9cd4..f30e1fa09 100644 --- a/kernel/power/ccopy_microk_power10.c +++ b/kernel/power/ccopy_microk_power10.c @@ -46,7 +46,16 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) ".align 5 \n" "one%=: \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 32, 0(%3) \n\t" + "stxv 33, 16(%3) \n\t" + "stxv 34, 32(%3) \n\t" + "stxv 35, 48(%3) \n\t" + "stxv 36, 64(%3) \n\t" + "stxv 37, 80(%3) \n\t" + "stxv 38, 96(%3) \n\t" + "stxv 39, 112(%3) \n\t" +#else "stxv 33, 0(%3) \n\t" "stxv 32, 16(%3) \n\t" "stxv 35, 32(%3) \n\t" @@ -55,11 +64,21 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "stxv 36, 80(%3) \n\t" "stxv 39, 96(%3) \n\t" "stxv 38, 112(%3) \n\t" +#endif "lxvp 32, 0(%2) \n\t" "lxvp 34, 32(%2) \n\t" "lxvp 36, 64(%2) \n\t" "lxvp 38, 96(%2) \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 40, 128(%3) \n\t" + "stxv 41, 144(%3) \n\t" + "stxv 42, 160(%3) \n\t" + "stxv 43, 176(%3) \n\t" + "stxv 44, 192(%3) \n\t" + "stxv 45, 208(%3) \n\t" + "stxv 46, 224(%3) \n\t" + "stxv 47, 240(%3) \n\t" +#else "stxv 41, 128(%3) \n\t" "stxv 40, 144(%3) \n\t" "stxv 43, 160(%3) \n\t" @@ -68,6 +87,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "stxv 44, 208(%3) \n\t" "stxv 47, 224(%3) \n\t" "stxv 46, 240(%3) \n\t" +#endif "lxvp 40, 128(%2) \n\t" "lxvp 42, 160(%2) \n\t" "lxvp 44, 192(%2) \n\t" @@ -81,7 +101,24 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "bgt one%= \n" "two%=: \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 32, 0(%3) \n\t" + "stxv 33, 16(%3) \n\t" + "stxv 34, 32(%3) \n\t" + "stxv 35, 48(%3) \n\t" + "stxv 36, 64(%3) \n\t" + "stxv 37, 80(%3) \n\t" + "stxv 38, 96(%3) \n\t" + "stxv 39, 112(%3) \n\t" + "stxv 40, 128(%3) \n\t" + "stxv 41, 144(%3) \n\t" + "stxv 42, 160(%3) \n\t" + "stxv 43, 
176(%3) \n\t" + "stxv 44, 192(%3) \n\t" + "stxv 45, 208(%3) \n\t" + "stxv 46, 224(%3) \n\t" + "stxv 47, 240(%3) \n\t" +#else "stxv 33, 0(%3) \n\t" "stxv 32, 16(%3) \n\t" "stxv 35, 32(%3) \n\t" @@ -98,7 +135,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "stxv 44, 208(%3) \n\t" "stxv 47, 224(%3) \n\t" "stxv 46, 240(%3) \n\t" - +#endif "#n=%1 x=%4=%2 y=%0=%3" : "=m" (*y), diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c index b9e2d2ce5..c53fe0c02 100644 --- a/kernel/power/cdot.c +++ b/kernel/power/cdot.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #include "common.h" -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) #include "cdot_microk_power10.c" #else #ifndef HAVE_KERNEL_8 @@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((inc_x == 1) && (inc_y == 1)) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) BLASLONG n1 = n & -16; #else BLASLONG n1 = n & -8; diff --git a/kernel/power/cdot_microk_power10.c b/kernel/power/cdot_microk_power10.c index 399f2b180..9d42559c9 100644 --- a/kernel/power/cdot_microk_power10.c +++ b/kernel/power/cdot_microk_power10.c @@ -29,7 +29,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void cdot_kernel_8 (long n, float *x, float *y, float *dot) { +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + __vector unsigned char mask = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; +#else __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; +#endif __asm__ ( "dcbt 0, %2 \n\t" @@ -153,7 +157,11 @@ static void cdot_kernel_8 (long n, float *x, float *y, float *dot) "xxswapd 33, 34 \n\t" "xvaddsp 35, 35, 32 \n\t" "xvaddsp 34, 34, 33 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xxpermdi 34, 35, 34, 0 \n\t" +#else "xxpermdi 34, 34, 35, 2 \n\t" +#endif "stxv 34, 0(%6) \n\t" "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6" diff --git a/kernel/power/cgemm_kernel_power10.S b/kernel/power/cgemm_kernel_power10.S index e04f948dd..fbd22aaad 100644 --- a/kernel/power/cgemm_kernel_power10.S +++ b/kernel/power/cgemm_kernel_power10.S @@ -76,11 +76,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cgemm_macros_power10.S" +#if (_AIX) +.set perm_const1, 0x0405060700010203 +.set perm_const2, 0x0c0d0e0f08090a0b +.set save_permute_12, 0x1011121300010203 +.set save_permute_11, 0x18191a1b08090a0b +#else .equ perm_const1, 0x0405060700010203 .equ perm_const2, 0x0c0d0e0f08090a0b .equ save_permute_12, 0x0c0d0e0f1c1d1e1f .equ save_permute_11, 0x0405060714151617 - +#endif #ifndef NEEDPARAM @@ -172,24 +178,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*load reverse permute mask for big endian uint128 = 0xc0d0e0f08090a0b0405060700010203 */ - +#if (_AIX) + lis T2, (perm_const2>>48 & 0xFFFF) + lis T1, (perm_const1>>48 & 0xFFFF) + lis T3, (save_permute_12>>48 & 0xFFFF) + lis T4, (save_permute_11>>48 & 0xFFFF) + + ori T2, T2, (perm_const2>>32 & 0xFFFF) + ori T1, T1, (perm_const1>>32 & 0xFFFF) + ori T3, T3, (save_permute_12>>32 & 0xFFFF) + ori T4, T4, (save_permute_11>>32 & 0xFFFF) +#else lis T2, perm_const2@highest lis T1, perm_const1@highest lis T3, save_permute_12@highest lis T4, save_permute_11@highest - ori T2, T2, perm_const2@higher ori T1, T1, perm_const1@higher ori T3, T3, save_permute_12@higher ori T4, T4, save_permute_11@higher - +#endif rldicr T2, T2, 32, 31 rldicr T1, T1, 32, 31 rldicr T3, T3, 32, 31 rldicr T4, T4, 32, 31 +#if (_AIX) + oris T2, T2, (perm_const2>>16 & 0xFFFF) + oris T1, T1, (perm_const1>>16 & 0xFFFF) + oris T3, T3, (save_permute_12>>16 & 0xFFFF) + oris T4, T4, (save_permute_11>>16 & 0xFFFF) + + ori T2, T2, (perm_const2 & 0xFFFF) + ori T1, T1, (perm_const1 & 0xFFFF) + ori T3, T3, (save_permute_12 & 0xFFFF) + ori T4, T4, (save_permute_11 & 0xFFFF) +#else oris T2, T2, perm_const2@h oris T1, T1, perm_const1@h oris T3, T3, save_permute_12@h @@ -200,7 +226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ori T1, T1, perm_const1@l ori T3, T3, save_permute_12@l ori T4, T4, save_permute_11@l - +#endif li r0,0 li PRE,512 diff --git a/kernel/power/cgemm_macros_power10.S b/kernel/power/cgemm_macros_power10.S index b66e93405..f75bf5dad 100644 --- a/kernel/power/cgemm_macros_power10.S +++ b/kernel/power/cgemm_macros_power10.S @@ -218,6 +218,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 36, 34 + xvf32gerpp 2, 37, 34 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 7, 36, 35 + xvf32gerpp 6, 37, 35 + xvf32gerpp 5, 32, 35 + xvf32gerpp 4, 33, 35 +#else xvf32gerpp 3, 36, 35 xvf32gerpp 2, 37, 35 xvf32gerpp 1, 32, 35 @@ -226,6 +236,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 6, 37, 34 xvf32gerpp 5, 32, 34 xvf32gerpp 4, 33, 34 +#endif .endm .macro LOAD4x8_2 @@ -255,6 +266,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xvf32gerpp 3, 36, 34 + xvf32gerpp 2, 37, 34 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 7, 36, 35 + xvf32gerpp 6, 37, 35 + xvf32gerpp 5, 32, 35 + xvf32gerpp 4, 33, 35 +#else xvf32gerpp 3, 36, 35 xvf32gerpp 2, 37, 35 xvf32gerpp 1, 32, 35 @@ -263,11 +284,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 6, 37, 34 xvf32gerpp 5, 32, 34 xvf32gerpp 4, 33, 34 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) .endif +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xvf32gerpp 3, 42, 38 + xvf32gerpp 2, 43, 38 + xvf32gerpp 1, 40, 38 + xvf32gerpp 0, 41, 38 + xvf32gerpp 7, 42, 39 + xvf32gerpp 6, 43, 39 + xvf32gerpp 5, 40, 39 + xvf32gerpp 4, 41, 39 +#else xvf32gerpp 3, 42, 39 xvf32gerpp 2, 43, 39 xvf32gerpp 1, 40, 39 @@ -276,6 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvf32gerpp 6, 43, 38 xvf32gerpp 5, 40, 38 xvf32gerpp 4, 41, 38 +#endif .if \Complete==0 lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG) lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) @@ -393,22 +426,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs26, vs26, vs7 xvaddsp vs27, vs27, vs5 xvaddsp vs28, vs28, vs11 xvaddsp vs29, vs29, vs9 xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 +#else +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs4, vs12, 1 + xxpermdi vs26, vs6, vs14, 1 + xxpermdi vs29, vs8, vs0, 1 + xxpermdi vs28, vs10, vs2, 1 + xxpermdi vs31, vs12, vs4, 1 + xxpermdi vs30, vs14, vs6, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 @@ -418,6 +475,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs28, vs2, vs10, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 +#endif #endif stxvp vs24, 0(CO) MULT_APLHA_PART1 vs48, vs56, vs0, vs1 @@ -443,22 +501,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs32, vs32, vs3 xvaddsp vs33, vs33, vs1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs40, vs40, vs7 xvaddsp vs41, vs41, vs5 xvaddsp vs34, vs34, vs11 xvaddsp vs35, vs35, vs9 xvaddsp vs42, vs42, vs15 xvaddsp vs43, vs43, vs13 +#else +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xxpermdi vs33, vs0, vs8, 1 + xxpermdi vs32, vs2, vs10, 1 + xxpermdi vs41, vs4, vs12, 1 + xxpermdi vs40, vs6, vs14, 1 + xxpermdi vs35, vs8, vs0, 1 + xxpermdi vs34, vs10, vs2, 1 + xxpermdi vs43, vs12, vs4, 1 + xxpermdi vs42, vs14, vs6, 1 #else xxpermdi vs33, vs8, vs0, 2 xxpermdi vs32, vs10, vs2, 2 @@ -468,6 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs34, vs2, vs10, 2 xxpermdi vs43, vs4, vs12, 2 xxpermdi vs42, vs6, vs14, 2 +#endif #endif stxvp vs32, 0(T2) stxvp vs40, 32(T2) @@ -510,10 +593,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 32, 35 + xvf32gerpp 2, 33, 35 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 +#else xvf32gerpp 3, 32, 34 xvf32gerpp 2, 33, 34 xvf32gerpp 1, 32, 35 xvf32gerpp 0, 33, 35 +#endif .endm .macro LOAD4x4_2 @@ -541,18 +631,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 32, 35 + xvf32gerpp 2, 33, 35 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 +#else xvf32gerpp 3, 32, 34 xvf32gerpp 2, 33, 34 xvf32gerpp 1, 32, 35 xvf32gerpp 0, 33, 35 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 36, 39 + xvf32gerpp 2, 37, 39 + xvf32gerpp 1, 36, 38 + xvf32gerpp 0, 37, 38 +#else xvf32gerpp 3, 36, 38 xvf32gerpp 2, 37, 38 xvf32gerpp 1, 36, 39 xvf32gerpp 0, 37, 39 +#endif .if \Complete==0 lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) @@ -606,6 +710,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs9, vs0, vs8, 2 @@ -614,6 +728,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs7, vs14, vs6, 2 xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 xvaddsp vs26, vs26, vs11 @@ -622,6 +737,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvaddsp vs29, vs29, vs5 xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs8, vs0, 1 + xxpermdi vs26, vs10, vs2, 1 + xxpermdi vs29, vs4, vs12, 1 + xxpermdi vs28, vs6, vs14, 1 + xxpermdi vs31, vs12, vs4, 1 + xxpermdi vs30, vs14, vs6, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 @@ -631,6 +756,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs28, vs14, vs6, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 +#endif #endif stxvp vs24, 0(CO) stxvp vs26, 0(T1) @@ -672,8 +798,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 1, 35, 32 + xvf32gerpp 0, 34, 32 +#else xvf32gerpp 1, 34, 32 xvf32gerpp 0, 35, 32 +#endif .endm .macro LOAD4x2_2 @@ -700,13 +831,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
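/* Assumed MMA register mapping for the SAVE paths in these macros: each
   accumulator "acc N" written by xvf32gerpp overlays the four VSRs
   vs(4N)..vs(4N+3), so once the accumulators are moved out the per-row
   results appear in vs0..vs31, and only their doubleword order - not the
   values - differs between the two endiannesses. */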
.endm .macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 1, 35, 32 + xvf32gerpp 0, 34, 32 +#else xvf32gerpp 1, 34, 33 xvf32gerpp 0, 35, 33 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 1, 37, 33 + xvf32gerpp 0, 36, 33 +#else xvf32gerpp 1, 36, 32 xvf32gerpp 0, 37, 32 +#endif .if \Complete==0 lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) @@ -757,19 +898,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR1 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 0 + xxpermdi vs9, vs2, vs10, 0 + xxpermdi vs3, vs8, vs0, 3 + xxpermdi vs11, vs10, vs2, 3 +#else xxpermdi vs1, vs8, vs0, 0 xxpermdi vs9, vs10, vs2, 0 xxpermdi vs3, vs0, vs8, 3 xxpermdi vs11, vs2, vs10, 3 +#endif xvaddsp vs24, vs24, vs1 xvaddsp vs26, vs26, vs9 xvaddsp vs25, vs25, vs3 xvaddsp vs27, vs27, vs11 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs24, vs0, vs8, 0 + xxpermdi vs26, vs2, vs10, 0 + xxpermdi vs25, vs8, vs0, 3 + xxpermdi vs27, vs10, vs2, 3 #else xxpermdi vs24, vs8, vs0, 0 xxpermdi vs26, vs10, vs2, 0 xxpermdi vs25, vs0, vs8, 3 xxpermdi vs27, vs2, vs10, 3 +#endif #endif stxv vs24, 0(CO) stxv vs25, 0(T1) @@ -811,8 +966,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 34, 32 + xvf32gerpp 1, 35, 32 +#else xvf32gerpp 0, 35, 32 xvf32gerpp 1, 34, 32 +#endif .endm .macro LOAD4x1_2 @@ -822,8 +982,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD4x1_2O OffsetA, OffsetB lxv vs32, (\OffsetA)(AO) vspltisb v6, 0 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs33, vs32, vs38, 2 + xxpermdi vs32, vs32, vs38, 0 +#else xxpermdi vs33, vs32, vs38, 0 xxpermdi vs32, vs32, vs38, 2 +#endif lxvp vs34, (0+\OffsetB)(BO) lxvp vs36, (32+\OffsetB)(BO) .endm @@ -842,18 +1007,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 34, 32 + xvf32gerpp 1, 35, 32 +#else xvf32gerpp 0, 35, 32 xvf32gerpp 1, 34, 32 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 36, 33 + xvf32gerpp 1, 37, 33 +#else xvf32gerpp 0, 37, 33 xvf32gerpp 1, 36, 33 +#endif .if \Complete==0 lxv vs32, DISP2(\Index, \OffsetA)(\AREG) lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs33, vs32, vs38, 2 + xxpermdi vs32, vs32, vs38, 0 +#else xxpermdi vs33, vs32, vs38, 0 xxpermdi vs32, vs32, vs38, 2 +#endif .endif .if \IsLast==1 .if \Complete==1 @@ -1001,19 +1181,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
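/* In the LOAD4x1 paths above, v6 (which overlays vs38) is zeroed with
   vspltisb, and xxpermdi packs the single complex element loaded into
   vs32 together with that zero half; the BE branch swaps the 0/2
   immediates because the element occupies the opposite doubleword there. */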
.endm .macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 2, 37, 34 + xvf32gerpp 3, 36, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +#else xvf32gerpp 2, 37, 35 xvf32gerpp 3, 36, 35 xvf32gerpp 0, 33, 35 xvf32gerpp 1, 32, 35 +#endif .if \Complete==0 lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 2, 41, 35 + xvf32gerpp 3, 40, 35 + xvf32gerpp 0, 39, 35 + xvf32gerpp 1, 38, 35 +#else xvf32gerpp 2, 41, 34 xvf32gerpp 3, 40, 34 xvf32gerpp 0, 39, 34 xvf32gerpp 1, 38, 34 +#endif .if \Complete==0 lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) @@ -1068,16 +1262,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs26, vs26, vs7 xvaddsp vs27, vs27, vs5 xvaddsp vs28, vs28, vs11 @@ -1085,6 +1293,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 #else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs4, vs12, 1 + xxpermdi vs26, vs6, vs14, 1 + xxpermdi vs29, vs8, vs0, 1 + xxpermdi vs28, vs10, vs2, 1 + xxpermdi vs31, vs12, vs4, 1 + xxpermdi vs30, vs14, vs6, 1 +#else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 xxpermdi vs27, vs12, vs4, 2 @@ -1093,6 +1311,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs28, vs2, vs10, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 +#endif #endif stxvp vs24, 0(CO) stxvp vs26, 32(CO) @@ -1161,13 +1380,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +#else xvf32gerpp 0, 33, 35 xvf32gerpp 1, 32, 35 +#endif .if \Complete==0 lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 37, 35 + xvf32gerpp 1, 36, 35 +#else xvf32gerpp 0, 37, 34 xvf32gerpp 1, 36, 34 +#endif + .if \Complete==0 lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) @@ -1206,19 +1436,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
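/* The #ifndef TRMMKERNEL split that follows: the plain GEMM build adds the
   permuted results into the existing C tile (xvaddsp into vs24..), while
   the TRMM kernel overwrites C, so its branch permutes straight into the
   store registers. The BE variants of both branches only reorder sources
   and immediates, exactly as in the blocks above. */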
RECONSTRUCT_PAIR1 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 xvaddsp vs26, vs26, vs11 xvaddsp vs27, vs27, vs9 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs8, vs0, 1 + xxpermdi vs26, vs10, vs2, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 xxpermdi vs27, vs0, vs8, 2 xxpermdi vs26, vs2, vs10, 2 +#endif #endif stxvp vs24, 0(CO) stxvp vs26, 0(T1) @@ -1330,13 +1574,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxperm vs8, vs9, save_permute_1 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 0 + xxpermdi vs9, vs8, vs0, 3 +#else xxpermdi vs1, vs8, vs0, 0 xxpermdi vs9, vs0, vs8, 3 +#endif xvaddsp vs24, vs24, vs1 xvaddsp vs26, vs26, vs9 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs24, vs0, vs8, 0 + xxpermdi vs26, vs8, vs0, 3 #else xxpermdi vs24, vs8, vs0, 0 xxpermdi vs26, vs0, vs8, 3 +#endif #endif stxv vs24, 0(CO) stxv vs26, 0(T1) @@ -1528,8 +1782,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs32, (0+\OffsetA)(AO) lxvp vs36, (32+\OffsetA)(AO) vspltisb v10, 0 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs42, 2 + xxpermdi vs34, vs34, vs42, 0 +#else xxpermdi vs35, vs34, vs42, 0 xxpermdi vs34, vs34, vs42, 2 +#endif lxvp vs38, (64+\OffsetA)(AO) lxvp vs40, (64+32+\OffsetA)(AO) .endm @@ -1567,8 +1826,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 3, 35, 40 .if \Complete==0 lxv vs34, DISP2(\Index, \OffsetB)(\BREG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs42, 2 + xxpermdi vs34, vs34, vs42, 0 +#else xxpermdi vs35, vs34, vs42, 0 xxpermdi vs34, vs34, vs42, 2 +#endif lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) .endif .if \IsLast==1 @@ -1634,10 +1898,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART2 vs34, vs42, vs4, vs5 MULT_APLHA_PART2 vs35, vs43, vs6, vs7 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 + xxperm vs4, vs5, save_permute_1 + xxperm vs6, vs7, save_permute_1 +#else xxperm vs0, vs1, vs28 xxperm vs2, vs3, vs28 xxperm vs4, vs5, vs28 xxperm vs6, vs7, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs2 @@ -1648,10 +1919,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvp vs26, 32(CO) #else /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxv vs2, 0(CO) + stxv vs0, 16(CO) + stxv vs6, 32(CO) + stxv vs4, 48(CO) +#else stxv vs0, 0(CO) stxv vs2, 16(CO) stxv vs4, 32(CO) stxv vs6, 48(CO) +#endif #endif addi CO, CO, 64 .endm @@ -1701,8 +1979,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
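/* save_permute_1 is taken here to be a register #define for one of the
   save_permute_* constants materialized in the prologue earlier in this
   patch (including its AIX lis/ori path); its byte pattern differs by
   endianness, which would explain why the BE branches use it directly
   where the LE code reads the mask from vs28. */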
lxv vs34, (\OffsetB)(BO) lxvp vs32, (0+\OffsetA)(AO) vspltisb v6, 0 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs38, 2 + xxpermdi vs34, vs34, vs38, 0 +#else xxpermdi vs35, vs34, vs38, 0 xxpermdi vs34, vs34, vs38, 2 +#endif lxvp vs36, (32+\OffsetA)(AO) .endm @@ -1729,8 +2012,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 1, 35, 36 .if \Complete==0 lxv vs34, DISP2(\Index, \OffsetB)(\BREG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs38, 2 + xxpermdi vs34, vs34, vs38, 0 +#else xxpermdi vs35, vs34, vs38, 0 xxpermdi vs34, vs34, vs38, 2 +#endif lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) .endif .if \IsLast==1 @@ -1775,8 +2063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART2 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs33, vs41, vs2, vs3 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 +#else xxperm vs0, vs1, vs28 xxperm vs2, vs3, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs2 @@ -1784,8 +2077,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvp vs24, 0(CO) #else /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxv vs2, 0(CO) + stxv vs0, 16(CO) +#else stxv vs0, 0(CO) stxv vs2, 16(CO) +#endif #endif addi CO, CO, 32 .endm @@ -1904,7 +2202,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART1 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs32, vs40, vs0, vs1 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs0, vs1, save_permute_1 +#else xxperm vs0, vs1, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs0 @@ -2018,7 +2320,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART1 vs32, vs40, vs37, vs1 MULT_APLHA_PART2 vs32, vs40, vs37, vs1 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs37, vs1, save_permute_1 +#else xxperm vs37, vs1, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs36, vs36, vs37 diff --git a/kernel/power/cscal_microk_power10.c b/kernel/power/cscal_microk_power10.c index 70b50809e..d6a91f079 100644 --- a/kernel/power/cscal_microk_power10.c +++ b/kernel/power/cscal_microk_power10.c @@ -30,7 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) { __vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i}; +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + __vector unsigned char mask = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; +#else __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; +#endif __asm__ ( "dcbt 0, %2 \n\t" diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index c2fde1c44..4d9b9ccd6 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
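/* With the endian fixes in this patch the POWER10 microkernels work on
   big-endian as well, so the "#elif defined(POWER10) && !BIG_ENDIAN"
   special case and its POWER8 fallback collapse into a plain POWER10
   include here and in the other kernel wrappers changed below. */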
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "cswap_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "cswap_microk_power10.c" #elif defined(POWER10) -#include "cswap_microk_power8.c" +#include "cswap_microk_power10.c" #endif #endif diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 35390dd24..9ed0af767 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -49,14 +49,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dasum_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "dasum_microk_power10.c" #elif defined(POWER10) -#include "dasum_microk_power8.c" +#include "dasum_microk_power10.c" #endif #endif - #ifndef HAVE_KERNEL_16 static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) @@ -114,7 +111,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 32) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c index e47de2cb5..65743731e 100644 --- a/kernel/power/dgemv_n_microk_power10.c +++ b/kernel/power/dgemv_n_microk_power10.c @@ -40,18 +40,27 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y XXSPLTD_S(32,%x9,0) // alpha, alpha "sldi %6, %13, 3 \n\t" // lda * sizeof (double) - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmuldp 34, 40, 32 \n\t" // x0 * alpha, x1 * alpha + "xvmuldp 35, 41, 32 \n\t" // x2 * alpha, x3 * alpha +#else "xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha "xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha +#endif "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda "add %6, %6, %6 \n\t" // 2 * lda - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha +#else XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha - +#endif "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda @@ -286,6 +295,16 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda "add %10, %10, %10 \n\t" // 2 * lda +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha + XXSPLTD_S(48,39,0) // x6 * alpha, x6 * alpha + XXSPLTD_S(49,39,1) // x7 * alpha, x7 * alpha + XXSPLTD_S(39,38,1) // x5 * alpha, x5 * alpha + XXSPLTD_S(38,38,0) // x4 * alpha, x4 * alpha +#else XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha @@ -294,6 +313,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha +#endif "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda "add %6, 
%4, %10 \n\t" // a3 = a1 + 2 * lda @@ -319,30 +339,69 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "one%=: \n\t" "lxvp 36, 0( %2) \n\t" // y0, y1 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" +#else "xvmaddadp 36, 40, 34 \n\t" "xvmaddadp 37, 41, 34 \n\t" +#endif "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 42, 33 \n\t" + "xvmaddadp 37, 43, 33 \n\t" +#else "xvmaddadp 36, 42, 35 \n\t" "xvmaddadp 37, 43, 35 \n\t" +#endif "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 44, 34 \n\t" + "xvmaddadp 37, 45, 34 \n\t" +#else "xvmaddadp 36, 44, 32 \n\t" "xvmaddadp 37, 45, 32 \n\t" +#endif "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 46, 35 \n\t" + "xvmaddadp 37, 47, 35 \n\t" +#else "xvmaddadp 36, 46, 33 \n\t" "xvmaddadp 37, 47, 33 \n\t" +#endif "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 50, 38 \n\t" + "xvmaddadp 37, 51, 38 \n\t" +#else "xvmaddadp 36, 50, 48 \n\t" "xvmaddadp 37, 51, 48 \n\t" +#endif "lxvpx 50, %7, %11 \n\t" // a4[0] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 52, 39 \n\t" + "xvmaddadp 37, 53, 39 \n\t" +#else "xvmaddadp 36, 52, 49 \n\t" "xvmaddadp 37, 53, 49 \n\t" +#endif "lxvpx 52, %8, %11 \n\t" // a5[0] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 54, 48 \n\t" + "xvmaddadp 37, 55, 48 \n\t" +#else "xvmaddadp 36, 54, 38 \n\t" "xvmaddadp 37, 55, 38 \n\t" +#endif "lxvpx 54, %9, %11 \n\t" // a6[0] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 56, 49 \n\t" + "xvmaddadp 37, 57, 49 \n\t" +#else "xvmaddadp 36, 56, 39 \n\t" "xvmaddadp 37, 57, 39 \n\t" +#endif "lxvpx 56, %10, %11 \n\t" // a7[0] "addi %11, %11, 32 \n\t" @@ -355,6 +414,24 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "two%=: \n\t" "lxvp 36, 0( %2) \n\t" // y0, y1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + "xvmaddadp 36, 42, 33 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + "xvmaddadp 36, 44, 34 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + "xvmaddadp 36, 46, 35 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + "xvmaddadp 36, 50, 38 \n\t" + "xvmaddadp 37, 51, 38 \n\t" + "xvmaddadp 36, 52, 39 \n\t" + "xvmaddadp 37, 53, 39 \n\t" + "xvmaddadp 36, 54, 48 \n\t" + "xvmaddadp 37, 55, 48 \n\t" + "xvmaddadp 36, 56, 49 \n\t" + "xvmaddadp 37, 57, 49 \n\t" +#else "xvmaddadp 36, 40, 34 \n\t" "xvmaddadp 37, 41, 34 \n\t" "xvmaddadp 36, 42, 35 \n\t" @@ -371,6 +448,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "xvmaddadp 37, 55, 38 \n\t" "xvmaddadp 36, 56, 39 \n\t" "xvmaddadp 37, 57, 39 \n\t" +#endif "stxvp 36, 0( %2) \n\t" // y0, y1 : diff --git a/kernel/power/dgemv_t_power10.c b/kernel/power/dgemv_t_power10.c index 3db4d5785..899b2a04b 100644 --- a/kernel/power/dgemv_t_power10.c +++ b/kernel/power/dgemv_t_power10.c @@ -279,34 +279,58 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "lxvp 40, 32(%[y]) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(42,34,35) + XXMRGLD_S(43,34,35) + XXMRGHD_S(44,4,5) + XXMRGLD_S(45,4,5) +#else XXMRGLD_S(42,35,34) XXMRGHD_S(43,35,34) XXMRGLD_S(44,5,4) XXMRGHD_S(45,5,4) +#endif "xvadddp 42,42,43 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(46,6,7) + 
XXMRGLD_S(47,6,7) +#else XXMRGLD_S(46,7,6) XXMRGHD_S(47,7,6) - +#endif "xvadddp 44,44,45 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(48,8,9) + XXMRGLD_S(49,8,9) +#else XXMRGLD_S(48,9,8) XXMRGHD_S(49,9,8) - +#endif "xvadddp 46,46,47 \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 38,42,36 \n\t" + "xvmaddadp 39,44,36 \n\t" +#else "xvmaddadp 39,42,36 \n\t" "xvmaddadp 38,44,36 \n\t" - +#endif "xvadddp 48,48,49 \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 41,48,36 \n\t" +#else "xvmaddadp 41,46,36 \n\t" - +#endif "stxvp 38, 0(%[y]) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 40,46,36 \n\t" +#else "xvmaddadp 40,48,36 \n\t" +#endif "stxvp 40, 32(%[y]) \n\t" : [memy] "+m" (*(double (*)[8])y), diff --git a/kernel/power/drot.c b/kernel/power/drot.c index 30c7411cc..2aa0b8055 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "drot_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "drot_microk_power10.c" #elif defined(POWER10) -#include "drot_microk_power8.c" +#include "drot_microk_power10.c" #endif #endif @@ -117,7 +115,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 32c39a8f4..96c4e51bc 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dscal_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "dscal_microk_power10.c" #elif defined(POWER10) -#include "dscal_microk_power8.c" +#include "dscal_microk_power10.c" #endif #endif @@ -104,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; @@ -138,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index 12476965b..9e6229c6a 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
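/* dswap below and sswap further down now share a single
   swap_microk_power10.c: swapping elements only exchanges memory
   contents, so one vector kernel can presumably serve both element
   widths once the endianness guard is gone. */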
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dswap_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "swap_microk_power10.c" #elif defined(POWER10) -#include "dswap_microk_power8.c" +#include "swap_microk_power10.c" #endif #endif @@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index 991d27508..af692a7fa 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -49,10 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sasum_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "sasum_microk_power10.c" #elif defined(POWER10) -#include "sasum_microk_power8.c" +#include "sasum_microk_power10.c" #endif #endif @@ -114,7 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/srot.c b/kernel/power/srot.c index 5a0d4b12e..3e4f93e2a 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "srot_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "srot_microk_power10.c" #elif defined(POWER10) -#include "srot_microk_power8.c" +#include "srot_microk_power10.c" #endif #endif @@ -119,7 +117,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index 9ae9ccab8..65572a8c1 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
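/* The POWER10 fast path below keeps its alignment prolog: for floats,
   ((32 - ((uintptr_t)x & 0x1F)) >> 2) & 0x7 counts the leading elements
   to handle scalar until x is 32-byte aligned - e.g. x % 32 == 20 gives
   (32 - 20) >> 2 = 3 floats (12 bytes), and 20 + 12 = 32. The
   double-precision wrappers use ">> 3" and "& 0x3" for 8-byte elements. */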
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sscal_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "sscal_microk_power10.c" #elif defined(POWER10) -#include "sscal_microk_power8.c" +#include "sscal_microk_power10.c" #endif #endif @@ -106,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; @@ -140,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 955ed02f0..dd249fd36 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sswap_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) -#include "swap_microk_power10.c" #elif defined(POWER10) -#include "sswap_microk_power8.c" +#include "swap_microk_power10.c" #endif #endif @@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) if ( n >= 64 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/zaxpy_microk_power10.c b/kernel/power/zaxpy_microk_power10.c index 8e593bbfa..b03508b09 100644 --- a/kernel/power/zaxpy_microk_power10.c +++ b/kernel/power/zaxpy_microk_power10.c @@ -30,9 +30,17 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, double alpha_r, double alpha_i) { #if !defined(CONJ) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + static const double mvec[2] = { -1.0, 1.0 }; +#else + static const double mvec[2] = { 1.0, -1.0 }; +#endif +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) static const double mvec[2] = { 1.0, -1.0 }; #else static const double mvec[2] = { -1.0, 1.0 }; +#endif #endif const double *mvecp = mvec; diff --git a/kernel/power/zgemm_kernel_power10.S b/kernel/power/zgemm_kernel_power10.S index fca389e69..afee8f183 100644 --- a/kernel/power/zgemm_kernel_power10.S +++ b/kernel/power/zgemm_kernel_power10.S @@ -147,13 +147,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r0, FLINK_SAVE(SP) -#if defined(linux) || defined(__FreeBSD__) +#if defined(linux) || defined(__FreeBSD__) || defined(_AIX) ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__) || defined(_AIX)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #endif diff --git a/kernel/power/zgemm_macros_power10.S b/kernel/power/zgemm_macros_power10.S index 42f9c5ad4..e5e5ec0e6 100644 --- a/kernel/power/zgemm_macros_power10.S +++ b/kernel/power/zgemm_macros_power10.S @@ -41,23 +41,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
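/* xxmrghd/xxmrgld are the doubleword-merge forms of xxpermdi (immediates
   0b00 and 0b11): one takes the first doubleword of each source, the
   other the second. Splitting interleaved {real,imag} doubles into
   all-real/all-imag vectors therefore swaps the two mnemonics in the
   macros below when the doubleword order reverses on big-endian. */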
#ifndef TRMMKERNEL lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VS_OUT1,\VS_TEMP1,\VS_TEMP2 + xxmrgld \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#else xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#endif #endif .endm /*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ .macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ + xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +#else xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +#endif .endm /*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ .macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ + xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +#else xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +#endif .endm /* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ @@ -103,8 +118,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VSOUT1,\VSIN1,\VSIN2 + xxmrgld \VSOUT2,\VSIN1,\VSIN2 +#else xxmrghd \VSOUT1,\VSIN2,\VSIN1 xxmrgld \VSOUT2,\VSIN2,\VSIN1 +#endif .endm @@ -186,15 +206,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35 #ifndef TRMMKERNEL lxv vs50, (\LOFFSET)(\BASE_REG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd vs46,vs50,vs50 + xxmrgld vs47,vs50,vs50 +#else xxmrgld vs46,vs50,vs50 xxmrghd vs47,vs50,vs50 +#endif #endif RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37 AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 MULT_APLHA_PART1 vs34,vs36, vs46,vs47 MULT_APLHA_PART2 vs34,vs36, vs46,vs47 UNPACK_FOR_STORE vs46,vs47,vs39,vs41 +#if (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) xxmrghd vs39,vs47,vs46 +#endif stxv vs39, (\LOFFSET)(\BASE_REG) .endm @@ -232,6 +259,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + xvf64gerpp 4, vs32, vs49 + xvf64gerpp 5, vs34, vs49 + xvf64gerpp 6, vs36, vs49 + xvf64gerpp 7, vs38, vs49 +#else xvf64gerpp 0, vs32, vs49 xvf64gerpp 1, vs34, vs49 xvf64gerpp 2, vs36, vs49 @@ -240,11 +277,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
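/* For these double-precision rank-1 updates, xvf64gerpp's first vector
   operand names an even/odd VSR pair (vs32 stands for vs32:vs33, four
   doubles), while the second is a single VSR - which is why only the
   B-side register (vs48 vs. vs49) flips between the endian branches. */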
xvf64gerpp 5, vs34, vs48 xvf64gerpp 6, vs36, vs48 xvf64gerpp 7, vs38, vs48 +#endif lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs40, vs50 + xvf64gerpp 1, vs42, vs50 + xvf64gerpp 2, vs44, vs50 + xvf64gerpp 3, vs46, vs50 + xvf64gerpp 4, vs40, vs51 + xvf64gerpp 5, vs42, vs51 + xvf64gerpp 6, vs44, vs51 + xvf64gerpp 7, vs46, vs51 +#else xvf64gerpp 0, vs40, vs51 xvf64gerpp 1, vs42, vs51 xvf64gerpp 2, vs44, vs51 @@ -253,6 +301,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf64gerpp 5, vs42, vs50 xvf64gerpp 6, vs44, vs50 xvf64gerpp 7, vs46, vs50 +#endif .if \IsLast==1 addi AO, AO, DISP16(\Index,256) addi BO, BO, DISP4(\Index,64) @@ -261,6 +310,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD_END_2x8 OffsetA,OffsetB +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + xvf64gerpp 4, vs32, vs49 + xvf64gerpp 5, vs34, vs49 + xvf64gerpp 6, vs36, vs49 + xvf64gerpp 7, vs38, vs49 +#else xvf64gerpp 0, vs32, vs49 xvf64gerpp 1, vs34, vs49 xvf64gerpp 2, vs36, vs49 @@ -269,6 +328,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf64gerpp 5, vs34, vs48 xvf64gerpp 6, vs36, vs48 xvf64gerpp 7, vs38, vs48 +#endif addi BO, BO, \OffsetB addi AO, AO, \OffsetA .endm @@ -305,7 +365,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs45, vs12, vs13, 0b10 xxpermdi vs46, vs14, vs15, 0b01 xxpermdi vs47, vs14, vs15, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 + xxlor vs8, vs40, vs40 + xxlor vs9, vs41, vs41 + xxlor vs10, vs42, vs42 + xxlor vs11, vs43, vs43 + xxlor vs12, vs44, vs44 + xxlor vs13, vs45, vs45 + xxlor vs14, vs46, vs46 + xxlor vs15, vs47, vs47 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -322,7 +399,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs15, vs45, vs45 xxlor vs12, vs46, vs46 xxlor vs13, vs47, vs47 - +#endif xxpermdi vs32, vs16, vs17, 0b01 xxpermdi vs33, vs16, vs17, 0b10 xxpermdi vs34, vs18, vs19, 0b01 @@ -339,7 +416,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs45, vs28, vs29, 0b10 xxpermdi vs46, vs30, vs31, 0b01 xxpermdi vs47, vs30, vs31, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs16, vs32, vs32 + xxlor vs17, vs33, vs33 + xxlor vs18, vs34, vs34 + xxlor vs19, vs35, vs35 + xxlor vs20, vs36, vs36 + xxlor vs21, vs37, vs37 + xxlor vs22, vs38, vs38 + xxlor vs23, vs39, vs39 + xxlor vs24, vs40, vs40 + xxlor vs25, vs41, vs41 + xxlor vs26, vs42, vs42 + xxlor vs27, vs43, vs43 + xxlor vs28, vs44, vs44 + xxlor vs29, vs45, vs45 + xxlor vs30, vs46, vs46 + xxlor vs31, vs47, vs47 +#else xxlor vs18, vs32, vs32 xxlor vs19, vs33, vs33 xxlor vs16, vs34, vs34 @@ -356,7 +450,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
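/* "xxlor vsX, vsY, vsY" is the usual VSX register-move idiom (OR of a
   register with itself). These blocks only relabel the de-interleaved
   accumulator rows into the order SAVE8 expects: big-endian keeps the
   natural sequential ordering, little-endian exchanges them two by two
   ((vs0,vs1) with (vs2,vs3), and so on). */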
xxlor vs31, vs45, vs45 xxlor vs28, vs46, vs46 xxlor vs29, vs47, vs47 - +#endif SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0 addi CO, CO, 128 @@ -388,17 +482,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 2, vs32, vs48 - xvf64gerpp 3, vs34, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs32, vs49 + xvf64gerpp 3, vs34, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 +#endif lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B - xvf64gerpp 0, vs40, vs51 - xvf64gerpp 1, vs42, vs51 - xvf64gerpp 2, vs40, vs50 - xvf64gerpp 3, vs42, vs50 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs40, vs50 + xvf64gerpp 1, vs42, vs50 + xvf64gerpp 2, vs40, vs51 + xvf64gerpp 3, vs42, vs51 +#else + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs42, vs51 + xvf64gerpp 2, vs40, vs50 + xvf64gerpp 3, vs42, vs50 +#endif .if \IsLast==1 addi AO, AO, DISP8(\Index,128) addi BO, BO, DISP4(\Index,64) @@ -407,10 +515,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD_END_2x4 OffsetA, OffsetB - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 2, vs32, vs48 - xvf64gerpp 3, vs34, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs32, vs49 + xvf64gerpp 3, vs34, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 +#endif addi BO, BO, \OffsetB addi AO, AO, \OffsetA .endm @@ -443,7 +558,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs45, vs12, vs13, 0b10 xxpermdi vs46, vs14, vs15, 0b01 xxpermdi vs47, vs14, vs15, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 + xxlor vs8, vs40, vs40 + xxlor vs9, vs41, vs41 + xxlor vs10, vs42, vs42 + xxlor vs11, vs43, vs43 + xxlor vs12, vs44, vs44 + xxlor vs13, vs45, vs45 + xxlor vs14, vs46, vs46 + xxlor vs15, vs47, vs47 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -460,7 +592,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs15, vs45, vs45 xxlor vs12, vs46, vs46 xxlor vs13, vs47, vs47 - +#endif SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0 addi CO, CO, 64 @@ -488,12 +620,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_2 Index, IsLast lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs32, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs32, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 +#endif lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B - xvf64gerpp 0, vs40, vs51 - xvf64gerpp 1, vs40, vs50 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs40, vs50 + xvf64gerpp 1, vs40, vs51 +#else + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs40, vs50 +#endif .if \IsLast==1 addi AO, AO, DISP4(\Index,64) addi BO, BO, DISP4(\Index,64) @@ -502,8 +644,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD_END_2x2 OffsetA,OffsetB - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs32, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs32, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 +#endif addi BO, BO, \OffsetB addi AO, AO, \OffsetA .endm @@ -526,7 +673,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs37, vs4, vs5, 0b10 xxpermdi vs38, vs6, vs7, 0b01 xxpermdi vs39, vs6, vs7, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -535,7 +691,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs7, vs37, vs37 xxlor vs4, vs38, vs38 xxlor vs5, vs39, vs39 - +#endif SAVE2 vs0,vs1,vs2,vs3,CO,0 SAVE2 vs4,vs5,vs6,vs7,T1,0 addi CO, CO, 32 @@ -702,14 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 2, vs36, vs49 - xvf64gerpp 3, vs38, vs49 - xvf64gerpp 0, vs40, vs48 - xvf64gerpp 1, vs42, vs48 - xvf64gerpp 2, vs44, vs48 - xvf64gerpp 3, vs46, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + xvf64gerpp 0, vs40, vs49 + xvf64gerpp 1, vs42, vs49 + xvf64gerpp 2, vs44, vs49 + xvf64gerpp 3, vs46, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 + xvf64gerpp 2, vs44, vs48 + xvf64gerpp 3, vs46, vs48 +#endif .if \IsLast==1 addi AO, AO, DISP16(\Index,256) addi BO, BO, DISP2(\Index,32) @@ -758,7 +925,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxpermdi vs45, vs12, vs13, 0b10 xxpermdi vs46, vs14, vs15, 0b01 xxpermdi vs47, vs14, vs15, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 + xxlor vs8, vs40, vs40 + xxlor vs9, vs41, vs41 + xxlor vs10, vs42, vs42 + xxlor vs11, vs43, vs43 + xxlor vs12, vs44, vs44 + xxlor vs13, vs45, vs45 + xxlor vs14, vs46, vs46 + xxlor vs15, vs47, vs47 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -775,7 +959,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs15, vs45, vs45 xxlor vs12, vs46, vs46 xxlor vs13, vs47, vs47 - +#endif SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 addi CO, CO, 128 .endm @@ -799,10 +983,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 0, vs40, vs48 - xvf64gerpp 1, vs42, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 0, vs40, vs49 + xvf64gerpp 1, vs42, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 +#endif .if \IsLast==1 addi AO, AO, DISP8(\Index,128) addi BO, BO, DISP2(\Index,32) @@ -837,7 +1028,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs37, vs4, vs5, 0b10 xxpermdi vs38, vs6, vs7, 0b01 xxpermdi vs39, vs6, vs7, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -846,7 +1046,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs7, vs37, vs37 xxlor vs4, vs38, vs38 xxlor vs5, vs39, vs39 - +#endif SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 addi CO, CO, 64 .endm @@ -867,8 +1067,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 0, vs40, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 0, vs40, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 0, vs40, vs48 +#endif .if \IsLast==1 addi AO, AO, DISP4(\Index,64) addi BO, BO, DISP2(\Index,32) @@ -896,11 +1101,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxpermdi vs33, vs0, vs1, 0b10 xxpermdi vs34, vs2, vs3, 0b01 xxpermdi vs35, vs2, vs3, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 xxlor vs1, vs35, vs35 +#endif SAVE2 vs0,vs1,vs2,vs3,CO,0 addi CO, CO, 32 diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index d3bf60ca7..e42eafaba 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -43,7 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #elif HAVE_KERNEL_4x4_VEC -#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#if defined(POWER10) typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 59ddc149f..0068138e8 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -43,16 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#elif defined(POWER10) #if defined(DOUBLE) #include "zscal_microk_power10.c" #else #include "cscal_microk_power10.c" #endif -#elif defined(POWER10) -#if defined(DOUBLE) -#include "zscal_microk_power8.c" -#endif #endif #endif diff --git a/kernel/power/zscal_microk_power10.c b/kernel/power/zscal_microk_power10.c index 15b8323f4..af99b8648 100644 --- a/kernel/power/zscal_microk_power10.c +++ b/kernel/power/zscal_microk_power10.c @@ -42,7 +42,11 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xsnegdp 33, %x10 \n\t" // -alpha_i XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(33,33, %x10) // -alpha_i , alpha_i +#else XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i +#endif "lxvp 40, 0(%2) \n\t" "lxvp 42, 32(%2) \n\t" @@ -97,10 +101,17 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xvadddp 49, 49, 39 \n\t" "xvadddp 50, 50, %x3 \n\t" "xvadddp 51, 51, %x4 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%2) \n\t" + "stxv 49, 16(%2) \n\t" + "stxv 50, 32(%2) \n\t" + "stxv 51, 48(%2) \n\t" +#else "stxv 49, 0(%2) \n\t" "stxv 48, 16(%2) \n\t" "stxv 51, 32(%2) \n\t" "stxv 50, 48(%2) \n\t" +#endif "xvadddp 34, 34, %x5 \n\t" @@ -109,12 +120,17 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xvadddp 36, 36, %x7 \n\t" "xvadddp 37, 37, %x8 \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 34, 64(%2) \n\t" + "stxv 35, 80(%2) \n\t" + "stxv 36, 96(%2) \n\t" + "stxv 37, 112(%2) \n\t" +#else "stxv 35, 64(%2) \n\t" "stxv 34, 80(%2) \n\t" "stxv 37, 96(%2) \n\t" "stxv 36, 112(%2) \n\t" - +#endif "addi %2, %2, 128 \n\t" "addic. 
%1, %1, -8 \n\t" @@ -155,23 +171,34 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xvadddp 50, 50, %x3 \n\t" "xvadddp 51, 51, %x4 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%2) \n\t" + "stxv 49, 16(%2) \n\t" + "stxv 50, 32(%2) \n\t" + "stxv 51, 48(%2) \n\t" +#else "stxv 49, 0(%2) \n\t" "stxv 48, 16(%2) \n\t" "stxv 51, 32(%2) \n\t" "stxv 50, 48(%2) \n\t" - +#endif "xvadddp 34, 34, %x5 \n\t" "xvadddp 35, 35, %x6 \n\t" "xvadddp 36, 36, %x7 \n\t" "xvadddp 37, 37, %x8 \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 34, 64(%2) \n\t" + "stxv 35, 80(%2) \n\t" + "stxv 36, 96(%2) \n\t" + "stxv 37, 112(%2) \n\t" +#else "stxv 35, 64(%2) \n\t" "stxv 34, 80(%2) \n\t" "stxv 37, 96(%2) \n\t" "stxv 36, 112(%2) \n\t" - +#endif "#n=%1 x=%0=%2 alpha=(%9,%10) \n" : "+m" (*x), diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 908802b71..fe7871852 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "zswap_microk_power8.c" -#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) +#elif defined(POWER10) #include "cswap_microk_power10.c" -#elif defined(POWER10) -#include "zswap_microk_power8.c" #endif #endif diff --git a/param.h b/param.h index 48770fa7a..038233c19 100644 --- a/param.h +++ b/param.h @@ -2465,13 +2465,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 -#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) -#define DGEMM_DEFAULT_UNROLL_M 16 -#define DGEMM_DEFAULT_UNROLL_N 4 -#else #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 8 -#endif #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 From 2be5ee3cca97a597f2ee2118808a2d5eacea050c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Oct 2021 11:17:21 +0200 Subject: [PATCH 472/681] Fix out of bounds read in ?llarv (Reference-LAPACK PR 625) --- lapack-netlib/SRC/clarrv.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/clarrv.f b/lapack-netlib/SRC/clarrv.f index a45f55ac3..26a9febc8 100644 --- a/lapack-netlib/SRC/clarrv.f +++ b/lapack-netlib/SRC/clarrv.f @@ -351,7 +351,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0) .OR. 
(M.LE.0) ) THEN RETURN END IF * From fe497efa0510466fd93578aaf9da1ad8ed4edbe7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Oct 2021 11:18:20 +0200 Subject: [PATCH 473/681] Fix out of bounds read in ?llarv (Reference-LAPACK PR 625) --- lapack-netlib/SRC/dlarrv.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/dlarrv.f b/lapack-netlib/SRC/dlarrv.f index 4a59a2bbf..a1c6e9c9d 100644 --- a/lapack-netlib/SRC/dlarrv.f +++ b/lapack-netlib/SRC/dlarrv.f @@ -353,7 +353,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * From ddb0ff5353637bb5f5ad060c9620e334c143e3d7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Oct 2021 11:19:07 +0200 Subject: [PATCH 474/681] Fix out of bounds read in ?llarv (Reference-LAPACK PR 625) --- lapack-netlib/SRC/slarrv.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/slarrv.f b/lapack-netlib/SRC/slarrv.f index 04519fde8..9448b2fd9 100644 --- a/lapack-netlib/SRC/slarrv.f +++ b/lapack-netlib/SRC/slarrv.f @@ -353,7 +353,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * From 337b65133df174796794871b3988cd03426e6d41 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Oct 2021 11:19:53 +0200 Subject: [PATCH 475/681] Fix out of bounds read in ?llarv (Reference-LAPACK PR 625) --- lapack-netlib/SRC/zlarrv.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/zlarrv.f b/lapack-netlib/SRC/zlarrv.f index 23976dbef..8d10e3c2e 100644 --- a/lapack-netlib/SRC/zlarrv.f +++ b/lapack-netlib/SRC/zlarrv.f @@ -351,7 +351,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * From 9cc95e56579d865a3000c46e19c03455a3be3375 Mon Sep 17 00:00:00 2001 From: kavanabhat Date: Fri, 1 Oct 2021 05:18:35 -0500 Subject: [PATCH 476/681] AIX changes for P10 with GNU Compiler --- Makefile.system | 2 ++ kernel/power/KERNEL.POWER10 | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 20db80d07..1ed792756 100644 --- a/Makefile.system +++ b/Makefile.system @@ -16,6 +16,8 @@ else HOSTARCH = $(ARCH) endif +HAVE_GAS := $(shell as -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null) + # Catch conflicting usage of ARCH in some BSD environments ifeq ($(ARCH), amd64) override ARCH=x86_64 diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 50866c974..63816cb5f 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -1,4 +1,6 @@ - +ifeq ($(HAVE_GAS), 1) +include $(KERNELDIR)/KERNEL.POWER8 +else #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -216,4 +218,4 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c - +endif From ad87d627487a2647ee782b3948ceeba8733bee68 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 2 Oct 2021 16:27:34 +0200 Subject: [PATCH 477/681] Update Alpine version --- azure-pipelines.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5d4a1ecd3..f9e79018b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -206,8 +206,9 @@ jobs: vmImage: 'ubuntu-latest' steps: - script: | - wget 
'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ - && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1 + wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \ + && echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74 alpine-chroot-install' | sha1sum -c \ + || exit 1 alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 From 5a468ae87a44f4eee356d629d0826bed0a5a5f46 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 2 Oct 2021 19:25:58 +0200 Subject: [PATCH 478/681] Update Changelog for 0.3.18 (#3388) * Update Changelog for 0.3.18 --- Changelog.txt | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index ee0484e2b..59fe1d45e 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,47 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.18 + 02-Oct-2021 + +general: + - when the build-time number of preconfigured threads is exceeded + at runtime (typically by an external program calling BLAS functions + from a larger number of threads in parallel), OpenBLAS will now + allocate an auxiliary control structure for up to 512 additional + threads instead of aborting + - added support for Loongson's LoongArch64 cpu architecture + - fixed building OpenBLAS with CMAKE and -DBUILD_BFLOAT16=ON + - added support for building OpenBLAS as a CMAKE subproject + - added support for building for Windows/ARM64 targets with clang + - improved support for building with the IBM xlf compiler + - imported Reference-LAPACK PR 625 (out-of-bounds reads in ?LARRV) + - imported Reference-LAPACK PR 597 for testsuite compatibility with + LLVM's libomp + +x86_64: + - added SkylakeX S/DGEMM kernels for small problem sizes (M*N*K<=1000000) + - added optimized SBGEMM for Intel Cooper Lake + - reinstated the performance patch for AVX512 SGEMV_T with a proper fix + - added a workaround for a gcc11 tree-vectorizer bug that caused spurious + failures in the test programs for complex BLAS3 when compiling at -O3 + (the default for cmake "release" builds) + - added support for runtime cpu count detection under Haiku OS + - worked around a long-standing miscompilation issue of the Haswell DGEMV_T + kernel with gcc that could produce NaN output in some corner cases + +POWER: + - improved performance of DASUM on POWER10 + +ARMV8: + - fixed crashes (use of reserved register x18) on Apple M1 under OSX + - fixed building with gcc releases earlier than 5.1 + +MIPS: + - fixed building under BSD + +MIPS64: + - fixed building under BSD + ==================================================================== Version 0.3.17 15-Jul-2021 From 686e1f0052d52fa58090f179ee733aa3ffd7cc85 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 2 Oct 2021 19:29:59 +0200 Subject: [PATCH 479/681] Update version to 0.3.18 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 37191a42b..1ea2e551c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 17) +set(OpenBLAS_PATCH_VERSION 18) set(OpenBLAS_VERSION 
"${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From efe42481e2c85e8197bdf533078bf89bdf6eabf4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 2 Oct 2021 19:38:09 +0200 Subject: [PATCH 480/681] Update version to 0.3.18 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 7c04a3101..57dab1152 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.17.dev +VERSION = 0.3.18 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 32b4d01d1630d3c484eea8968748f64ecf2c804b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 2 Oct 2021 21:15:00 +0200 Subject: [PATCH 481/681] Update version to 0.3.18.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e63f7e04c..a18a7adc3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 18) +set(OpenBLAS_PATCH_VERSION 18.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From dcb005351e84ff4a37339868ea5817d7a8a29ddc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 2 Oct 2021 21:15:39 +0200 Subject: [PATCH 482/681] Update version to 0.3.18.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 57dab1152..500b7c44f 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.18 +VERSION = 0.3.18.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From 04f3ecd02637f610ae996f7eb25ee284608071f5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Oct 2021 16:14:32 +0200 Subject: [PATCH 483/681] Fix minor typo --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 150dbef50..9bb497459 100644 --- a/Makefile.system +++ b/Makefile.system @@ -307,7 +307,7 @@ else SMP = 1 endif else -ifeq ($(NUM_THREAD), 1) +ifeq ($(NUM_THREADS), 1) SMP = else SMP = 1 From 1cce778585d6aed9b272ae975a45196279648e24 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Oct 2021 16:46:41 +0200 Subject: [PATCH 484/681] Fix detection of Apple M1 "Vortex" --- cpuid_arm64.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 2a9399f7d..430429cd3 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -26,7 +26,7 @@ *****************************************************************************/ #include -#ifdef OS_DARWIN +#ifdef __APPLE__ #include int32_t value; size_t length=sizeof(value); @@ -212,9 +212,9 @@ int detect(void) } #else -#ifdef DARWIN +#ifdef __APPLE__ sysctlbyname("hw.cpufamily",&value,&length,NULL,0); - if (value ==131287967) return CPU_VORTEX; + if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; #endif return CPU_ARMV8; #endif @@ -265,7 +265,7 @@ int n=0; printf("#define NUM_CORES %d\n",n); #endif -#ifdef DARWIN +#ifdef __APPLE__ sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); printf("#define NUM_CORES %d\n",value); #endif @@ -420,7 +420,7 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; -#ifdef DARWIN +#ifdef __APPLE__ case CPU_VORTEX: printf("#define VORTEX \n"); sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); @@ -431,6 +431,8 @@ void get_cpuconfig(void) printf("#define L1_DATA_SIZE %d \n",value); sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); printf("#define L2_SIZE %d \n",value); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); break; #endif } From d7351deccfcc617db0b8caa687b5e10f0defee7d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Oct 2021 17:58:29 +0200 Subject: [PATCH 485/681] Fix cache reporting for Apple M1 --- cpuid_arm64.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 430429cd3..73a82d188 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -30,6 +30,8 @@ #include int32_t value; size_t length=sizeof(value); +int64_t value64; +size_t length64=sizeof(value64); #endif #define CPU_UNKNOWN 0 @@ -423,14 +425,14 @@ void get_cpuconfig(void) #ifdef __APPLE__ case CPU_VORTEX: printf("#define VORTEX \n"); - sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); - printf("#define L1_CODE_SIZE %d \n",value); - sysctlbyname("hw.cachelinesize",&value,&length,NULL,0); - printf("#define L1_CODE_LINESIZE %d \n",value); - sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0); - printf("#define L1_DATA_SIZE %d \n",value); - sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); - printf("#define L2_SIZE %d \n",value); + sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); + printf("#define L1_CODE_SIZE %lld \n",value64); + sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); + printf("#define L1_CODE_LINESIZE %lld \n",value64); + sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); + printf("#define L1_DATA_SIZE %lld \n",value64); + 
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); + printf("#define L2_SIZE %lld \n",value64); printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; From 8c20ca345aad43c2f74a72b356afdbc1ec368e31 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 6 Oct 2021 11:06:43 +0200 Subject: [PATCH 486/681] Use Neoverse's current mix of ThunderX2 kernels for Vortex as well --- kernel/arm64/KERNEL.VORTEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/KERNEL.VORTEX b/kernel/arm64/KERNEL.VORTEX index e3efef1f5..46a34469c 100644 --- a/kernel/arm64/KERNEL.VORTEX +++ b/kernel/arm64/KERNEL.VORTEX @@ -1 +1 @@ -include $(KERNELDIR)/KERNEL.ARMV8 +include $(KERNELDIR)/KERNEL.NEOVERSEN1 From 24233b7c49bafb4c93ae2300ae9633bd2e2dd3b4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 6 Oct 2021 11:10:19 +0200 Subject: [PATCH 487/681] Use "big arm server" GEMM defaults for Vortex --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index 038233c19..e8e49ce43 100644 --- a/param.h +++ b/param.h @@ -2972,7 +2972,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(CORTEXA57) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ - defined(FALKOR) || defined(TSV110) || defined(EMAG8180) + defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2989,7 +2989,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*FIXME: this should be using the cache size, but there is currently no easy way to query that on ARM. So if getarch counted more than 8 cores we simply assume the host is a big desktop or server with abundant cache rather than a phone or embedded device */ -#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) +#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) #define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 256 #define CGEMM_DEFAULT_P 256 From 8a87e80c742a146d3900d64046d4f4c7fd58b6b0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 10 Oct 2021 23:24:52 +0200 Subject: [PATCH 488/681] Update conda in Appveyor CI and move jobs from Appveyor to Azure (#3400) * Fix clang/cl builds on Appveyor and move them to Azure * Add clang/flang and mingw builds on Windows to Azure --- appveyor.yml | 23 ++++++++++++----------- azure-pipelines.yml | 45 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 12 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index d575c5b7f..96a967387 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -29,15 +29,15 @@ environment: global: CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 matrix: - - COMPILER: clang-cl - WITH_FORTRAN: ON - - COMPILER: clang-cl - DYNAMIC_ARCH: ON - WITH_FORTRAN: OFF - - COMPILER: cl - - COMPILER: MinGW64-gcc-7.2.0-mingw - DYNAMIC_ARCH: OFF - WITH_FORTRAN: ignore +# - COMPILER: clang-cl +# WITH_FORTRAN: ON +# - COMPILER: clang-cl +# DYNAMIC_ARCH: ON +# WITH_FORTRAN: OFF +# - COMPILER: cl +# - COMPILER: MinGW64-gcc-7.2.0-mingw +# DYNAMIC_ARCH: OFF +# WITH_FORTRAN: ignore - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 COMPILER: MinGW-gcc-6.3.0-32 - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 @@ -46,6 +46,7 @@ environment: install: - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat + - if [%COMPILER%]==[clang-cl] conda update --yes -n base conda - if [%COMPILER%]==[clang-cl] conda config --add channels 
conda-forge --force - if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1 @@ -64,8 +65,8 @@ before_build: - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - - if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - - if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. + - if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DMSVC_STATIC_CRT=ON .. + - if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. diff --git a/azure-pipelines.yml b/azure-pipelines.yml index f9e79018b..286a620ba 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -75,7 +75,50 @@ jobs: cd utest dir openblas_utest.exe - + +- job: Windows_mingw_gmake + pool: + vmImage: 'windows-latest' + steps: + - script: | + mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="NEHALEM SANDYBRIDGE HASWELL" + +- job: Windows_clang_cmake + pool: + vmImage: 'windows-latest' + steps: + - script: | + set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" + set "LIB=C:\Miniconda\Library\lib;%LIB%" + set "CPATH=C:\Miniconda\Library\include;%CPATH% + conda config --add channels conda-forge --force + conda config --set auto_update_conda false + conda install --yes ninja + call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + mkdir build + cd build + cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DNOFORTRAN=1 -DMSVC_STATIC_CRT=ON .. + cmake --build . --config Release + ctest + +- job: Windows_flang_clang + pool: + vmImage: 'windows-latest' + steps: + - script: | + set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" + set "LIB=C:\Miniconda\Library\lib;%LIB%" + set "CPATH=C:\Miniconda\Library\include;%CPATH%" + conda config --add channels conda-forge --force + conda config --set auto_update_conda false + conda install --yes --quiet ninja flang + mkdir build + cd build + call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. + cmake --build . 
--config Release + ctest + - job: OSX_OpenMP pool: vmImage: 'macOS-10.15' From 3dc6052c7edb7b664f03020359c391d9897e237b Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 3 Sep 2021 00:39:50 -0700 Subject: [PATCH 489/681] initial support for Sapphire Rapids platform --- Makefile.system | 6 ++ Makefile.x86_64 | 24 +++++ TargetList.txt | 1 + cmake/cc.cmake | 13 +++ cmake/system.cmake | 18 +++- cpuid.h | 3 + cpuid_x86.c | 28 ++++++ driver/level3/level3.c | 2 +- driver/level3/level3_thread.c | 2 +- driver/level3/trmm_L.c | 8 +- driver/level3/trmm_R.c | 12 +-- driver/others/parameter.c | 4 +- getarch.c | 30 ++++++ kernel/CMakeLists.txt | 2 +- kernel/Makefile | 17 +++- kernel/Makefile.L3 | 4 + kernel/setparam-ref.c | 2 +- kernel/x86/trsm_kernel_LN_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LN_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_4x4_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LN_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_2x2_penryn.S | 2 +- kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 1 + kernel/x86_64/caxpy.c | 2 +- kernel/x86_64/cdot.c | 2 +- kernel/x86_64/cgemv_n_4.c | 2 +- kernel/x86_64/cgemv_t_4.c | 2 +- kernel/x86_64/cscal.c | 2 +- kernel/x86_64/daxpy.c | 2 +- kernel/x86_64/ddot.c | 2 +- kernel/x86_64/dgemv_n_4.c | 2 +- kernel/x86_64/dgemv_t_4.c | 2 +- kernel/x86_64/dscal.c | 2 +- kernel/x86_64/dsymv_L.c | 2 +- kernel/x86_64/dsymv_U.c | 2 +- kernel/x86_64/saxpy.c | 2 +- kernel/x86_64/sbdot.c | 2 +- kernel/x86_64/sbgemv_n.c | 2 +- kernel/x86_64/sbgemv_t.c | 2 +- kernel/x86_64/sdot.c | 2 +- kernel/x86_64/sgemm_direct_skylakex.c | 2 +- kernel/x86_64/sgemv_n_4.c | 2 +- kernel/x86_64/sgemv_t_4.c | 2 +- kernel/x86_64/ssymv_L.c | 2 +- kernel/x86_64/ssymv_U.c | 2 +- kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/tobf16.c | 2 +- kernel/x86_64/zaxpy.c | 2 +- kernel/x86_64/zdot.c | 2 +- kernel/x86_64/zgemv_n_4.c | 2 +- kernel/x86_64/zgemv_t_4.c | 2 +- kernel/x86_64/zscal.c | 2 +- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 2 +- kernel/x86_64/zsymv_U_sse.S | 2 +- kernel/x86_64/zsymv_U_sse2.S | 2 +- param.h | 119 ++++++++++++++++++++++++ 66 files changed, 325 insertions(+), 65 deletions(-) create mode 100644 kernel/x86_64/KERNEL.SAPPHIRERAPIDS diff --git a/Makefile.system b/Makefile.system index bd2164d02..833511fad 100644 --- a/Makefile.system +++ b/Makefile.system @@ -121,6 +121,9 @@ endif ifeq ($(TARGET), COOPERLAKE) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET), SAPPHIRERAPIDS) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -166,6 +169,9 @@ endif ifeq ($(TARGET_CORE), COOPERLAKE) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET_CORE), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 307cbe1d9..15cf202c0 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -96,6 +96,30 @@ endif endif endif +ifeq ($(CORE), SAPPHIRERAPIDS) +ifndef NO_AVX512 +ifeq ($(C_COMPILER), GCC) +# sapphire rapids support was added in 11 +ifeq ($(GCCVERSIONGTEQ11), 1) +CCOMMON_OPT += -march=sapphirerapids +ifneq 
($(F_COMPILER), NAG) +FCOMMON_OPT += -march=sapphirerapids +endif +endif +endif +ifeq ($(OSNAME), CYGWIN_NT) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +ifeq ($(OSNAME), WINNT) +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +endif +endif +endif + ifdef HAVE_AVX2 ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) diff --git a/TargetList.txt b/TargetList.txt index 963545cdd..b02a011d5 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -23,6 +23,7 @@ HASWELL SKYLAKEX ATOM COOPERLAKE +SAPPHIRERAPIDS b)AMD CPU: ATHLON diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 1794b5e5b..0ab1d4c1b 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -126,6 +126,19 @@ if (${CORE} STREQUAL COOPERLAKE) endif () endif () +if (${CORE} STREQUAL SAPPHIRERAPIDS) + if (NOT DYNAMIC_ARCH) + if (NOT NO_AVX512) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=sapphirerapids") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") + endif() + endif () + endif () +endif () + if (NOT DYNAMIC_ARCH) if (HAVE_AVX2) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2") diff --git a/cmake/system.cmake b/cmake/system.cmake index f56ded966..bcca91c25 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -33,7 +33,7 @@ endif () if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) message(STATUS "Compiling a ${BINARY}-bit binary.") set(NO_AVX 1) - if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE") + if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE" OR ${TARGET} STREQUAL "SAPPHIRERAPIDS") set(TARGET "NEHALEM") endif () if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") @@ -163,6 +163,22 @@ if (DEFINED TARGET) endif() endif() endif() + if (${TARGET} STREQUAL SAPPHIRERAPIDS AND NOT NO_AVX512) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 11.0) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.0) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + endif() + endif() if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() diff --git a/cpuid.h b/cpuid.h index 2c43922e7..55478893c 100644 --- a/cpuid.h +++ b/cpuid.h @@ -120,6 +120,7 @@ #define CORE_SKYLAKEX 28 #define CORE_DHYANA 29 #define CORE_COOPERLAKE 30 +#define CORE_SAPPHIRERAPIDS 31 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -145,6 +146,7 @@ #define HAVE_AVX512VL (1 << 21) #define HAVE_AVX2 (1 << 22) #define HAVE_AVX512BF16 (1 << 23) +#define HAVE_AMXBF16 (1 << 24) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 @@ -222,6 +224,7 @@ typedef struct { #define 
CPUTYPE_SKYLAKEX 52 #define CPUTYPE_DHYANA 53 #define CPUTYPE_COOPERLAKE 54 +#define CPUTYPE_SAPPHIRERAPIDS 55 #define CPUTYPE_HYGON_UNKNOWN 99 diff --git a/cpuid_x86.c b/cpuid_x86.c index 5aa49055a..bb9d779bd 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -266,6 +266,31 @@ int support_avx512_bf16(){ #endif } +#define BIT_AMX_TILE 0x01000000 +#define BIT_AMX_BF16 0x00400000 +#define BIT_AMX_ENBD 0x00060000 + +int support_amx_bf16() { +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx512()) + return 0; + // CPUID.7.0:EDX indicates AMX support + cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); + if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) { + // CPUID.D.0:EAX[17:18] indicates AMX enabled + cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD) + ret = 1; + } + return ret; +#else + return 0; +#endif +} + int get_vendor(void){ int eax, ebx, ecx, edx; char vendor[13]; @@ -353,6 +378,7 @@ int get_cputype(int gettype){ if (support_avx2()) feature |= HAVE_AVX2; if (support_avx512()) feature |= HAVE_AVX512VL; if (support_avx512_bf16()) feature |= HAVE_AVX512BF16; + if (support_amx_bf16()) feature |= HAVE_AMXBF16; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; #endif @@ -2389,6 +2415,7 @@ void get_cpuconfig(void){ if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n"); + if (features & HAVE_AMXBF16 ) printf("#define HAVE_AMXBF16\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); @@ -2460,6 +2487,7 @@ void get_sse(void){ if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n"); + if (features & HAVE_AMXBF16 ) printf("HAVE_AMXBF16=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 9b44deb85..4a8e193be 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 2b33c9589..dfc7107b8 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Split local region of B into parts */ for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ min_jj = MIN(n_to, js + div_n) - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/trmm_L.c 
b/driver/level3/trmm_L.c index 880de4df4..e25ea7afe 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -138,7 +138,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -215,7 +215,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -320,7 +320,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -399,7 +399,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index 3be43edde..ab9cdfae8 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < ls - js; jjs += min_jj){ min_jj = ls - js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG 
*range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ min_jj = js - ls - min_l - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 791e5dc27..0d5c6aec0 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -183,7 +183,7 @@ int get_L2_size(void){ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \ - defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) + defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -269,7 +269,7 @@ void blas_set_parameter(void){ int factor; #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \ defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \ - defined(SKYLAKEX) || defined(COOPERLAKE) + defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) int size = 16; #else int size = get_L2_size(); diff --git a/getarch.c b/getarch.c index 094feaadd..d64ec3757 100644 --- a/getarch.c +++ b/getarch.c @@ -469,6 +469,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
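A note on the pattern repeated through the level3/trmm hunks above: the driver comment explains that the AVX512 s/d/c/z GEMM kernels reach full speed only when each packed slice of B spans at least 6*GEMM_UNROLL_N columns, so the slice width min_jj is capped at that value for these targets. A stand-alone sketch of the clamp, assuming GEMM_UNROLL_N = 4 and a simplified fallback tier (the real driver has more tiers):

    #include <stdio.h>

    #define GEMM_UNROLL_N 4

    static long clamp_min_jj(long min_jj, int avx512)
    {
        if (avx512) {
            /* keep slices at the width the AVX512 kernel is tuned for */
            if (min_jj >= 6 * GEMM_UNROLL_N) min_jj = 6 * GEMM_UNROLL_N;
        } else {
            if (min_jj >= 3 * GEMM_UNROLL_N) min_jj = 3 * GEMM_UNROLL_N;
            else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
        }
        return min_jj;
    }

    int main(void)
    {
        /* a 100-column region is processed in 24-column slices on AVX512 */
        printf("%ld %ld\n", clamp_min_jj(100, 1), clamp_min_jj(100, 0));
        return 0;
    }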
#endif #endif +#ifdef FORCE_SAPPHIRERAPIDS +#ifdef NO_AVX512 +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "HASWELL" +#define ARCHCONFIG "-DHASWELL " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" +#define LIBNAME "haswell" +#define CORENAME "HASWELL" +#else +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "SAPPHIRERAPIDS" +#define ARCHCONFIG "-DSAPPHIRERAPIDS " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=sapphirerapids" +#define LIBNAME "sapphirerapids" +#define CORENAME "SAPPHIRERAPIDS" +#endif +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 9ffbd944f..9c8460723 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -198,7 +198,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.L3 set(USE_TRMM false) string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) - if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE)) + if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS)) set(USE_TRMM true) endif () if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) diff --git a/kernel/Makefile b/kernel/Makefile index 1a6c9413f..cbe4cde6e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -31,7 +31,22 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE -ifeq ($(TARGET_CORE), COOPERLAKE) +ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) + ifeq ($(GCCVERSIONGTEQ10), 1) + override CFLAGS += -march=sapphirerapids + else + override CFLAGS += -march=skylake-avx512 -mavx512f + endif + ifeq ($(OSNAME), CYGWIN_NT) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + ifeq ($(OSNAME), WINNT) + ifeq ($(C_COMPILER), GCC) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + endif +else ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) ifeq ($(GCCVERSIONGTEQ10), 1) override CFLAGS += -march=cooperlake diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 2d274d33b..89691ef6f 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -47,6 +47,10 @@ ifeq ($(CORE), COOPERLAKE) USE_TRMM = 1 endif +ifeq ($(CORE), SAPPHIRERAPIDS) +USE_TRMM = 1 +endif + ifeq ($(CORE), ZEN) USE_TRMM = 1 endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 19b7b5f0b..fe796be64 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1518,7 +1518,7 @@ static void init_parameter(void) { #endif #endif -#if 
defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #ifdef DEBUG fprintf(stderr, "SkylakeX\n"); diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index fde9eba8e..0d71201d6 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index fddf7560f..e775b4d76 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 33afd2a61..d3d110811 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index b05bd6ee5..e56a768db 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index f960559a6..85a29ce57 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index cf842c9b5..5c128d7a4 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) 
|| defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index 63c44c27a..73174e424 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index 4cb01e50a..ebe83ff40 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index 09d5d8e43..b26ffb473 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 7d129e54c..c2c7caadc 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index d33599317..42526135c 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS new file mode 100644 index 000000000..61965c745 --- /dev/null +++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.COOPERLAKE diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index c19b98f02..7270a98bc 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. #include "caxpy_microk_steamroller-2.c" #elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #include "caxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "caxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index 654cd351a..264776239 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "cdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "cdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "cdot_microk_sandy-2.c" diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index 0ed02b8d8..3ca173c20 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "cgemv_n_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_n_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index c2903b11f..3187e196c 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "cgemv_t_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_t_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 6d75358a6..dc3f688c6 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 26437012c..2796b8270 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
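The long run of kernel/x86_64 hunks here is one mechanical change repeated per file: SAPPHIRERAPIDS joins the preprocessor chain through which each BLAS routine picks its CPU-specific microkernel at build time, reusing the existing AVX512 code paths. A minimal, self-contained illustration of that selection idiom (the macros' string values below are placeholders, not actual OpenBLAS file names):

    #include <stdio.h>

    /* exactly one branch survives preprocessing for a given -D<TARGET> */
    #if defined(SAPPHIRERAPIDS) || defined(COOPERLAKE) || defined(SKYLAKEX)
    #define MICROKERNEL "avx512 path"
    #elif defined(HASWELL) || defined(ZEN)
    #define MICROKERNEL "avx2 path"
    #else
    #define MICROKERNEL "generic C path"
    #endif

    int main(void)
    {
        printf("selected: %s\n", MICROKERNEL);
        return 0;
    }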
#include "daxpy_microk_piledriver-2.c" #elif defined(HASWELL) || defined(ZEN) #include "daxpy_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "daxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index e4b6622e6..5d0c32234 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ddot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "ddot_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "ddot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "ddot_microk_sandy-2.c" diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index da68db0cd..f883d4f26 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dgemv_n_microk_nehalem-4.c" #elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_n_microk_haswell-4.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dgemv_n_microk_skylakex-4.c" #endif diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index a3bf28dc8..9688c6bf3 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index d1270d20b..05c5c7f16 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dscal_microk_sandy-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dscal_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dscal_microk_skylakex-2.c" #endif diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index 573377ee0..590776005 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dsymv_L_microk_bulldozer-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dsymv_L_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dsymv_L_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 530ac8b1d..f196aa364 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_U_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dsymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index 7b2845636..ff911c52b 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "saxpy_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "saxpy_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "saxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/sbdot.c b/kernel/x86_64/sbdot.c index ef14fd618..a4e60b7c4 100644 --- a/kernel/x86_64/sbdot.c +++ b/kernel/x86_64/sbdot.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(COOPERLAKE) +#if defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #include "sbdot_microk_cooperlake.c" #endif diff --git a/kernel/x86_64/sbgemv_n.c b/kernel/x86_64/sbgemv_n.c index 18e64dc3f..08ccace61 100644 --- a/kernel/x86_64/sbgemv_n.c +++ b/kernel/x86_64/sbgemv_n.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined (COOPERLAKE) +#if defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "sbgemv_n_microk_cooperlake.c" #endif diff --git a/kernel/x86_64/sbgemv_t.c b/kernel/x86_64/sbgemv_t.c index 22b099116..51ea0d937 100644 --- a/kernel/x86_64/sbgemv_t.c +++ b/kernel/x86_64/sbgemv_t.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined (COOPERLAKE) +#if defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "sbgemv_t_microk_cooperlake.c" #endif diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index e816c67e9..a0acea9d1 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sdot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "sdot_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "sdot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c index 2588289d1..badeb0fbf 100644 --- a/kernel/x86_64/sgemm_direct_skylakex.c +++ b/kernel/x86_64/sgemm_direct_skylakex.c @@ -1,7 +1,7 @@ /* the direct sgemm code written by Arjan van der Ven */ #include "common.h" -#if defined(SKYLAKEX) || defined (COOPERLAKE) +#if defined(SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index e0778006f..621ddc622 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
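The sgemm_direct_skylakex.c hunk above extends the no-packing "direct" SGEMM path to the new target; per the 0.3.18 changelog entry earlier in this series, that path serves small problems (M*N*K <= 1000000), where skipping the panel-packing step pays off. A hedged sketch of that kind of size gate — the threshold is taken from the changelog wording, and the actual decision function in the SkylakeX source weighs the dimensions in more detail:

    /* illustrative gate only; not the OpenBLAS heuristic verbatim */
    static int sgemm_prefer_direct(long m, long n, long k)
    {
        return m * n * k <= 1000000;  /* tiny problems skip packing */
    }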
#include "sgemv_n_microk_sandy-4.c" #elif defined(HASWELL) || defined(ZEN) #include "sgemv_n_microk_haswell-4.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "sgemv_n_microk_haswell-4.c" #include "sgemv_n_microk_skylakex-8.c" #endif diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index a36c8ace9..0be2c7e97 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_sandy-4.c" #elif defined(HASWELL) || defined(ZEN) #include "sgemv_t_microk_haswell-4.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "sgemv_t_microk_haswell-4.c" #include "sgemv_t_microk_skylakex.c" #endif diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index c9d698eb7..29d6a9958 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "ssymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 4d8aac1ab..02bbc1c64 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "ssymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index fea4fc746..55780734f 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index b853ef365..77331d95f 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index bad367e91..b61182303 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index 147201751..99bc07d50 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/tobf16.c b/kernel/x86_64/tobf16.c index 3d1796621..a88fdcc2e 100644 --- a/kernel/x86_64/tobf16.c +++ b/kernel/x86_64/tobf16.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif -#if defined(COOPERLAKE) +#if defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #if defined(DOUBLE) #include "dtobf16_microk_cooperlake.c" #elif defined(SINGLE) diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index 25e9f6d42..8786870bd 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "zaxpy_microk_bulldozer-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zaxpy_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zaxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zaxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 1bc785ac1..50c8a2678 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "zdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zdot_microk_sandy-2.c" diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 1f9d41859..2d6866a78 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 34f28b224..c2791e0f3 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 09a702a81..3744c98bb 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index 83ed41ba1..df190c64c 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 7ed2faf0f..cba167f4d 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 5945f3f81..13176ce9c 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 484d74f14..1657885c0 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/param.h b/param.h index e8e49ce43..4e83714d1 100644 --- a/param.h +++ b/param.h @@ -1751,6 +1751,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#ifdef SAPPHIRERAPIDS + +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#define GEMM_PREFERED_SIZE 8 +#else +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 +#endif +#define USE_SGEMM_KERNEL_DIRECT 1 + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 16 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_MN 32 +#define DGEMM_DEFAULT_UNROLL_MN 32 +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 640 +#define DGEMM_DEFAULT_P 192 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 320 +#define DGEMM_DEFAULT_Q 384 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 8640 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 + +#define CGEMM3M_DEFAULT_P 320 +#define ZGEMM3M_DEFAULT_P 256 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 320 +#define ZGEMM3M_DEFAULT_Q 256 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#endif +#endif + #ifdef COOPERLAKE #define SNUMOPT 16 From 4280dff103240ca1aed341eb7b8a2b6de80f978d Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 12 Oct 2021 01:39:09 -0700 Subject: [PATCH 490/681] Add NO_AVX=1 fallbacks to Sapphire Rapids build --- getarch.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/getarch.c b/getarch.c index d64ec3757..d095472a6 100644 --- a/getarch.c +++ b/getarch.c @@ -470,10 +470,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef FORCE_SAPPHIRERAPIDS -#ifdef NO_AVX512 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX512 +#ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -483,10 +504,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" +#endif #else -#define FORCE -#define FORCE_INTEL -#define ARCHITECTURE "X86" #define SUBARCHITECTURE "SAPPHIRERAPIDS" #define ARCHCONFIG "-DSAPPHIRERAPIDS " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ From 1eca91f3151827813e9491ca7ff2c2fa23af75ee Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 12 Oct 2021 02:01:20 -0700 Subject: [PATCH 491/681] Fix build error in legacy gcc --- Makefile.x86_64 | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 15cf202c0..f14a8a8ff 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -81,6 +81,11 @@ CCOMMON_OPT += -march=cooperlake ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=cooperlake endif +else # gcc not support, fallback to avx512 +CCOMMON_OPT += -march=skylake-avx512 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=skylake-avx512 +endif endif endif ifeq ($(OSNAME), CYGWIN_NT) @@ -105,6 +110,11 @@ CCOMMON_OPT += -march=sapphirerapids ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=sapphirerapids endif +else # gcc not support, fallback to avx512 +CCOMMON_OPT += -march=skylake-avx512 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=skylake-avx512 +endif endif endif ifeq ($(OSNAME), CYGWIN_NT) From efd7ac241dc7de94b3a6e660599df795e2320828 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D9=85=D9=87=D8=AF=D9=8A=20=D8=B4=D9=8A=D9=86=D9=88=D9=86?= =?UTF-8?q?=20=28Mehdi=20Chinoune=29?= <79349457+MehdiChinoune@users.noreply.github.com> Date: Sat, 16 Oct 2021 07:55:10 +0100 Subject: [PATCH 492/681] Fix MinGW/Clang 64 bits detection. CMAKE_COMPILER_IS_GNUCC is only valid for GCC. 
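A minimal sketch of the check this change switches to, assuming only that the toolchain honors -dumpmachine (both MinGW GCC and MinGW Clang do); MY_TARGET_TRIPLE is a placeholder variable name for illustration, not part of this patch:

if(MINGW)
  # Ask the compiler for its target triple rather than testing
  # CMAKE_COMPILER_IS_GNUCC, which Clang never sets. GCC typically
  # reports a triple like x86_64-w64-mingw32 and Clang one like
  # x86_64-w64-windows-gnu, so matching on the architecture part
  # covers both toolchains.
  execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
                  OUTPUT_VARIABLE MY_TARGET_TRIPLE
                  OUTPUT_STRIP_TRAILING_WHITESPACE)
  if(MY_TARGET_TRIPLE MATCHES "amd64|x86_64|AMD64")
    set(MINGW64 1)
  endif()
endif()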
--- cmake/system_check.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 8d0558c0e..72c48db37 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -20,11 +20,11 @@ endif() -if(CMAKE_COMPILER_IS_GNUCC AND WIN32) +if(MINGW) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine - OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE + OUTPUT_VARIABLE OPENBLAS_MINGW_TARGET_MACHINE OUTPUT_STRIP_TRAILING_WHITESPACE) - if(OPENBLAS_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") + if(OPENBLAS_MINGW_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") set(MINGW64 1) endif() endif() From 28a77a8698fbe7724ac6d18d7301812975a94b30 Mon Sep 17 00:00:00 2001 From: Mehdi Chinoune Date: Sat, 16 Oct 2021 08:33:47 +0100 Subject: [PATCH 493/681] Support building both static and shared libraries --- CMakeLists.txt | 89 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 71 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a18a7adc3..a7b5569bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,6 +34,14 @@ set(NO_AFFINITY 1) endif() option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) +option(BUILD_STATIC_LIBS "Build static library" OFF) +if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) + set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) +endif() +if((BUILD_STATIC_LIBS AND BUILD_SHARED_LIBS) AND MSVC) + message(WARNING "Could not enable both BUILD_STATIC_LIBS and BUILD_SHARED_LIBS with MSVC, Disable BUILD_SHARED_LIBS") + set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library" FORCE) +endif() # Add a prefix or suffix to all exported symbol names in the shared library. 
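The change compiles each source group once into CMake OBJECT libraries and then links the collected objects into both a static and a shared target. A minimal sketch of that compile-once, link-twice pattern, using placeholder names (core, mylib) rather than the targets this patch actually creates:

# Build the objects once; both library types reuse them. PIC is enabled
# globally in this sketch so the same objects are also valid inside the
# shared library (OpenBLAS manages its compile flags elsewhere).
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_library(core OBJECT a.c b.c)
add_library(mylib_static STATIC $<TARGET_OBJECTS:core>)
add_library(mylib_shared SHARED $<TARGET_OBJECTS:core>)
# Give both libraries the same base name on disk (libmylib.a/libmylib.so).
# MSVC cannot combine the two outputs this way because the DLL's import
# library would also be named mylib.lib, which is why the diff below warns
# and turns BUILD_SHARED_LIBS back off when both are requested there.
set_target_properties(mylib_static mylib_shared PROPERTIES OUTPUT_NAME mylib)

A typical configure step would then be something like:
cmake -DBUILD_STATIC_LIBS=ON -DBUILD_SHARED_LIBS=ON <path-to-source>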
# Avoids conflicts with other BLAS libraries, especially when using
@@ -183,12 +191,45 @@ if (${DYNAMIC_ARCH})
 endif ()
 
 # add objects to the openblas lib
-add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
-target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
+if(NOT NO_LAPACK)
+  add_library(LAPACK OBJECT ${LA_SOURCES})
+  list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK>")
+endif()
+if(NOT NO_LAPACKE)
+  add_library(LAPACKE OBJECT ${LAPACKE_SOURCES})
+  list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACKE>")
+endif()
+if(BUILD_RELAPACK)
+  add_library(RELAPACK OBJECT ${RELA_SOURCES})
+  list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:RELAPACK>")
+endif()
+set(OpenBLAS_LIBS "")
+if(BUILD_STATIC_LIBS)
+  add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
+  target_include_directories(${OpenBLAS_LIBNAME}_static INTERFACE $<INSTALL_INTERFACE:include>)
+  list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_static)
+endif()
+if(BUILD_SHARED_LIBS)
+  add_library(${OpenBLAS_LIBNAME}_shared SHARED ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
+  target_include_directories(${OpenBLAS_LIBNAME}_shared INTERFACE $<INSTALL_INTERFACE:include>)
+  list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_shared)
+endif()
+if(BUILD_STATIC_LIBS)
+  add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_static)
+else()
+  add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_shared)
+endif()
+
+set_target_properties(${OpenBLAS_LIBS} PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME})
 
 # Android needs to explicitly link against libm
 if(ANDROID)
-  target_link_libraries(${OpenBLAS_LIBNAME} m)
+  if(BUILD_STATIC_LIBS)
+    target_link_libraries(${OpenBLAS_LIBNAME}_static m)
+  endif()
+  if(BUILD_SHARED_LIBS)
+    target_link_libraries(${OpenBLAS_LIBNAME}_shared m)
+  endif()
 endif()
 
 # Handle MSVC exports
@@ -197,21 +238,21 @@ if(MSVC AND BUILD_SHARED_LIBS)
     include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
   else()
     # Creates verbose .def file (51KB vs 18KB)
-    set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true)
+    set_target_properties(${OpenBLAS_LIBNAME}_shared PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true)
   endif()
 endif()
 
 # Set output for libopenblas
-set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
-set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
-set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
+set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
+set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
+set_target_properties( ${OpenBLAS_LIBS} PROPERTIES EXPORT_NAME "OpenBLAS")
 
 foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
   string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
-  set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
-  set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
-  set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
+  set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
+  set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
+  set_target_properties( ${OpenBLAS_LIBS} PROPERTIES
ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
 endforeach()
 
 enable_testing()
@@ -220,10 +261,17 @@ if (USE_THREAD)
   # Add threading library to linker
   find_package(Threads)
 
   if (THREADS_HAVE_PTHREAD_ARG)
-    set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY COMPILE_OPTIONS "-pthread")
-    set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY INTERFACE_COMPILE_OPTIONS "-pthread")
+    set_target_properties(${OpenBLAS_LIBS} PROPERTIES
+      COMPILE_OPTIONS "-pthread"
+      INTERFACE_COMPILE_OPTIONS "-pthread"
+    )
+  endif()
+  if(BUILD_STATIC_LIBS)
+    target_link_libraries(${OpenBLAS_LIBNAME}_static ${CMAKE_THREAD_LIBS_INIT})
+  endif()
+  if(BUILD_SHARED_LIBS)
+    target_link_libraries(${OpenBLAS_LIBNAME}_shared ${CMAKE_THREAD_LIBS_INIT})
   endif()
-  target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
 endif()
 
 #if (MSVC OR NOT NOFORTRAN)
@@ -244,14 +292,14 @@ if (NOT NOFORTRAN)
   endif()
 endif()
 
-set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
+set_target_properties(${OpenBLAS_LIBS} PROPERTIES
   VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
   SOVERSION ${OpenBLAS_MAJOR_VERSION}
 )
 
 if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
   if (NOT MSVC)
-    target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
+    target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition")
   else()
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE")
   endif()
@@ -314,7 +362,7 @@ endif()
 if (NOT ${SYMBOLSUFFIX} STREQUAL "")
 message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
 endif()
-  add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD
+  add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
    COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
    COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
    COMMENT "renaming symbols"
@@ -325,7 +373,7 @@ endif()
 # Install project
 
 # Install libraries
-install(TARGETS ${OpenBLAS_LIBNAME}
+install(TARGETS ${OpenBLAS_LIBS}
   EXPORT "OpenBLAS${SUFFIX64}Targets"
   RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
   ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
   LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
 )
 # Install headers
@@ -387,7 +435,12 @@ endif()
 
 if(NOT NO_LAPACKE)
   message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}")
-  add_dependencies( ${OpenBLAS_LIBNAME} genlapacke)
+  if(BUILD_STATIC_LIBS)
+    add_dependencies( ${OpenBLAS_LIBNAME}_static genlapacke)
+  endif()
+  if(BUILD_SHARED_LIBS)
+    add_dependencies( ${OpenBLAS_LIBNAME}_shared genlapacke)
+  endif()
 
   FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h")
   install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

From 556788281dd0da359c93ab5d3832023cf26e50a4 Mon Sep 17 00:00:00 2001
From: Mehdi Chinoune
Date: Sun, 17 Oct 2021 05:19:30 +0100
Subject: [PATCH 494/681] [NFC] Improve CMakeLists.txt file readability

Add some extra lines and indentations.
--- CMakeLists.txt | 228 ++++++++++++++++++++++++++----------------------- 1 file changed, 120 insertions(+), 108 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a7b5569bb..656cc36f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,9 @@ ## cmake_minimum_required(VERSION 2.8.5) + project(OpenBLAS C ASM) + set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) set(OpenBLAS_PATCH_VERSION 18.dev) @@ -20,19 +22,27 @@ endif() ####### if(MSVC) -option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) + option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) endif() + option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) + option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) + option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) + option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) + option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF) + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") -option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) + option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) else() -set(NO_AFFINITY 1) + set(NO_AFFINITY 1) endif() + option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) + option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) option(BUILD_STATIC_LIBS "Build static library" OFF) if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) @@ -46,33 +56,34 @@ endif() # Add a prefix or suffix to all exported symbol names in the shared library. # Avoids conflicts with other BLAS libraries, especially when using # 64 bit integer interfaces in OpenBLAS. - set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" ) + set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) + ####### if(BUILD_WITHOUT_LAPACK) -set(NO_LAPACK 1) -set(NO_LAPACKE 1) + set(NO_LAPACK 1) + set(NO_LAPACKE 1) endif() if(BUILD_WITHOUT_CBLAS) -set(NO_CBLAS 1) + set(NO_CBLAS 1) endif() ####### if(MSVC AND MSVC_STATIC_CRT) - set(CompilerFlags - CMAKE_CXX_FLAGS - CMAKE_CXX_FLAGS_DEBUG - CMAKE_CXX_FLAGS_RELEASE - CMAKE_C_FLAGS - CMAKE_C_FLAGS_DEBUG - CMAKE_C_FLAGS_RELEASE - ) - foreach(CompilerFlag ${CompilerFlags}) - string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") - endforeach() + set(CompilerFlags + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + ) + foreach(CompilerFlag ${CompilerFlags}) + string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") + endforeach() endif() message(WARNING "CMake support is experimental. 
It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
@@ -106,7 +117,7 @@ endif ()
 
 # set which float types we want to build for
 if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
 # if none are defined, build for all
-# set(BUILD_BFLOAT16 true)
+  # set(BUILD_BFLOAT16 true)
   set(BUILD_SINGLE true)
   set(BUILD_DOUBLE true)
   set(BUILD_COMPLEX true)
@@ -151,9 +162,10 @@ endif ()
 set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
 set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
 if(MSVC)
-set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug)
-set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release)
+  set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug)
+  set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release)
 endif ()
+
 # get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
 set(TARGET_OBJS "")
 foreach (SUBDIR ${SUBDIRS})
@@ -287,9 +299,9 @@ if (NOT NOFORTRAN)
     add_subdirectory(ctest)
   endif()
   add_subdirectory(lapack-netlib/TESTING)
-  if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
-  add_subdirectory(cpp_thread_test)
-  endif()
+  if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
+    add_subdirectory(cpp_thread_test)
+  endif()
 endif()
 
 set_target_properties(${OpenBLAS_LIBS} PROPERTIES
@@ -301,72 +313,73 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
   if (NOT MSVC)
     target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition")
   else()
-  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE")
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE")
   endif()
 endif()
 
 if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
-if (NOT DEFINED ARCH)
-  set(ARCH_IN "x86_64")
-else()
-  set(ARCH_IN ${ARCH})
-endif()
+  if (NOT DEFINED ARCH)
+    set(ARCH_IN "x86_64")
+  else()
+    set(ARCH_IN ${ARCH})
+  endif()
 
-if (${CORE} STREQUAL "generic")
-  set(ARCH_IN "GENERIC")
-endif ()
+  if (${CORE} STREQUAL "generic")
+    set(ARCH_IN "GENERIC")
+  endif ()
 
-if (NOT DEFINED EXPRECISION)
-  set(EXPRECISION_IN 0)
-else()
-  set(EXPRECISION_IN ${EXPRECISION})
-endif()
+  if (NOT DEFINED EXPRECISION)
+    set(EXPRECISION_IN 0)
+  else()
+    set(EXPRECISION_IN ${EXPRECISION})
+  endif()
 
-if (NOT DEFINED NO_CBLAS)
-  set(NO_CBLAS_IN 0)
-else()
-  set(NO_CBLAS_IN ${NO_CBLAS})
-endif()
+  if (NOT DEFINED NO_CBLAS)
+    set(NO_CBLAS_IN 0)
+  else()
+    set(NO_CBLAS_IN ${NO_CBLAS})
+  endif()
 
-if (NOT DEFINED NO_LAPACK)
-  set(NO_LAPACK_IN 0)
-else()
-  set(NO_LAPACK_IN ${NO_LAPACK})
-endif()
+  if (NOT DEFINED NO_LAPACK)
+    set(NO_LAPACK_IN 0)
+  else()
+    set(NO_LAPACK_IN ${NO_LAPACK})
+  endif()
 
-if (NOT DEFINED NO_LAPACKE)
-  set(NO_LAPACKE_IN 0)
-else()
-  set(NO_LAPACKE_IN ${NO_LAPACKE})
-endif()
+  if (NOT DEFINED NO_LAPACKE)
+    set(NO_LAPACKE_IN 0)
+  else()
+    set(NO_LAPACKE_IN ${NO_LAPACKE})
+  endif()
 
-if (NOT DEFINED NEED2UNDERSCORES)
-  set(NEED2UNDERSCORES_IN 0)
-else()
-  set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES})
-endif()
+  if (NOT DEFINED NEED2UNDERSCORES)
+    set(NEED2UNDERSCORES_IN 0)
+  else()
+    set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES})
+  endif()
 
-if (NOT DEFINED ONLY_CBLAS)
-  set(ONLY_CBLAS_IN 0)
-else()
-  set(ONLY_CBLAS_IN ${ONLY_CBLAS})
-endif()
+  if (NOT DEFINED ONLY_CBLAS)
+    set(ONLY_CBLAS_IN 0)
+  else()
+    set(ONLY_CBLAS_IN ${ONLY_CBLAS})
+  endif()
 
-if (NOT DEFINED BU)
- 
set(BU _) -endif() + if (NOT DEFINED BU) + set(BU _) + endif() -if (NOT ${SYMBOLPREFIX} STREQUAL "") -message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") -endif() -if (NOT ${SYMBOLSUFFIX} STREQUAL "") -message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") -endif() - add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD - COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def - COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so - COMMENT "renaming symbols" - ) + if (NOT ${SYMBOLPREFIX} STREQUAL "") + message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") + endif() + if (NOT ${SYMBOLSUFFIX} STREQUAL "") + message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") + endif() + + add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD + COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so + COMMENT "renaming symbols" + ) endif() @@ -374,9 +387,9 @@ endif() # Install libraries install(TARGETS ${OpenBLAS_LIBS} - EXPORT "OpenBLAS${SUFFIX64}Targets" - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + EXPORT "OpenBLAS${SUFFIX64}Targets" + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) # Install headers @@ -413,41 +426,41 @@ if(NOT NOFORTRAN) endif() if(NOT NO_CBLAS) - message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) - string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - if (NOT ${SYMBOLPREFIX} STREQUAL "") - string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") - string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") - string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - endif() - if (NOT ${SYMBOLSUFFIX} STREQUAL "") - string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") - string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") - string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - endif() - file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") - install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + message (STATUS "Generating 
cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") + set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) + string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + if (NOT ${SYMBOLPREFIX} STREQUAL "") + string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() + if (NOT ${SYMBOLSUFFIX} STREQUAL "") + string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() + file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") + install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() if(NOT NO_LAPACKE) - message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}") + message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}") if(BUILD_STATIC_LIBS) - add_dependencies( ${OpenBLAS_LIBNAME}_static genlapacke) + add_dependencies( ${OpenBLAS_LIBNAME}_static genlapacke) endif() if(BUILD_SHARED_LIBS) - add_dependencies( ${OpenBLAS_LIBNAME}_shared genlapacke) + add_dependencies( ${OpenBLAS_LIBNAME}_shared genlapacke) endif() - FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h") - install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h") + install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) - ADD_CUSTOM_TARGET(genlapacke - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" - ) - install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) + ADD_CUSTOM_TARGET(genlapacke + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" + ) + install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) endif() # Install pkg-config files @@ -472,4 +485,3 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake install(EXPORT "${PN}${SUFFIX64}Targets" NAMESPACE "${PN}${SUFFIX64}::" DESTINATION ${CMAKECONFIG_INSTALL_DIR}) - From b57acdf2d3c0d71fa457f6e70662f77525a38a68 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 18 Oct 2021 00:26:14 +0200 Subject: [PATCH 495/681] Add march/mtune flags for clang builds on ARM64 as well (#3414) * Add march/mtune flags for clang as well --- Makefile.arm64 | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index 2656a17f9..3e3466de8 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -1,6 +1,9 @@ ifneq ($(C_COMPILER), PGI) -ifneq ($(GCCVERSIONGT4), 1) +ifeq ($(C_COMPILER), CLANG) 
+ISCLANG=1 +endif +ifneq (1, $(filter 1,$(GCCVERSIONGT4) $(ISCLANG))) CCOMMON_OPT += -march=armv8-a ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a @@ -48,7 +51,7 @@ endif # Use a72 tunings because Neoverse-N1 is only available # in GCC>=9 ifeq ($(CORE), NEOVERSEN1) -ifeq ($(GCCVERSIONGTEQ7), 1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) ifeq ($(GCCVERSIONGTEQ9), 1) CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 ifneq ($(F_COMPILER), NAG) @@ -70,7 +73,7 @@ endif # Use a53 tunings because a55 is only available in GCC>=8.1 ifeq ($(CORE), CORTEXA55) -ifeq ($(GCCVERSIONGTEQ7), 1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) ifeq ($(GCCVERSIONGTEQ8), 1) CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 ifneq ($(F_COMPILER), NAG) @@ -132,7 +135,7 @@ FCOMMON_OPT += -march=armv8.3-a endif endif -ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 ifneq ($(F_COMPILER), NAG) From 1d48b7cb168c57285e8c3763b0e0acc204ddbb17 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Mon, 6 Sep 2021 19:48:23 -0700 Subject: [PATCH 496/681] sbgemm: spr: add dummy source files --- kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 11 +++++++++ kernel/x86_64/sbgemm_incopy_16_spr.c | 32 ++++++++++++++++++++++++ kernel/x86_64/sbgemm_itcopy_16_spr.c | 32 ++++++++++++++++++++++++ kernel/x86_64/sbgemm_kernel_16x16_spr.c | 33 +++++++++++++++++++++++++ kernel/x86_64/sbgemm_oncopy_16_spr.c | 32 ++++++++++++++++++++++++ kernel/x86_64/sbgemm_otcopy_16_spr.c | 32 ++++++++++++++++++++++++ 6 files changed, 172 insertions(+) create mode 100644 kernel/x86_64/sbgemm_incopy_16_spr.c create mode 100644 kernel/x86_64/sbgemm_itcopy_16_spr.c create mode 100644 kernel/x86_64/sbgemm_kernel_16x16_spr.c create mode 100644 kernel/x86_64/sbgemm_oncopy_16_spr.c create mode 100644 kernel/x86_64/sbgemm_otcopy_16_spr.c diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS index 61965c745..bee624b04 100644 --- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS +++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS @@ -1 +1,12 @@ include $(KERNELDIR)/KERNEL.COOPERLAKE + +SBGEMM_BETA = sgemm_beta_skylakex.c +SBGEMMKERNEL = sbgemm_kernel_16x16_spr.c +SBGEMMINCOPY = sbgemm_incopy_16_spr.c +SBGEMMITCOPY = sbgemm_itcopy_16_spr.c +SBGEMMONCOPY = sbgemm_oncopy_16_spr.c +SBGEMMOTCOPY = sbgemm_otcopy_16_spr.c +SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) +SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) +SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) +SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/sbgemm_incopy_16_spr.c b/kernel/x86_64/sbgemm_incopy_16_spr.c new file mode 100644 index 000000000..2f57ae7b6 --- /dev/null +++ b/kernel/x86_64/sbgemm_incopy_16_spr.c @@ -0,0 +1,32 @@ +/*************************************************************************** + * Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. 
Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + return 0; +} diff --git a/kernel/x86_64/sbgemm_itcopy_16_spr.c b/kernel/x86_64/sbgemm_itcopy_16_spr.c new file mode 100644 index 000000000..2f57ae7b6 --- /dev/null +++ b/kernel/x86_64/sbgemm_itcopy_16_spr.c @@ -0,0 +1,32 @@ +/*************************************************************************** + * Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * *****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + return 0; +} diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr.c b/kernel/x86_64/sbgemm_kernel_16x16_spr.c new file mode 100644 index 000000000..51f44ba4a --- /dev/null +++ b/kernel/x86_64/sbgemm_kernel_16x16_spr.c @@ -0,0 +1,33 @@ +/*************************************************************************** + * Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include "common.h" + +int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +{ + return 0; +} diff --git a/kernel/x86_64/sbgemm_oncopy_16_spr.c b/kernel/x86_64/sbgemm_oncopy_16_spr.c new file mode 100644 index 000000000..2f57ae7b6 --- /dev/null +++ b/kernel/x86_64/sbgemm_oncopy_16_spr.c @@ -0,0 +1,32 @@ +/*************************************************************************** + * Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + return 0; +} diff --git a/kernel/x86_64/sbgemm_otcopy_16_spr.c b/kernel/x86_64/sbgemm_otcopy_16_spr.c new file mode 100644 index 000000000..2f57ae7b6 --- /dev/null +++ b/kernel/x86_64/sbgemm_otcopy_16_spr.c @@ -0,0 +1,32 @@ +/*************************************************************************** + * Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * *****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + return 0; +} From d0b253ac6eb639cdcce1c70d28d64b23e1fbaa92 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 8 Sep 2021 19:41:12 -0700 Subject: [PATCH 497/681] sbgemm: spr: implement oncopy_16 --- kernel/x86_64/sbgemm_oncopy_16_spr.c | 145 +++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) diff --git a/kernel/x86_64/sbgemm_oncopy_16_spr.c b/kernel/x86_64/sbgemm_oncopy_16_spr.c index 2f57ae7b6..da353d2c7 100644 --- a/kernel/x86_64/sbgemm_oncopy_16_spr.c +++ b/kernel/x86_64/sbgemm_oncopy_16_spr.c @@ -25,8 +25,153 @@ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * *****************************************************************************/ +#include #include "common.h" +#define COPY_32(N) _mm512_storeu_si512(boffset + 32 * N, _mm512_loadu_si512(aoffset##N + i)) +#define MASK_COPY_32(N) _mm512_mask_storeu_epi16(boffset + tail_m * N, mmask, _mm512_maskz_loadu_epi16(mmask, aoffset##N + i)) +#define COPY_ODD_TAIL(N) *(boffset + N) = *(aoffset##N + i); + int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + printf("ONCOPY: m %d, n %d, lda %d\n", m, n, lda); + BLASLONG i, j; + IFLOAT *aoffset, *boffset; + IFLOAT *aoffset0, *aoffset1, *aoffset2, *aoffset3; + IFLOAT *aoffset4, *aoffset5, *aoffset6, *aoffset7; + IFLOAT *aoffset8, *aoffset9, *aoffset10, *aoffset11; + IFLOAT *aoffset12, *aoffset13, *aoffset14, *aoffset15; + + aoffset = a; + boffset = b; + + BLASLONG n16 = n & ~15; + BLASLONG m32 = m & ~31; + BLASLONG m2 = m & ~1; + + for (j = 0; j < n16; j += 16) { + aoffset0 = aoffset; + aoffset1 = aoffset0 + lda; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset9 = aoffset8 + lda; + aoffset10 = aoffset9 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset += 16 * lda; + for (i = 0; i < m32; i += 32) { + COPY_32(0); COPY_32(1); COPY_32(2); COPY_32(3); + COPY_32(4); COPY_32(5); COPY_32(6); COPY_32(7); + COPY_32(8); COPY_32(9); COPY_32(10); COPY_32(11); + COPY_32(12); COPY_32(13); COPY_32(14); COPY_32(15); + boffset += 32 * 16; + } + if (i < m2) { + int tail_m = m2 - i; + __mmask32 mmask = (1UL << tail_m) - 1; + MASK_COPY_32(0); MASK_COPY_32(1); MASK_COPY_32(2); MASK_COPY_32(3); + MASK_COPY_32(4); MASK_COPY_32(5); MASK_COPY_32(6); MASK_COPY_32(7); + MASK_COPY_32(8); MASK_COPY_32(9); MASK_COPY_32(10); MASK_COPY_32(11); + MASK_COPY_32(12); MASK_COPY_32(13); MASK_COPY_32(14); MASK_COPY_32(15); + i = m2; + boffset += tail_m * 16; + } + if (i < m) { + /* the tail odd k should put alone */ + COPY_ODD_TAIL(0); COPY_ODD_TAIL(1); COPY_ODD_TAIL(2); COPY_ODD_TAIL(3); + COPY_ODD_TAIL(4); COPY_ODD_TAIL(5); COPY_ODD_TAIL(6); COPY_ODD_TAIL(7); + COPY_ODD_TAIL(8); COPY_ODD_TAIL(9); COPY_ODD_TAIL(10); COPY_ODD_TAIL(11); + COPY_ODD_TAIL(12); COPY_ODD_TAIL(13); COPY_ODD_TAIL(14); COPY_ODD_TAIL(15); + boffset += 16; + } + } + if (j < n) { + int remain_n = n - j; + aoffset0 = aoffset; + aoffset1 = aoffset0 + lda; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = 
aoffset7 + lda; + aoffset9 = aoffset8 + lda; + aoffset10 = aoffset9 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + for (i = 0; i < m32; i += 32) { + switch(remain_n) { + case 15: COPY_32(14); + case 14: COPY_32(13); + case 13: COPY_32(12); + case 12: COPY_32(11); + case 11: COPY_32(10); + case 10: COPY_32(9); + case 9: COPY_32(8); + case 8: COPY_32(7); + case 7: COPY_32(6); + case 6: COPY_32(5); + case 5: COPY_32(4); + case 4: COPY_32(3); + case 3: COPY_32(2); + case 2: COPY_32(1); + case 1: COPY_32(0); + } + boffset += 32 * remain_n; + } + if (i < m2) { + int tail_m = m2 - i; + __mmask32 mmask = (1UL << tail_m) - 1; + switch(remain_n) { + case 15: MASK_COPY_32(14); + case 14: MASK_COPY_32(13); + case 13: MASK_COPY_32(12); + case 12: MASK_COPY_32(11); + case 11: MASK_COPY_32(10); + case 10: MASK_COPY_32(9); + case 9: MASK_COPY_32(8); + case 8: MASK_COPY_32(7); + case 7: MASK_COPY_32(6); + case 6: MASK_COPY_32(5); + case 5: MASK_COPY_32(4); + case 4: MASK_COPY_32(3); + case 3: MASK_COPY_32(2); + case 2: MASK_COPY_32(1); + case 1: MASK_COPY_32(0); + } + i = m2; + boffset += tail_m * remain_n; + } + if (i < m) { + switch(remain_n) { + case 15: COPY_ODD_TAIL(14); + case 14: COPY_ODD_TAIL(13); + case 13: COPY_ODD_TAIL(12); + case 12: COPY_ODD_TAIL(11); + case 11: COPY_ODD_TAIL(10); + case 10: COPY_ODD_TAIL(9); + case 9: COPY_ODD_TAIL(8); + case 8: COPY_ODD_TAIL(7); + case 7: COPY_ODD_TAIL(6); + case 6: COPY_ODD_TAIL(5); + case 5: COPY_ODD_TAIL(4); + case 4: COPY_ODD_TAIL(3); + case 3: COPY_ODD_TAIL(2); + case 2: COPY_ODD_TAIL(1); + case 1: COPY_ODD_TAIL(0); + } + } + } return 0; } From 6051c867418bb0ddc762647155637365c5a92a24 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 10 Sep 2021 01:14:05 -0700 Subject: [PATCH 498/681] sbgemm: spr: kernel works for m32 in NN case --- kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 2 +- kernel/x86_64/sbgemm_itcopy_16_spr.c | 32 --- kernel/x86_64/sbgemm_kernel_16x16_spr.c | 252 +++++++++++++++++++++++- 3 files changed, 252 insertions(+), 34 deletions(-) delete mode 100644 kernel/x86_64/sbgemm_itcopy_16_spr.c diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS index bee624b04..3f67640cb 100644 --- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS +++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS @@ -3,7 +3,7 @@ include $(KERNELDIR)/KERNEL.COOPERLAKE SBGEMM_BETA = sgemm_beta_skylakex.c SBGEMMKERNEL = sbgemm_kernel_16x16_spr.c SBGEMMINCOPY = sbgemm_incopy_16_spr.c -SBGEMMITCOPY = sbgemm_itcopy_16_spr.c +SBGEMMITCOPY = sbgemm_tcopy_16_cooperlake.c SBGEMMONCOPY = sbgemm_oncopy_16_spr.c SBGEMMOTCOPY = sbgemm_otcopy_16_spr.c SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/x86_64/sbgemm_itcopy_16_spr.c b/kernel/x86_64/sbgemm_itcopy_16_spr.c deleted file mode 100644 index 2f57ae7b6..000000000 --- a/kernel/x86_64/sbgemm_itcopy_16_spr.c +++ /dev/null @@ -1,32 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2021, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * 3. Neither the name of the OpenBLAS project nor the names of
- * its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
- * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- * *****************************************************************************/
-
-#include "common.h"
-
-int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
-  return 0;
-}
diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr.c b/kernel/x86_64/sbgemm_kernel_16x16_spr.c
index 51f44ba4a..41d2634d5 100644
--- a/kernel/x86_64/sbgemm_kernel_16x16_spr.c
+++ b/kernel/x86_64/sbgemm_kernel_16x16_spr.c
@@ -25,9 +25,259 @@
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * *****************************************************************************/
 
+#include <immintrin.h>
+#include <string.h>
 #include "common.h"
 
-int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc)
+typedef struct {
+  char palette_id;
+  char start_row;
+  char dummy0[14];  // bytes 2-15 reserved, must be zero
+  short tile_colsb[8];
+  char dummy1[16];  // bytes 32-47 reserved, must be zero
+  char tile_rows[8];
+  char dummy2[16];  // bytes 56-63 reserved, must be zero
+} tilecfg;
+
+/* tile0/tile1 -- A (m x 2k)
+ * tile2/tile3 -- B (2k x n)
+ * tile4-7 -- C (m x n)
+ */
+#define TCONF(cfg, m, n, k2) \
+  memset(&cfg, 0, sizeof(tilecfg)); \
+  cfg.palette_id = 1; \
+  cfg.tile_rows[0] = m; \
+  cfg.tile_rows[1] = m; \
+  cfg.tile_rows[2] = k2>>1; \
+  cfg.tile_rows[3] = k2>>1; \
+  cfg.tile_rows[4] = m; \
+  cfg.tile_rows[5] = m; \
+  cfg.tile_rows[6] = m; \
+  cfg.tile_rows[7] = m; \
+  cfg.tile_colsb[0] = k2<<1; \
+  cfg.tile_colsb[1] = k2<<1; \
+  cfg.tile_colsb[2] = n * 4; \
+  cfg.tile_colsb[3] = n * 4; \
+  cfg.tile_colsb[4] = n * 4; \
+  cfg.tile_colsb[5] = n * 4; \
+  cfg.tile_colsb[6] = n * 4; \
+  cfg.tile_colsb[7] = n * 4; \
+  _tile_loadconfig(&cfg);
+
+#define T_A0 0
+#define T_A1 1
+#define T_B0 2
+#define T_B1 3
+#define T_C00 4
+#define T_C01 5
+#define T_C10 6
+#define T_C11 7
+
+// FIXME: gcc11 seem have problem in tile load/store address calc,
+// need to multiply with element size (2 or 4) here.
+#define LOAD_A(M, N) _tile_loadd(T_A##M, ptr_a##M, lda * 2) +#define LOAD_A_TAIL(M, N) {\ + __m256i ymm = _mm256_loadu_epi16(ptr_a##M); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ + _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ +} +#define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2) +#define LOAD_B_TAIL(M, N) {\ + __m256i ymm = _mm256_loadu_epi16(ptr_b##N); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ + _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ +} +#define MASK_LOAD_B_TAIL(M, N) {\ + __m256i ymm = _mm256_maskz_loadu_epi16(bmask, ptr_b##N); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ + _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ +} + +#define LOAD_C(M, N) _tile_loadd(T_C##M##N, ptr_c##M##N, ldc * 4) +#define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N) +#define STORE_C(M, N) _tile_stored(T_C##M##N, ptr_c##M##N, ldc * 4) + + +int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOAT * iB, FLOAT * C, BLASLONG ldc) { + /* transport to Row Major matrix for AMX requirement */ + BLASLONG m, n; + IFLOAT *A, *B; + m = in; + n = im; + A = iB; + B = iA; + + printf("kernel: m %d, n %d, k %d, ldc: %d\n", m, n, k, ldc); + IFLOAT *ptr_a = A, *ptr_b = B; + IFLOAT *ptr_b0, *ptr_b1; + IFLOAT *ptr_a0, *ptr_a1; + FLOAT *ptr_c = C; + FLOAT *ptr_c00, *ptr_c01, *ptr_c10, *ptr_c11; + + BLASLONG lda, ldb; + BLASLONG m_count = m; + BLASLONG n_count, k_count; + + IFLOAT tail_a[32 * 2] __attribute__ ((aligned (64))); + IFLOAT tail_b[32 * 2] __attribute__ ((aligned (64))); + tilecfg cfg; + + for (; m_count > 31; m_count -= 32) { + ptr_b = B; + + ptr_c00 = ptr_c; + ptr_c01 = ptr_c00 + 16; + ptr_c10 = ptr_c + 16 * ldc; + ptr_c11 = ptr_c10 + 16; + ptr_c += 32 * ldc; + n_count = n; + for (; n_count > 31; n_count -= 32) { + ptr_a0 = ptr_a; + ptr_a1 = ptr_a + 16 * k; + + ptr_b0 = ptr_b; + ptr_b1 = ptr_b + 16 * k; + ptr_b += 32 * k; + + lda = 32; + ldb = 32; + TCONF(cfg, 16, 16, 32); + LOAD_C(0, 0); LOAD_C(0, 1); + LOAD_C(1, 0); LOAD_C(1, 1); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); LOAD_A(1, x); + ptr_a0 += 16 * 32; + ptr_a1 += 16 * 32; + LOAD_B(x, 0); LOAD_B(x, 1); + ptr_b0 += 16 * 32; + ptr_b1 += 16 * 32; + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + } + STORE_C(0, 0); STORE_C(0, 1); + STORE_C(1, 0); STORE_C(1, 1); + if (k_count > 1) { + /* still have more than 2*k */ + int remain_k2 = k_count & ~1; + k_count -= remain_k2; + lda = remain_k2; + TCONF(cfg, 16, 16, remain_k2); + /* reconfig will clear all tiles, + * need to store/load again + */ + LOAD_C(0, 0); LOAD_C(0, 1); + LOAD_C(1, 0); LOAD_C(1, 1); + + LOAD_A(0, x); LOAD_A(1, x); + ptr_a0 += 16 * remain_k2; + ptr_a1 += 16 * remain_k2; + LOAD_B(x, 0); LOAD_B(x, 1); + ptr_b0 += 16 * remain_k2; + ptr_b1 += 16 * remain_k2; + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + + STORE_C(0, 0); STORE_C(0, 1); + STORE_C(1, 0); STORE_C(1, 1); + } + if (k_count > 0) { + /* still have odd tail k, need to transform into 2*k */ + TCONF(cfg, 16, 16, 2); + + LOAD_C(0, 0); LOAD_C(0, 1); + LOAD_C(1, 0); LOAD_C(1, 1); + + LOAD_A_TAIL(0, x); LOAD_A_TAIL(1, x); + LOAD_B_TAIL(x, 0); LOAD_B_TAIL(x, 1); + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + + STORE_C(0, 0); STORE_C(0, 1); + STORE_C(1, 0); STORE_C(1, 1); + } + ptr_c00 += 32; + ptr_c01 += 32; + ptr_c10 += 32; + ptr_c11 += 32; + } + for (; n_count > 
0; n_count -= 16) { + int tail_n = (n_count > 16) ? 16: n_count; + __mmask16 bmask = (1UL << tail_n) - 1; + ptr_a0 = ptr_a; + ptr_a1 = ptr_a + 16 * k; + + ptr_b0 = ptr_b; + ptr_b += tail_n * k; + + lda = 32; + ldb = 2 * tail_n; + TCONF(cfg, 16, tail_n, 32); + LOAD_C(0, 0); + LOAD_C(1, 0); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); LOAD_A(1, x); + ptr_a0 += 16 * 32; + ptr_a1 += 16 * 32; + LOAD_B(x, 0); + ptr_b0 += tail_n * 32; + + MATMUL(0, 0); + MATMUL(1, 0); + } + STORE_C(0, 0); + STORE_C(1, 0); + if (k_count > 1) { + /* still have more than 2*k */ + int remain_k2 = k_count & ~1; + k_count -= remain_k2; + lda = remain_k2; + TCONF(cfg, 16, tail_n, remain_k2); + /* reconfig will clear all tiles, + * need to store/load again + */ + LOAD_C(0, 0); + LOAD_C(1, 0); + + LOAD_A(0, x); LOAD_A(1, x); + ptr_a0 += 16 * remain_k2; + ptr_a1 += 16 * remain_k2; + LOAD_B(x, 0); + ptr_b0 += tail_n * remain_k2; + + MATMUL(0, 0); + MATMUL(1, 0); + + STORE_C(0, 0); + STORE_C(1, 0); + } + if (k_count > 0) { + /* still have odd tail k, need to transform into 2*k */ + TCONF(cfg, 16, tail_n, 2); + + LOAD_C(0, 0); + LOAD_C(1, 0); + + LOAD_A_TAIL(0, x); LOAD_A_TAIL(1, x); + MASK_LOAD_B_TAIL(x, 0); + MATMUL(0, 0); + MATMUL(1, 0); + + STORE_C(0, 0); + STORE_C(1, 0); + } + ptr_c00 += tail_n; + ptr_c10 += tail_n; + } + ptr_a += 32 * k; + } return 0; } From a70bfb52d519ced4e6b0d633bfb95d3f4d332fe5 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Sun, 12 Sep 2021 19:22:58 -0700 Subject: [PATCH 499/681] sbgemm: spr: kernel works for NN case when alpha is 1.0 --- kernel/x86_64/sbgemm_kernel_16x16_spr.c | 135 +++++++++++++++++++++++- kernel/x86_64/sbgemm_oncopy_16_spr.c | 2 +- 2 files changed, 135 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr.c b/kernel/x86_64/sbgemm_kernel_16x16_spr.c index 41d2634d5..b7b4e36a3 100644 --- a/kernel/x86_64/sbgemm_kernel_16x16_spr.c +++ b/kernel/x86_64/sbgemm_kernel_16x16_spr.c @@ -82,6 +82,12 @@ typedef struct { _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ } +#define MASK_LOAD_A_TAIL(M, N) {\ + __m256i ymm = _mm256_maskz_loadu_epi16(amask, ptr_a##M); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ + _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ +} #define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2) #define LOAD_B_TAIL(M, N) {\ __m256i ymm = _mm256_loadu_epi16(ptr_b##N); \ @@ -111,7 +117,6 @@ int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOA A = iB; B = iA; - printf("kernel: m %d, n %d, k %d, ldc: %d\n", m, n, k, ldc); IFLOAT *ptr_a = A, *ptr_b = B; IFLOAT *ptr_b0, *ptr_b1; IFLOAT *ptr_a0, *ptr_a1; @@ -279,5 +284,133 @@ int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOA } ptr_a += 32 * k; } + for (; m_count > 0; m_count -= 16) { + // process at most 16 m at a time + int tail_m = (m_count > 16) ? 
16: m_count; + __mmask16 amask = (1UL << tail_m) - 1; + + ptr_b = B; + + ptr_c00 = ptr_c; + ptr_c01 = ptr_c00 + 16; + ptr_c += tail_m * ldc; + n_count = n; + for (; n_count > 31; n_count -= 32) { + ptr_a0 = ptr_a; + + ptr_b0 = ptr_b; + ptr_b1 = ptr_b + 16 * k; + ptr_b += 32 * k; + + lda = 32; + ldb = 32; + TCONF(cfg, tail_m, 16, 32); + LOAD_C(0, 0); LOAD_C(0, 1); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); + ptr_a0 += tail_m * 32; + LOAD_B(x, 0); LOAD_B(x, 1); + ptr_b0 += 16 * 32; + ptr_b1 += 16 * 32; + + MATMUL(0, 0); MATMUL(0, 1); + } + STORE_C(0, 0); STORE_C(0, 1); + if (k_count > 1) { + /* still have more than 2*k */ + int remain_k2 = k_count & ~1; + k_count -= remain_k2; + lda = remain_k2; + TCONF(cfg, tail_m, 16, remain_k2); + /* reconfig will clear all tiles, + * need to store/load again + */ + LOAD_C(0, 0); LOAD_C(0, 1); + + LOAD_A(0, x); + ptr_a0 += tail_m * remain_k2; + LOAD_B(x, 0); LOAD_B(x, 1); + ptr_b0 += 16 * remain_k2; + ptr_b1 += 16 * remain_k2; + + MATMUL(0, 0); MATMUL(0, 1); + + STORE_C(0, 0); STORE_C(0, 1); + } + if (k_count > 0) { + /* still have odd tail k, need to transform into 2*k */ + TCONF(cfg, tail_m, 16, 2); + + LOAD_C(0, 0); LOAD_C(0, 1); + + MASK_LOAD_A_TAIL(0, x); + LOAD_B_TAIL(x, 0); LOAD_B_TAIL(x, 1); + + MATMUL(0, 0); MATMUL(0, 1); + + STORE_C(0, 0); STORE_C(0, 1); + } + ptr_c00 += 32; + ptr_c01 += 32; + } + for (; n_count > 0; n_count -= 16) { + int tail_n = (n_count > 16) ? 16: n_count; + __mmask16 bmask = (1UL << tail_n) - 1; + ptr_a0 = ptr_a; + + ptr_b0 = ptr_b; + ptr_b += tail_n * k; + + lda = 32; + ldb = 2 * tail_n; + TCONF(cfg, tail_m, tail_n, 32); + LOAD_C(0, 0); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); + ptr_a0 += tail_m * 32; + LOAD_B(x, 0); + ptr_b0 += tail_n * 32; + + MATMUL(0, 0); + } + STORE_C(0, 0); + if (k_count > 1) { + /* still have more than 2*k */ + int remain_k2 = k_count & ~1; + k_count -= remain_k2; + lda = remain_k2; + TCONF(cfg, tail_m, tail_n, remain_k2); + /* reconfig will clear all tiles, + * need to store/load again + */ + LOAD_C(0, 0); + + LOAD_A(0, x); + ptr_a0 += tail_m * remain_k2; + LOAD_B(x, 0); + ptr_b0 += tail_n * remain_k2; + + MATMUL(0, 0); + + STORE_C(0, 0); + } + if (k_count > 0) { + /* still have odd tail k, need to transform into 2*k */ + TCONF(cfg, tail_m, tail_n, 2); + + LOAD_C(0, 0); + + MASK_LOAD_A_TAIL(0, x); + MASK_LOAD_B_TAIL(x, 0); + MATMUL(0, 0); + + STORE_C(0, 0); + } + ptr_c00 += tail_n; + } + ptr_a += tail_m * k; + } return 0; } diff --git a/kernel/x86_64/sbgemm_oncopy_16_spr.c b/kernel/x86_64/sbgemm_oncopy_16_spr.c index da353d2c7..f5668e26e 100644 --- a/kernel/x86_64/sbgemm_oncopy_16_spr.c +++ b/kernel/x86_64/sbgemm_oncopy_16_spr.c @@ -32,8 +32,8 @@ #define MASK_COPY_32(N) _mm512_mask_storeu_epi16(boffset + tail_m * N, mmask, _mm512_maskz_loadu_epi16(mmask, aoffset##N + i)) #define COPY_ODD_TAIL(N) *(boffset + N) = *(aoffset##N + i); + int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { - printf("ONCOPY: m %d, n %d, lda %d\n", m, n, lda); BLASLONG i, j; IFLOAT *aoffset, *boffset; IFLOAT *aoffset0, *aoffset1, *aoffset2, *aoffset3; From 0abbcd19c1589bac3e5d1eae5d87a40535b26510 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Mon, 13 Sep 2021 01:44:53 -0700 Subject: [PATCH 500/681] sbgemm: spr: tuning for blocking params --- param.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/param.h b/param.h index 4e83714d1..c2c6916bc 100644 --- a/param.h +++ b/param.h @@ -1771,6 +1771,20 @@ USE OF THIS SOFTWARE, EVEN 
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #define  USE_SGEMM_KERNEL_DIRECT 1
 
+#undef SBGEMM_DEFAULT_UNROLL_N
+#undef SBGEMM_DEFAULT_UNROLL_M
+#undef SBGEMM_DEFAULT_P
+#undef SBGEMM_DEFAULT_R
+#undef SBGEMM_DEFAULT_Q
+// FIXME: actually UNROLL_M = UNROLL_N = 16
+// If M and N are equal, OpenBLAS will reuse OCOPY as ICOPY.
+// But for AMX, they are not the same, so set UNROLL_M = 32 as a workaround
+#define SBGEMM_DEFAULT_UNROLL_N 16
+#define SBGEMM_DEFAULT_UNROLL_M 32
+#define SBGEMM_DEFAULT_P 192
+#define SBGEMM_DEFAULT_Q 1024
+#define SBGEMM_DEFAULT_R sbgemm_r
+
 #ifdef ARCH_X86
 
 #define SGEMM_DEFAULT_UNROLL_M 4

From 88154ed02d14aa6b9048dbcbe29a854d5a691713 Mon Sep 17 00:00:00 2001
From: Wangyang Guo
Date: Wed, 15 Sep 2021 01:11:15 -0700
Subject: [PATCH 501/681] sbgemm: spr: reduce tile conf loading by separating
 tail k handling

---
 kernel/x86_64/sbgemm_kernel_16x16_spr.c | 216 +++++++++---------------
 1 file changed, 80 insertions(+), 136 deletions(-)

diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr.c b/kernel/x86_64/sbgemm_kernel_16x16_spr.c
index b7b4e36a3..112adbe1c 100644
--- a/kernel/x86_64/sbgemm_kernel_16x16_spr.c
+++ b/kernel/x86_64/sbgemm_kernel_16x16_spr.c
@@ -127,10 +127,14 @@ int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOA
   BLASLONG m_count = m;
   BLASLONG n_count, k_count;
 
+
   IFLOAT tail_a[32 * 2] __attribute__ ((aligned (64)));
   IFLOAT tail_b[32 * 2] __attribute__ ((aligned (64)));
   tilecfg cfg;
 
+  if (k < 32)
+    goto tail_k;
+
   for (; m_count > 31; m_count -= 32) {
     ptr_b = B;
 
@@ -140,6 +144,7 @@ int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOA
     ptr_c11 = ptr_c10 + 16;
     ptr_c += 32 * ldc;
     n_count = n;
+    TCONF(cfg, 16, 16, 32);
     for (; n_count > 31; n_count -= 32) {
       ptr_a0 = ptr_a;
       ptr_a1 = ptr_a + 16 * k;
@@ -150,7 +155,6 @@ int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOA
 
       lda = 32;
       ldb = 32;
-      TCONF(cfg, 16, 16, 32);
       LOAD_C(0, 0); LOAD_C(0, 1);
       LOAD_C(1, 0); LOAD_C(1, 1);
       k_count = k;
@@ -167,47 +171,6 @@ int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOA
       }
       STORE_C(0, 0); STORE_C(0, 1);
       STORE_C(1, 0); STORE_C(1, 1);
-      if (k_count > 1) {
-        /* still have more than 2*k */
-        int remain_k2 = k_count & ~1;
-        k_count -= remain_k2;
-        lda = remain_k2;
-        TCONF(cfg, 16, 16, remain_k2);
-        /* reconfig will clear all tiles,
-         * need to store/load again
-         */
-        LOAD_C(0, 0); LOAD_C(0, 1);
-        LOAD_C(1, 0); LOAD_C(1, 1);
-
-        LOAD_A(0, x); LOAD_A(1, x);
-        ptr_a0 += 16 * remain_k2;
-        ptr_a1 += 16 * remain_k2;
-        LOAD_B(x, 0); LOAD_B(x, 1);
-        ptr_b0 += 16 * remain_k2;
-        ptr_b1 += 16 * remain_k2;
-
-        MATMUL(0, 0); MATMUL(0, 1);
-        MATMUL(1, 0); MATMUL(1, 1);
-
-        STORE_C(0, 0); STORE_C(0, 1);
-        STORE_C(1, 0); STORE_C(1, 1);
-      }
-      if (k_count > 0) {
-        /* still have odd tail k, need to transform into 2*k */
-        TCONF(cfg, 16, 16, 2);
-
-        LOAD_C(0, 0); LOAD_C(0, 1);
-        LOAD_C(1, 0); LOAD_C(1, 1);
-
-        LOAD_A_TAIL(0, x); LOAD_A_TAIL(1, x);
-        LOAD_B_TAIL(x, 0); LOAD_B_TAIL(x, 1);
-
-        MATMUL(0, 0); MATMUL(0, 1);
-        MATMUL(1, 0); MATMUL(1, 1);
-
-        STORE_C(0, 0); STORE_C(0, 1);
-        STORE_C(1, 0); STORE_C(1, 1);
-      }
       ptr_c00 += 32;
       ptr_c01 += 32;
       ptr_c10 += 32;
       ptr_c11 += 32;
@@ -240,45 +203,6 @@ int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOA
       }
       STORE_C(0, 0);
       STORE_C(1, 0);
-      if (k_count > 1) {
-        /* still have more than 2*k */
-        int remain_k2 = k_count & ~1;
-        k_count -= remain_k2;
-        lda = remain_k2;
-        TCONF(cfg, 16, tail_n, remain_k2);
-        /* reconfig will clear all tiles,
-         * need
to store/load again - */ - LOAD_C(0, 0); - LOAD_C(1, 0); - - LOAD_A(0, x); LOAD_A(1, x); - ptr_a0 += 16 * remain_k2; - ptr_a1 += 16 * remain_k2; - LOAD_B(x, 0); - ptr_b0 += tail_n * remain_k2; - - MATMUL(0, 0); - MATMUL(1, 0); - - STORE_C(0, 0); - STORE_C(1, 0); - } - if (k_count > 0) { - /* still have odd tail k, need to transform into 2*k */ - TCONF(cfg, 16, tail_n, 2); - - LOAD_C(0, 0); - LOAD_C(1, 0); - - LOAD_A_TAIL(0, x); LOAD_A_TAIL(1, x); - MASK_LOAD_B_TAIL(x, 0); - MATMUL(0, 0); - MATMUL(1, 0); - - STORE_C(0, 0); - STORE_C(1, 0); - } ptr_c00 += tail_n; ptr_c10 += tail_n; } @@ -295,6 +219,7 @@ int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOA ptr_c01 = ptr_c00 + 16; ptr_c += tail_m * ldc; n_count = n; + TCONF(cfg, tail_m, 16, 32); for (; n_count > 31; n_count -= 32) { ptr_a0 = ptr_a; @@ -304,7 +229,6 @@ int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOA lda = 32; ldb = 32; - TCONF(cfg, tail_m, 16, 32); LOAD_C(0, 0); LOAD_C(0, 1); k_count = k; for (; k_count > 31; k_count -= 32) { @@ -317,40 +241,6 @@ int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOA MATMUL(0, 0); MATMUL(0, 1); } STORE_C(0, 0); STORE_C(0, 1); - if (k_count > 1) { - /* still have more than 2*k */ - int remain_k2 = k_count & ~1; - k_count -= remain_k2; - lda = remain_k2; - TCONF(cfg, tail_m, 16, remain_k2); - /* reconfig will clear all tiles, - * need to store/load again - */ - LOAD_C(0, 0); LOAD_C(0, 1); - - LOAD_A(0, x); - ptr_a0 += tail_m * remain_k2; - LOAD_B(x, 0); LOAD_B(x, 1); - ptr_b0 += 16 * remain_k2; - ptr_b1 += 16 * remain_k2; - - MATMUL(0, 0); MATMUL(0, 1); - - STORE_C(0, 0); STORE_C(0, 1); - } - if (k_count > 0) { - /* still have odd tail k, need to transform into 2*k */ - TCONF(cfg, tail_m, 16, 2); - - LOAD_C(0, 0); LOAD_C(0, 1); - - MASK_LOAD_A_TAIL(0, x); - LOAD_B_TAIL(x, 0); LOAD_B_TAIL(x, 1); - - MATMUL(0, 0); MATMUL(0, 1); - - STORE_C(0, 0); STORE_C(0, 1); - } ptr_c00 += 32; ptr_c01 += 32; } @@ -376,41 +266,95 @@ int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOA MATMUL(0, 0); } STORE_C(0, 0); - if (k_count > 1) { - /* still have more than 2*k */ - int remain_k2 = k_count & ~1; - k_count -= remain_k2; - lda = remain_k2; + ptr_c00 += tail_n; + } + ptr_a += tail_m * k; + } + +tail_k: + // process for k < 32 + BLASLONG k32 = k & ~31; + BLASLONG k2 = k & ~1; + int remain_k2 = k2 - k32; + if (remain_k2 > 0) { + m_count = m; + ptr_a = A; + ptr_c = C; + for (; m_count > 0; m_count -= 16) { + int tail_m = (m_count > 16) ? 16: m_count; + __mmask16 amask = (1UL << tail_m) - 1; + + ptr_a0 = ptr_a + tail_m * k32; + ptr_a += tail_m * k; + ptr_b = B; + ptr_c00 = ptr_c; + ptr_c += tail_m * ldc; + n_count = n; + lda = remain_k2; + ldb = 32; + TCONF(cfg, tail_m, 16, remain_k2); + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k32; + LOAD_C(0, 0); + LOAD_A(0, x); + LOAD_B(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } + if (n_count > 0) { + int tail_n = (n_count > 16) ? 
16: n_count;
+        __mmask16 bmask = (1UL << tail_n) - 1;
+        ptr_b0 = ptr_b + tail_n * k32;
+        ldb = 2 * tail_n;
+        TCONF(cfg, tail_m, tail_n, remain_k2);
+        LOAD_C(0, 0);
+        LOAD_A(0, x);
+        LOAD_B(x, 0);
+        MATMUL(0, 0);
+        STORE_C(0, 0);
+      }
+    }
+  }
+  if (k2 != k) {
+    m_count = m;
+    ptr_a = A;
+    ptr_c = C;
+    for (; m_count > 0; m_count -= 16) {
+      int tail_m = (m_count > 16) ? 16: m_count;
+      __mmask16 amask = (1UL << tail_m) - 1;
+
+      ptr_a0 = ptr_a + tail_m * k2;
+      ptr_a += tail_m * k;
+      ptr_b = B;
+      ptr_c00 = ptr_c;
+      ptr_c += tail_m * ldc;
+      n_count = n;
+      TCONF(cfg, tail_m, 16, 2);
+      for (; n_count > 15; n_count -= 16) {
+        ptr_b0 = ptr_b + 16 * k2;
+        LOAD_C(0, 0);
+        MASK_LOAD_A_TAIL(0, x);
+        LOAD_B_TAIL(x, 0);
+        MATMUL(0, 0);
+        STORE_C(0, 0);
+        ptr_b += 16 * k;
+        ptr_c00 += 16;
+      }
+      if (n_count > 0) {
+        int tail_n = (n_count > 16) ? 16: n_count;
+        __mmask16 bmask = (1UL << tail_n) - 1;
+        ptr_b0 = ptr_b + tail_n * k2;
+        TCONF(cfg, tail_m, tail_n, 2);
+        LOAD_C(0, 0);
+        MASK_LOAD_A_TAIL(0, x);
+        MASK_LOAD_B_TAIL(x, 0);
+        MATMUL(0, 0);
+        STORE_C(0, 0);
+      }
+    }
+  }
   return 0;
 }

From 10d52646e2f04e741ccdcf15f4e68f9501ef6c40 Mon Sep 17 00:00:00 2001
From: Wangyang Guo
Date: Wed, 15 Sep 2021 19:36:02 -0700
Subject: [PATCH 502/681] sbgemm: spr: oncopy: avoid handling too many
 pointers at a time

---
 kernel/x86_64/sbgemm_oncopy_16_spr.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/kernel/x86_64/sbgemm_oncopy_16_spr.c b/kernel/x86_64/sbgemm_oncopy_16_spr.c
index f5668e26e..593f2433d 100644
--- a/kernel/x86_64/sbgemm_oncopy_16_spr.c
+++ b/kernel/x86_64/sbgemm_oncopy_16_spr.c
@@ -49,27 +49,39 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
   BLASLONG m2 = m & ~1;
 
   for (j = 0; j < n16; j += 16) {
+    IFLOAT *boffset0 = boffset;
     aoffset0 = aoffset;
     aoffset1 = aoffset0 + lda;
     aoffset2 = aoffset1 + lda;
     aoffset3 = aoffset2 + lda;
+    for (i = 0; i < m32; i += 32) {
+      COPY_32(0); COPY_32(1); COPY_32(2); COPY_32(3);
+      boffset += 32 * 16;
+    }
     aoffset4 = aoffset3 + lda;
     aoffset5 = aoffset4 + lda;
     aoffset6 = aoffset5 + lda;
     aoffset7 = aoffset6 + lda;
+    boffset = boffset0;
+    for (i = 0; i < m32; i += 32) {
+      COPY_32(4); COPY_32(5); COPY_32(6); COPY_32(7);
+      boffset += 32 * 16;
+    }
     aoffset8 = aoffset7 + lda;
     aoffset9 = aoffset8 + lda;
     aoffset10 = aoffset9 + lda;
     aoffset11 = aoffset10 + lda;
+    boffset = boffset0;
+    for (i = 0; i < m32; i += 32) {
+      COPY_32(8); COPY_32(9); COPY_32(10); COPY_32(11);
+      boffset += 32 * 16;
+    }
     aoffset12 = aoffset11 + lda;
     aoffset13 = aoffset12 + lda;
     aoffset14 = aoffset13 + lda;
     aoffset15 = aoffset14 + lda;
-    aoffset += 16 * lda;
+    boffset = boffset0;
     for (i = 0; i < m32; i += 32) {
-      COPY_32(0); COPY_32(1); COPY_32(2); COPY_32(3);
-      COPY_32(4); COPY_32(5); COPY_32(6); COPY_32(7);
-      COPY_32(8); COPY_32(9); COPY_32(10); COPY_32(11);
       COPY_32(12); COPY_32(13); COPY_32(14); COPY_32(15);
       boffset += 32 * 16;
     }
@@ -91,6 +103,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
       COPY_ODD_TAIL(12); COPY_ODD_TAIL(13); COPY_ODD_TAIL(14); COPY_ODD_TAIL(15);
       boffset += 16;
     }
+    aoffset += 16 * lda;
   }
   if (j < n) {
     int remain_n = n - j;

From 7b2f5cb3b7378b3111010678bd1433ebdb13d9a6 Mon Sep 17 00:00:00 2001
From: Wangyang Guo
Date: Wed, 15 Sep 2021 20:29:49 -0700
Subject: [PATCH 503/681]
sbgemm: spr: enlarge P to 256 for performance --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index c2c6916bc..23f406d74 100644 --- a/param.h +++ b/param.h @@ -1781,7 +1781,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // But for AMX, they are not the same, set UNROLL_M = 32 to workaround #define SBGEMM_DEFAULT_UNROLL_N 16 #define SBGEMM_DEFAULT_UNROLL_M 32 -#define SBGEMM_DEFAULT_P 192 +#define SBGEMM_DEFAULT_P 256 #define SBGEMM_DEFAULT_Q 1024 #define SBGEMM_DEFAULT_R sbgemm_r From 9ab33228bbf9301c34b3dedfa4047cbfee9bb847 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Wed, 15 Sep 2021 23:59:38 -0700 Subject: [PATCH 504/681] sbgemm: spr: process k2 and odd k at the same time --- kernel/x86_64/sbgemm_kernel_16x16_spr.c | 206 +++++++++++++++--------- 1 file changed, 133 insertions(+), 73 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr.c b/kernel/x86_64/sbgemm_kernel_16x16_spr.c index 112adbe1c..dbfacd6ab 100644 --- a/kernel/x86_64/sbgemm_kernel_16x16_spr.c +++ b/kernel/x86_64/sbgemm_kernel_16x16_spr.c @@ -64,6 +64,28 @@ typedef struct { cfg.tile_colsb[7] = n * 4; \ _tile_loadconfig(&cfg); +/* CONFIG for handling k2 and odd tail at the same time + * tile0 -- A (m x 2k) + * tile1 -- A (m x 1) + * tile2 -- B (2k x n) + * tile3 -- B (1 x n) + * tile4 -- C (m x n) + */ +#define TCONF_TAIL(cfg, m, n, k2) \ + memset(&cfg, 0, sizeof(tilecfg)); \ + cfg.palette_id = 1; \ + cfg.tile_rows[0] = m; \ + cfg.tile_rows[1] = m; \ + cfg.tile_rows[2] = k2>>1; \ + cfg.tile_rows[3] = 1; \ + cfg.tile_rows[4] = m; \ + cfg.tile_colsb[0] = k2<<1; \ + cfg.tile_colsb[1] = 4; \ + cfg.tile_colsb[2] = n * 4; \ + cfg.tile_colsb[3] = n * 4; \ + cfg.tile_colsb[4] = n * 4; \ + _tile_loadconfig(&cfg); + #define T_A0 0 #define T_A1 1 #define T_B0 2 @@ -104,6 +126,7 @@ typedef struct { #define LOAD_C(M, N) _tile_loadd(T_C##M##N, ptr_c##M##N, ldc * 4) #define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N) +#define MATMUL_TAIL(M, N) _tile_dpbf16ps(T_C00, T_A##M, T_B##N) #define STORE_C(M, N) _tile_stored(T_C##M##N, ptr_c##M##N, ldc * 4) @@ -275,86 +298,123 @@ tail_k: // process for k < 32 BLASLONG k32 = k & ~31; BLASLONG k2 = k & ~1; - int remain_k2 = k2 - k32; - if (remain_k2 > 0) { + if (k32 != k) { + int remain_k2 = k2 - k32; m_count = m; ptr_a = A; ptr_c = C; - for (; m_count > 0; m_count -= 16) { - int tail_m = (m_count > 16) ? 16: m_count; - __mmask16 amask = (1UL << tail_m) - 1; - - ptr_a0 = ptr_a + tail_m * k32; - ptr_a += tail_m * k; - ptr_b = B; - ptr_c00 = ptr_c; - ptr_c += tail_m * ldc; - n_count = n; - lda = remain_k2; - ldb = 32; - TCONF(cfg, tail_m, 16, remain_k2); - for (; n_count > 15; n_count -= 16) { - ptr_b0 = ptr_b + 16 * k32; - LOAD_C(0, 0); - LOAD_A(0, x); - LOAD_B(x, 0); - MATMUL(0, 0); - STORE_C(0, 0); - ptr_b += 16 * k; - ptr_c00 += 16; - } - if (n_count > 0) { - int tail_n = (n_count > 16) ? 16: n_count; - __mmask16 bmask = (1UL << tail_n) - 1; - ptr_b0 = ptr_b + tail_n * k32; - ldb = 2 * tail_n; - TCONF(cfg, tail_m, tail_n, remain_k2); - LOAD_C(0, 0); - LOAD_A(0, x); - LOAD_B(x, 0); - MATMUL(0, 0); - STORE_C(0, 0); + if (remain_k2 > 0 && k2 != k) { // k%32 = 2x + 1 (x != 0) + for (; m_count > 0; m_count -= 16) { + int tail_m = (m_count > 16) ? 
16: m_count; + __mmask16 amask = (1UL << tail_m) - 1; + + ptr_a0 = ptr_a + tail_m * k32; + ptr_a1 = ptr_a + tail_m * k2; + ptr_a += tail_m * k; + ptr_b = B; + ptr_c00 = ptr_c; + ptr_c += tail_m * ldc; + n_count = n; + lda = remain_k2; + ldb = 32; + TCONF_TAIL(cfg, tail_m, 16, remain_k2); + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k32; + ptr_b1 = ptr_b + 16 * k2; + LOAD_C(0, 0); + LOAD_A(0, x); MASK_LOAD_A_TAIL(1, x); + LOAD_B(x, 0); LOAD_B_TAIL(x, 1); + MATMUL(0, 0); MATMUL_TAIL(1, 1); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } + if (n_count > 0) { + int tail_n = (n_count > 16) ? 16: n_count; + __mmask16 bmask = (1UL << tail_n) - 1; + ptr_b0 = ptr_b + tail_n * k32; + ptr_b1 = ptr_b + tail_n * k2; + ldb = 2 * tail_n; + TCONF_TAIL(cfg, tail_m, tail_n, remain_k2); + LOAD_C(0, 0); + LOAD_A(0, x); MASK_LOAD_A_TAIL(1, x); + LOAD_B(x, 0); MASK_LOAD_B_TAIL(x, 1); + MATMUL(0, 0); MATMUL_TAIL(1, 1); + STORE_C(0, 0); + } } - } - } - if (k2 != k) { - m_count = m; - ptr_a = A; - ptr_c = C; - for (; m_count > 0; m_count -= 16) { - int tail_m = (m_count > 16) ? 16: m_count; - __mmask16 amask = (1UL << tail_m) - 1; - - ptr_a0 = ptr_a + tail_m * k2; - ptr_a += tail_m * k; - ptr_b = B; - ptr_c00 = ptr_c; - ptr_c += tail_m * ldc; - n_count = n; - TCONF(cfg, tail_m, 16, 2); - for (; n_count > 15; n_count -= 16) { - ptr_b0 = ptr_b + 16 * k2; - LOAD_C(0, 0); - MASK_LOAD_A_TAIL(0, x); - LOAD_B_TAIL(x, 0); - MATMUL(0, 0); - STORE_C(0, 0); - ptr_b += 16 * k; - ptr_c00 += 16; + + } else if (remain_k2 > 0) { // k%32 = 2x + for (; m_count > 0; m_count -= 16) { + int tail_m = (m_count > 16) ? 16: m_count; + + ptr_a0 = ptr_a + tail_m * k32; + ptr_a += tail_m * k; + ptr_b = B; + ptr_c00 = ptr_c; + ptr_c += tail_m * ldc; + n_count = n; + lda = remain_k2; + ldb = 32; + TCONF(cfg, tail_m, 16, remain_k2); + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k32; + LOAD_C(0, 0); + LOAD_A(0, x); + LOAD_B(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } + if (n_count > 0) { + int tail_n = (n_count > 16) ? 16: n_count; + ptr_b0 = ptr_b + tail_n * k32; + ldb = 2 * tail_n; + TCONF(cfg, tail_m, tail_n, remain_k2); + LOAD_C(0, 0); + LOAD_A(0, x); + LOAD_B(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + } } - if (n_count > 0) { - int tail_n = (n_count > 16) ? 16: n_count; - __mmask16 bmask = (1UL << tail_n) - 1; - ptr_b0 = ptr_b + tail_n * k2; - TCONF(cfg, tail_m, tail_n, 2); - LOAD_C(0, 0); - MASK_LOAD_A_TAIL(0, x); - MASK_LOAD_B_TAIL(x, 0); - MATMUL(0, 0); - STORE_C(0, 0); + } else { // k%32 = 1 + for (; m_count > 0; m_count -= 16) { + int tail_m = (m_count > 16) ? 16: m_count; + __mmask16 amask = (1UL << tail_m) - 1; + + ptr_a0 = ptr_a + tail_m * k2; + ptr_a += tail_m * k; + ptr_b = B; + ptr_c00 = ptr_c; + ptr_c += tail_m * ldc; + n_count = n; + TCONF(cfg, tail_m, 16, 2); + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k2; + LOAD_C(0, 0); + MASK_LOAD_A_TAIL(0, x); + LOAD_B_TAIL(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } + if (n_count > 0) { + int tail_n = (n_count > 16) ? 
16: n_count; + __mmask16 bmask = (1UL << tail_n) - 1; + ptr_b0 = ptr_b + tail_n * k2; + TCONF(cfg, tail_m, tail_n, 2); + LOAD_C(0, 0); + MASK_LOAD_A_TAIL(0, x); + MASK_LOAD_B_TAIL(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + } } - } + } } return 0; } From f2485352a603f11bd3c7b45788c1c80449ba76eb Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 16 Sep 2021 01:04:01 -0700 Subject: [PATCH 505/681] sbgemm: spr: only load A once in tail_k handling --- kernel/x86_64/sbgemm_kernel_16x16_spr.c | 62 ++++++++++++++----------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr.c b/kernel/x86_64/sbgemm_kernel_16x16_spr.c index dbfacd6ab..b34035896 100644 --- a/kernel/x86_64/sbgemm_kernel_16x16_spr.c +++ b/kernel/x86_64/sbgemm_kernel_16x16_spr.c @@ -317,17 +317,19 @@ tail_k: n_count = n; lda = remain_k2; ldb = 32; - TCONF_TAIL(cfg, tail_m, 16, remain_k2); - for (; n_count > 15; n_count -= 16) { - ptr_b0 = ptr_b + 16 * k32; - ptr_b1 = ptr_b + 16 * k2; - LOAD_C(0, 0); + if (n_count > 15) { + TCONF_TAIL(cfg, tail_m, 16, remain_k2); LOAD_A(0, x); MASK_LOAD_A_TAIL(1, x); - LOAD_B(x, 0); LOAD_B_TAIL(x, 1); - MATMUL(0, 0); MATMUL_TAIL(1, 1); - STORE_C(0, 0); - ptr_b += 16 * k; - ptr_c00 += 16; + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k32; + ptr_b1 = ptr_b + 16 * k2; + LOAD_C(0, 0); + LOAD_B(x, 0); LOAD_B_TAIL(x, 1); + MATMUL(0, 0); MATMUL_TAIL(1, 1); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } } if (n_count > 0) { int tail_n = (n_count > 16) ? 16: n_count; @@ -356,16 +358,18 @@ tail_k: n_count = n; lda = remain_k2; ldb = 32; - TCONF(cfg, tail_m, 16, remain_k2); - for (; n_count > 15; n_count -= 16) { - ptr_b0 = ptr_b + 16 * k32; - LOAD_C(0, 0); + if (n_count > 15) { + TCONF(cfg, tail_m, 16, remain_k2); LOAD_A(0, x); - LOAD_B(x, 0); - MATMUL(0, 0); - STORE_C(0, 0); - ptr_b += 16 * k; - ptr_c00 += 16; + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k32; + LOAD_C(0, 0); + LOAD_B(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } } if (n_count > 0) { int tail_n = (n_count > 16) ? 16: n_count; @@ -390,16 +394,18 @@ tail_k: ptr_c00 = ptr_c; ptr_c += tail_m * ldc; n_count = n; - TCONF(cfg, tail_m, 16, 2); - for (; n_count > 15; n_count -= 16) { - ptr_b0 = ptr_b + 16 * k2; - LOAD_C(0, 0); + if (n_count > 15) { + TCONF(cfg, tail_m, 16, 2); MASK_LOAD_A_TAIL(0, x); - LOAD_B_TAIL(x, 0); - MATMUL(0, 0); - STORE_C(0, 0); - ptr_b += 16 * k; - ptr_c00 += 16; + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k2; + LOAD_C(0, 0); + LOAD_B_TAIL(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } } if (n_count > 0) { int tail_n = (n_count > 16) ? 
16: n_count; From a52456b168897ab252d1be968ff2040d8d909296 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 16 Sep 2021 20:08:42 -0700 Subject: [PATCH 506/681] sbgemm: spr: oncopy: use tile load/store instead --- kernel/x86_64/sbgemm_oncopy_16_spr.c | 180 +++++++++------------------ 1 file changed, 59 insertions(+), 121 deletions(-) diff --git a/kernel/x86_64/sbgemm_oncopy_16_spr.c b/kernel/x86_64/sbgemm_oncopy_16_spr.c index 593f2433d..ccb00ada1 100644 --- a/kernel/x86_64/sbgemm_oncopy_16_spr.c +++ b/kernel/x86_64/sbgemm_oncopy_16_spr.c @@ -28,18 +28,45 @@ #include #include "common.h" -#define COPY_32(N) _mm512_storeu_si512(boffset + 32 * N, _mm512_loadu_si512(aoffset##N + i)) -#define MASK_COPY_32(N) _mm512_mask_storeu_epi16(boffset + tail_m * N, mmask, _mm512_maskz_loadu_epi16(mmask, aoffset##N + i)) -#define COPY_ODD_TAIL(N) *(boffset + N) = *(aoffset##N + i); +typedef struct { + char palette_id; + char start_row; + char dummy0[14]; // bytes 2-15 reserved, must be zero + short tile_colsb[8]; + char dummy1[16]; // bytes 32-47 reserved, must be zero + char tile_rows[8]; + char dummy2[16]; // bytes 56-63 reserved, must be zero +} tilecfg; + +#define T_16x32 0 +#define T_16xm 1 +#define T_nx32 2 +#define T_nxm 3 + +#define TCONF(cfg, m, n) \ + memset(&cfg, 0, sizeof(tilecfg)); \ + cfg.palette_id = 1; \ + cfg.tile_rows[T_16x32] = 16; \ + cfg.tile_colsb[T_16x32] = 64; \ + if (m) { \ + cfg.tile_rows[T_16xm] = 16; \ + cfg.tile_colsb[T_16xm] = m * 2; \ + } \ + if (n) { \ + cfg.tile_rows[T_nx32] = n; \ + cfg.tile_colsb[T_nx32] = 64; \ + } \ + if (m && n) { \ + cfg.tile_rows[T_nxm] = n; \ + cfg.tile_colsb[T_nxm] = m * 2; \ + } \ + _tile_loadconfig(&cfg); int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { BLASLONG i, j; IFLOAT *aoffset, *boffset; - IFLOAT *aoffset0, *aoffset1, *aoffset2, *aoffset3; - IFLOAT *aoffset4, *aoffset5, *aoffset6, *aoffset7; - IFLOAT *aoffset8, *aoffset9, *aoffset10, *aoffset11; - IFLOAT *aoffset12, *aoffset13, *aoffset14, *aoffset15; + IFLOAT *aoffset0; aoffset = a; boffset = b; @@ -48,141 +75,52 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { BLASLONG m32 = m & ~31; BLASLONG m2 = m & ~1; + BLASLONG tail_m = m2 - m32; + BLASLONG tail_n = n - n16; + tilecfg cfg; + TCONF(cfg, tail_m, tail_n); + for (j = 0; j < n16; j += 16) { - IFLOAT *boffset0 = boffset; aoffset0 = aoffset; - aoffset1 = aoffset0 + lda; - aoffset2 = aoffset1 + lda; - aoffset3 = aoffset2 + lda; - for (i = 0; i < m32; i += 32) { - COPY_32(0); COPY_32(1); COPY_32(2); COPY_32(3); - boffset += 32 * 16; - } - aoffset4 = aoffset3 + lda; - aoffset5 = aoffset4 + lda; - aoffset6 = aoffset5 + lda; - aoffset7 = aoffset6 + lda; - boffset = boffset0; - for (i = 0; i < m32; i += 32) { - COPY_32(4); COPY_32(5); COPY_32(6); COPY_32(7); - boffset += 32 * 16; - } - aoffset8 = aoffset7 + lda; - aoffset9 = aoffset8 + lda; - aoffset10 = aoffset9 + lda; - aoffset11 = aoffset10 + lda; - boffset = boffset0; for (i = 0; i < m32; i += 32) { - COPY_32(8); COPY_32(9); COPY_32(10); COPY_32(11); - boffset += 32 * 16; - } - aoffset12 = aoffset11 + lda; - aoffset13 = aoffset12 + lda; - aoffset14 = aoffset13 + lda; - aoffset15 = aoffset14 + lda; - boffset = boffset0; - for (i = 0; i < m32; i += 32) { - COPY_32(12); COPY_32(13); COPY_32(14); COPY_32(15); + _tile_loadd(T_16x32, aoffset0, lda * 2); + _tile_stored(T_16x32, boffset, 32 * 2); + aoffset0 += 32; boffset += 32 * 16; } if (i < m2) { - int tail_m = m2 - i; - __mmask32 mmask = (1UL << tail_m) - 1; - MASK_COPY_32(0); 
MASK_COPY_32(1); MASK_COPY_32(2); MASK_COPY_32(3);
-      MASK_COPY_32(4); MASK_COPY_32(5); MASK_COPY_32(6); MASK_COPY_32(7);
-      MASK_COPY_32(8); MASK_COPY_32(9); MASK_COPY_32(10); MASK_COPY_32(11);
-      MASK_COPY_32(12); MASK_COPY_32(13); MASK_COPY_32(14); MASK_COPY_32(15);
-      i = m2;
+      _tile_loadd(T_16xm, aoffset0, lda * 2);
+      _tile_stored(T_16xm, boffset, tail_m * 2);
+      aoffset0 += tail_m;
       boffset += tail_m * 16;
+      i = m2;
     }
     if (i < m) {
       /* the tail odd k should put alone */
-      COPY_ODD_TAIL(0); COPY_ODD_TAIL(1); COPY_ODD_TAIL(2); COPY_ODD_TAIL(3);
-      COPY_ODD_TAIL(4); COPY_ODD_TAIL(5); COPY_ODD_TAIL(6); COPY_ODD_TAIL(7);
-      COPY_ODD_TAIL(8); COPY_ODD_TAIL(9); COPY_ODD_TAIL(10); COPY_ODD_TAIL(11);
-      COPY_ODD_TAIL(12); COPY_ODD_TAIL(13); COPY_ODD_TAIL(14); COPY_ODD_TAIL(15);
+      for (int ii = 0; ii < 16; ii++) {
+        *(boffset + ii) = *(aoffset0 + lda * ii);
+      }
       boffset += 16;
     }
     aoffset += 16 * lda;
   }
   if (j < n) {
-    int remain_n = n - j;
     aoffset0 = aoffset;
-    aoffset1 = aoffset0 + lda;
-    aoffset2 = aoffset1 + lda;
-    aoffset3 = aoffset2 + lda;
-    aoffset4 = aoffset3 + lda;
-    aoffset5 = aoffset4 + lda;
-    aoffset6 = aoffset5 + lda;
-    aoffset7 = aoffset6 + lda;
-    aoffset8 = aoffset7 + lda;
-    aoffset9 = aoffset8 + lda;
-    aoffset10 = aoffset9 + lda;
-    aoffset11 = aoffset10 + lda;
-    aoffset12 = aoffset11 + lda;
-    aoffset13 = aoffset12 + lda;
-    aoffset14 = aoffset13 + lda;
-    aoffset15 = aoffset14 + lda;
     for (i = 0; i < m32; i += 32) {
-      switch(remain_n) {
-      case 15: COPY_32(14);
-      case 14: COPY_32(13);
-      case 13: COPY_32(12);
-      case 12: COPY_32(11);
-      case 11: COPY_32(10);
-      case 10: COPY_32(9);
-      case 9: COPY_32(8);
-      case 8: COPY_32(7);
-      case 7: COPY_32(6);
-      case 6: COPY_32(5);
-      case 5: COPY_32(4);
-      case 4: COPY_32(3);
-      case 3: COPY_32(2);
-      case 2: COPY_32(1);
-      case 1: COPY_32(0);
-      }
-      boffset += 32 * remain_n;
+      _tile_loadd(T_nx32, aoffset0, lda * 2);
+      _tile_stored(T_nx32, boffset, 32 * 2);
+      aoffset0 += 32;
+      boffset += 32 * tail_n;
     }
     if (i < m2) {
-      int tail_m = m2 - i;
-      __mmask32 mmask = (1UL << tail_m) - 1;
-      switch(remain_n) {
-      case 15: MASK_COPY_32(14);
-      case 14: MASK_COPY_32(13);
-      case 13: MASK_COPY_32(12);
-      case 12: MASK_COPY_32(11);
-      case 11: MASK_COPY_32(10);
-      case 10: MASK_COPY_32(9);
-      case 9: MASK_COPY_32(8);
-      case 8: MASK_COPY_32(7);
-      case 7: MASK_COPY_32(6);
-      case 6: MASK_COPY_32(5);
-      case 5: MASK_COPY_32(4);
-      case 4: MASK_COPY_32(3);
-      case 3: MASK_COPY_32(2);
-      case 2: MASK_COPY_32(1);
-      case 1: MASK_COPY_32(0);
-      }
-      i = m2;
-      boffset += tail_m * remain_n;
+      _tile_loadd(T_nxm, aoffset0, lda * 2);
+      _tile_stored(T_nxm, boffset, tail_m * 2);
+      aoffset0 += tail_m;
+      boffset += tail_m * tail_n;
     }
     if (i < m) {
-      switch(remain_n) {
-      case 15: COPY_ODD_TAIL(14);
-      case 14: COPY_ODD_TAIL(13);
-      case 13: COPY_ODD_TAIL(12);
-      case 12: COPY_ODD_TAIL(11);
-      case 11: COPY_ODD_TAIL(10);
-      case 10: COPY_ODD_TAIL(9);
-      case 9: COPY_ODD_TAIL(8);
-      case 8: COPY_ODD_TAIL(7);
-      case 7: COPY_ODD_TAIL(6);
-      case 6: COPY_ODD_TAIL(5);
-      case 5: COPY_ODD_TAIL(4);
-      case 4: COPY_ODD_TAIL(3);
-      case 3: COPY_ODD_TAIL(2);
-      case 2: COPY_ODD_TAIL(1);
-      case 1: COPY_ODD_TAIL(0);
+      for (int ii = 0; ii < tail_n; ii++) {
+        *(boffset + ii) = *(aoffset0 + lda * ii);
       }
     }
   }

From f018aa342a5d97603a5f59fa1feb36e1c77e0571 Mon Sep 17 00:00:00 2001
From: Wangyang Guo
Date: Fri, 17 Sep 2021 00:48:52 -0700
Subject: [PATCH 507/681] sbgemm: spr: kernel handles alpha != 1.0

---
 kernel/x86_64/sbgemm_kernel_16x16_spr.c      | 392 +-------------
 kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c | 521 +++++++++++++++++++
 2 files
changed, 529 insertions(+), 384 deletions(-)
 create mode 100644 kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c

diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr.c b/kernel/x86_64/sbgemm_kernel_16x16_spr.c
index b34035896..955db3163 100644
--- a/kernel/x86_64/sbgemm_kernel_16x16_spr.c
+++ b/kernel/x86_64/sbgemm_kernel_16x16_spr.c
@@ -25,109 +25,12 @@
  * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  * *****************************************************************************/
 
-#include <immintrin.h>
-#include <string.h>
 #include "common.h"
 
-typedef struct {
-  char palette_id;
-  char start_row;
-  char dummy0[14];  // bytes 2-15 reserved, must be zero
-  short tile_colsb[8];
-  char dummy1[16];  // bytes 32-47 reserved, must be zero
-  char tile_rows[8];
-  char dummy2[16];  // bytes 56-63 reserved, must be zero
-} tilecfg;
-
-/* tile0/tile1 -- A (m x 2k)
- * tile2/tile3 -- B (2k x n)
- * tile4-7 -- C (m x n)
- */
-#define TCONF(cfg, m, n, k2) \
-  memset(&cfg, 0, sizeof(tilecfg)); \
-  cfg.palette_id = 1; \
-  cfg.tile_rows[0] = m; \
-  cfg.tile_rows[1] = m; \
-  cfg.tile_rows[2] = k2>>1; \
-  cfg.tile_rows[3] = k2>>1; \
-  cfg.tile_rows[4] = m; \
-  cfg.tile_rows[5] = m; \
-  cfg.tile_rows[6] = m; \
-  cfg.tile_rows[7] = m; \
-  cfg.tile_colsb[0] = k2<<1; \
-  cfg.tile_colsb[1] = k2<<1; \
-  cfg.tile_colsb[2] = n * 4; \
-  cfg.tile_colsb[3] = n * 4; \
-  cfg.tile_colsb[4] = n * 4; \
-  cfg.tile_colsb[5] = n * 4; \
-  cfg.tile_colsb[6] = n * 4; \
-  cfg.tile_colsb[7] = n * 4; \
-  _tile_loadconfig(&cfg);
-
-/* CONFIG for handling k2 and odd tail at the same time
- * tile0 -- A (m x 2k)
- * tile1 -- A (m x 1)
- * tile2 -- B (2k x n)
- * tile3 -- B (1 x n)
- * tile4 -- C (m x n)
- */
-#define TCONF_TAIL(cfg, m, n, k2) \
-  memset(&cfg, 0, sizeof(tilecfg)); \
-  cfg.palette_id = 1; \
-  cfg.tile_rows[0] = m; \
-  cfg.tile_rows[1] = m; \
-  cfg.tile_rows[2] = k2>>1; \
-  cfg.tile_rows[3] = 1; \
-  cfg.tile_rows[4] = m; \
-  cfg.tile_colsb[0] = k2<<1; \
-  cfg.tile_colsb[1] = 4; \
-  cfg.tile_colsb[2] = n * 4; \
-  cfg.tile_colsb[3] = n * 4; \
-  cfg.tile_colsb[4] = n * 4; \
-  _tile_loadconfig(&cfg);
-
-#define T_A0 0
-#define T_A1 1
-#define T_B0 2
-#define T_B1 3
-#define T_C00 4
-#define T_C01 5
-#define T_C10 6
-#define T_C11 7
-
-// FIXME: gcc11 seem have problem in tile load/store address calc,
-// need to multiply with element size (2 or 4) here.
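
The FIXME above concerns the stride argument of the AMX tile intrinsics: _tile_loadd() and _tile_stored() take their stride in bytes rather than in elements, which is why every lda/ldb/ldc handed to them in this kernel is scaled by sizeof(bfloat16) = 2 or sizeof(float) = 4. A minimal standalone sketch of the convention (not part of this patch; it assumes tile 0 has already been configured as 16 rows x 64 bytes via _tile_loadconfig()):

    #include <immintrin.h>  /* build with -mamx-tile -mamx-bf16 */

    /* Load a 16x32 block of bf16 values into tile 0. lda counts
     * elements, so it is converted to a byte stride at the call. */
    static void load_bf16_tile(const void *a, long lda)
    {
        _tile_loadd(0, a, lda * 2 /* 2 bytes per bf16 element */);
    }
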
-#define LOAD_A(M, N) _tile_loadd(T_A##M, ptr_a##M, lda * 2) -#define LOAD_A_TAIL(M, N) {\ - __m256i ymm = _mm256_loadu_epi16(ptr_a##M); \ - __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ - _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ - _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ -} -#define MASK_LOAD_A_TAIL(M, N) {\ - __m256i ymm = _mm256_maskz_loadu_epi16(amask, ptr_a##M); \ - __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ - _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ - _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ -} -#define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2) -#define LOAD_B_TAIL(M, N) {\ - __m256i ymm = _mm256_loadu_epi16(ptr_b##N); \ - __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ - _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ - _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ -} -#define MASK_LOAD_B_TAIL(M, N) {\ - __m256i ymm = _mm256_maskz_loadu_epi16(bmask, ptr_b##N); \ - __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ - _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ - _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ -} - -#define LOAD_C(M, N) _tile_loadd(T_C##M##N, ptr_c##M##N, ldc * 4) -#define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N) -#define MATMUL_TAIL(M, N) _tile_dpbf16ps(T_C00, T_A##M, T_B##N) -#define STORE_C(M, N) _tile_stored(T_C##M##N, ptr_c##M##N, ldc * 4) +#define ALPHA_ONE +#include "sbgemm_kernel_16x16_spr_tmpl.c" +#undef ALPHA_ONE +#include "sbgemm_kernel_16x16_spr_tmpl.c" int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOAT * iB, FLOAT * C, BLASLONG ldc) @@ -140,287 +43,8 @@ int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOA A = iB; B = iA; - IFLOAT *ptr_a = A, *ptr_b = B; - IFLOAT *ptr_b0, *ptr_b1; - IFLOAT *ptr_a0, *ptr_a1; - FLOAT *ptr_c = C; - FLOAT *ptr_c00, *ptr_c01, *ptr_c10, *ptr_c11; - - BLASLONG lda, ldb; - BLASLONG m_count = m; - BLASLONG n_count, k_count; - - - IFLOAT tail_a[32 * 2] __attribute__ ((aligned (64))); - IFLOAT tail_b[32 * 2] __attribute__ ((aligned (64))); - tilecfg cfg; - - if (k < 32) - goto tail_k; - - for (; m_count > 31; m_count -= 32) { - ptr_b = B; - - ptr_c00 = ptr_c; - ptr_c01 = ptr_c00 + 16; - ptr_c10 = ptr_c + 16 * ldc; - ptr_c11 = ptr_c10 + 16; - ptr_c += 32 * ldc; - n_count = n; - TCONF(cfg, 16, 16, 32); - for (; n_count > 31; n_count -= 32) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a + 16 * k; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b + 16 * k; - ptr_b += 32 * k; - - lda = 32; - ldb = 32; - LOAD_C(0, 0); LOAD_C(0, 1); - LOAD_C(1, 0); LOAD_C(1, 1); - k_count = k; - for (; k_count > 31; k_count -= 32) { - LOAD_A(0, x); LOAD_A(1, x); - ptr_a0 += 16 * 32; - ptr_a1 += 16 * 32; - LOAD_B(x, 0); LOAD_B(x, 1); - ptr_b0 += 16 * 32; - ptr_b1 += 16 * 32; - - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - } - STORE_C(0, 0); STORE_C(0, 1); - STORE_C(1, 0); STORE_C(1, 1); - ptr_c00 += 32; - ptr_c01 += 32; - ptr_c10 += 32; - ptr_c11 += 32; - } - for (; n_count > 0; n_count -= 16) { - int tail_n = (n_count > 16) ? 
16: n_count; - __mmask16 bmask = (1UL << tail_n) - 1; - ptr_a0 = ptr_a; - ptr_a1 = ptr_a + 16 * k; - - ptr_b0 = ptr_b; - ptr_b += tail_n * k; - - lda = 32; - ldb = 2 * tail_n; - TCONF(cfg, 16, tail_n, 32); - LOAD_C(0, 0); - LOAD_C(1, 0); - k_count = k; - for (; k_count > 31; k_count -= 32) { - LOAD_A(0, x); LOAD_A(1, x); - ptr_a0 += 16 * 32; - ptr_a1 += 16 * 32; - LOAD_B(x, 0); - ptr_b0 += tail_n * 32; - - MATMUL(0, 0); - MATMUL(1, 0); - } - STORE_C(0, 0); - STORE_C(1, 0); - ptr_c00 += tail_n; - ptr_c10 += tail_n; - } - ptr_a += 32 * k; - } - for (; m_count > 0; m_count -= 16) { - // process at most 16 m at a time - int tail_m = (m_count > 16) ? 16: m_count; - __mmask16 amask = (1UL << tail_m) - 1; - - ptr_b = B; - - ptr_c00 = ptr_c; - ptr_c01 = ptr_c00 + 16; - ptr_c += tail_m * ldc; - n_count = n; - TCONF(cfg, tail_m, 16, 32); - for (; n_count > 31; n_count -= 32) { - ptr_a0 = ptr_a; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b + 16 * k; - ptr_b += 32 * k; - - lda = 32; - ldb = 32; - LOAD_C(0, 0); LOAD_C(0, 1); - k_count = k; - for (; k_count > 31; k_count -= 32) { - LOAD_A(0, x); - ptr_a0 += tail_m * 32; - LOAD_B(x, 0); LOAD_B(x, 1); - ptr_b0 += 16 * 32; - ptr_b1 += 16 * 32; - - MATMUL(0, 0); MATMUL(0, 1); - } - STORE_C(0, 0); STORE_C(0, 1); - ptr_c00 += 32; - ptr_c01 += 32; - } - for (; n_count > 0; n_count -= 16) { - int tail_n = (n_count > 16) ? 16: n_count; - __mmask16 bmask = (1UL << tail_n) - 1; - ptr_a0 = ptr_a; - - ptr_b0 = ptr_b; - ptr_b += tail_n * k; - - lda = 32; - ldb = 2 * tail_n; - TCONF(cfg, tail_m, tail_n, 32); - LOAD_C(0, 0); - k_count = k; - for (; k_count > 31; k_count -= 32) { - LOAD_A(0, x); - ptr_a0 += tail_m * 32; - LOAD_B(x, 0); - ptr_b0 += tail_n * 32; - - MATMUL(0, 0); - } - STORE_C(0, 0); - ptr_c00 += tail_n; - } - ptr_a += tail_m * k; - } - -tail_k: - // process for k < 32 - BLASLONG k32 = k & ~31; - BLASLONG k2 = k & ~1; - if (k32 != k) { - int remain_k2 = k2 - k32; - m_count = m; - ptr_a = A; - ptr_c = C; - if (remain_k2 > 0 && k2 != k) { // k%32 = 2x + 1 (x != 0) - for (; m_count > 0; m_count -= 16) { - int tail_m = (m_count > 16) ? 16: m_count; - __mmask16 amask = (1UL << tail_m) - 1; - - ptr_a0 = ptr_a + tail_m * k32; - ptr_a1 = ptr_a + tail_m * k2; - ptr_a += tail_m * k; - ptr_b = B; - ptr_c00 = ptr_c; - ptr_c += tail_m * ldc; - n_count = n; - lda = remain_k2; - ldb = 32; - if (n_count > 15) { - TCONF_TAIL(cfg, tail_m, 16, remain_k2); - LOAD_A(0, x); MASK_LOAD_A_TAIL(1, x); - for (; n_count > 15; n_count -= 16) { - ptr_b0 = ptr_b + 16 * k32; - ptr_b1 = ptr_b + 16 * k2; - LOAD_C(0, 0); - LOAD_B(x, 0); LOAD_B_TAIL(x, 1); - MATMUL(0, 0); MATMUL_TAIL(1, 1); - STORE_C(0, 0); - ptr_b += 16 * k; - ptr_c00 += 16; - } - } - if (n_count > 0) { - int tail_n = (n_count > 16) ? 16: n_count; - __mmask16 bmask = (1UL << tail_n) - 1; - ptr_b0 = ptr_b + tail_n * k32; - ptr_b1 = ptr_b + tail_n * k2; - ldb = 2 * tail_n; - TCONF_TAIL(cfg, tail_m, tail_n, remain_k2); - LOAD_C(0, 0); - LOAD_A(0, x); MASK_LOAD_A_TAIL(1, x); - LOAD_B(x, 0); MASK_LOAD_B_TAIL(x, 1); - MATMUL(0, 0); MATMUL_TAIL(1, 1); - STORE_C(0, 0); - } - } - - } else if (remain_k2 > 0) { // k%32 = 2x - for (; m_count > 0; m_count -= 16) { - int tail_m = (m_count > 16) ? 
16: m_count; - - ptr_a0 = ptr_a + tail_m * k32; - ptr_a += tail_m * k; - ptr_b = B; - ptr_c00 = ptr_c; - ptr_c += tail_m * ldc; - n_count = n; - lda = remain_k2; - ldb = 32; - if (n_count > 15) { - TCONF(cfg, tail_m, 16, remain_k2); - LOAD_A(0, x); - for (; n_count > 15; n_count -= 16) { - ptr_b0 = ptr_b + 16 * k32; - LOAD_C(0, 0); - LOAD_B(x, 0); - MATMUL(0, 0); - STORE_C(0, 0); - ptr_b += 16 * k; - ptr_c00 += 16; - } - } - if (n_count > 0) { - int tail_n = (n_count > 16) ? 16: n_count; - ptr_b0 = ptr_b + tail_n * k32; - ldb = 2 * tail_n; - TCONF(cfg, tail_m, tail_n, remain_k2); - LOAD_C(0, 0); - LOAD_A(0, x); - LOAD_B(x, 0); - MATMUL(0, 0); - STORE_C(0, 0); - } - } - } else { // k%32 = 1 - for (; m_count > 0; m_count -= 16) { - int tail_m = (m_count > 16) ? 16: m_count; - __mmask16 amask = (1UL << tail_m) - 1; - - ptr_a0 = ptr_a + tail_m * k2; - ptr_a += tail_m * k; - ptr_b = B; - ptr_c00 = ptr_c; - ptr_c += tail_m * ldc; - n_count = n; - if (n_count > 15) { - TCONF(cfg, tail_m, 16, 2); - MASK_LOAD_A_TAIL(0, x); - for (; n_count > 15; n_count -= 16) { - ptr_b0 = ptr_b + 16 * k2; - LOAD_C(0, 0); - LOAD_B_TAIL(x, 0); - MATMUL(0, 0); - STORE_C(0, 0); - ptr_b += 16 * k; - ptr_c00 += 16; - } - } - if (n_count > 0) { - int tail_n = (n_count > 16) ? 16: n_count; - __mmask16 bmask = (1UL << tail_n) - 1; - ptr_b0 = ptr_b + tail_n * k2; - TCONF(cfg, tail_m, tail_n, 2); - LOAD_C(0, 0); - MASK_LOAD_A_TAIL(0, x); - MASK_LOAD_B_TAIL(x, 0); - MATMUL(0, 0); - STORE_C(0, 0); - } - } - - } - } - return 0; + if (alpha == 1.0f) + return sbgemm_kernel_spr_alpha_one(m, n, k, alpha, A, B, C, ldc); + else + return sbgemm_kernel_spr_alpha(m, n, k, alpha, A, B, C, ldc); } diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c new file mode 100644 index 000000000..465b9eb75 --- /dev/null +++ b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c @@ -0,0 +1,521 @@ +/*************************************************************************** + * Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * *****************************************************************************/
+
+#include <immintrin.h>
+#include <string.h>
+#include "common.h"
+
+#ifndef SBGEMM_KERNEL_SPR
+#define SBGEMM_KERNEL_SPR
+typedef struct {
+  char palette_id;
+  char start_row;
+  char dummy0[14];  // bytes 2-15 reserved, must be zero
+  short tile_colsb[8];
+  char dummy1[16];  // bytes 32-47 reserved, must be zero
+  char tile_rows[8];
+  char dummy2[16];  // bytes 56-63 reserved, must be zero
+} tilecfg;
+
+/* tile0/tile1 -- A (m x 2k)
+ * tile2/tile3 -- B (2k x n)
+ * tile4-7 -- C (m x n)
+ */
+#define TCONF(cfg, m, n, k2) \
+  memset(&cfg, 0, sizeof(tilecfg)); \
+  cfg.palette_id = 1; \
+  cfg.tile_rows[0] = m; \
+  cfg.tile_rows[1] = m; \
+  cfg.tile_rows[2] = k2>>1; \
+  cfg.tile_rows[3] = k2>>1; \
+  cfg.tile_rows[4] = m; \
+  cfg.tile_rows[5] = m; \
+  cfg.tile_rows[6] = m; \
+  cfg.tile_rows[7] = m; \
+  cfg.tile_colsb[0] = k2<<1; \
+  cfg.tile_colsb[1] = k2<<1; \
+  cfg.tile_colsb[2] = n * 4; \
+  cfg.tile_colsb[3] = n * 4; \
+  cfg.tile_colsb[4] = n * 4; \
+  cfg.tile_colsb[5] = n * 4; \
+  cfg.tile_colsb[6] = n * 4; \
+  cfg.tile_colsb[7] = n * 4; \
+  _tile_loadconfig(&cfg);
+
+/* CONFIG for handling k2 and odd tail at the same time
+ * tile0 -- A (m x 2k)
+ * tile1 -- A (m x 1)
+ * tile2 -- B (2k x n)
+ * tile3 -- B (1 x n)
+ * tile4 -- C (m x n)
+ */
+#define TCONF_TAIL(cfg, m, n, k2) \
+  memset(&cfg, 0, sizeof(tilecfg)); \
+  cfg.palette_id = 1; \
+  cfg.tile_rows[0] = m; \
+  cfg.tile_rows[1] = m; \
+  cfg.tile_rows[2] = k2>>1; \
+  cfg.tile_rows[3] = 1; \
+  cfg.tile_rows[4] = m; \
+  cfg.tile_colsb[0] = k2<<1; \
+  cfg.tile_colsb[1] = 4; \
+  cfg.tile_colsb[2] = n * 4; \
+  cfg.tile_colsb[3] = n * 4; \
+  cfg.tile_colsb[4] = n * 4; \
+  _tile_loadconfig(&cfg);
+
+#define T_A0 0
+#define T_A1 1
+#define T_B0 2
+#define T_B1 3
+#define T_C00 4
+#define T_C01 5
+#define T_C10 6
+#define T_C11 7
+
+// FIXME: gcc11 seems to have a problem in tile load/store address calc;
+// we need to multiply by the element size (2 or 4) here.
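
The *_TAIL macros defined next handle an odd leftover k. _tile_dpbf16ps() consumes bf16 elements in pairs along k, so a lone trailing element is first widened with _mm512_cvtepu16_epi32: zero extension leaves the bf16 value in the low 16 bits of each 32-bit lane and zeros in the high 16 bits, so each lane becomes the pair (value, 0), whose second half contributes nothing to the dot product. A rough standalone sketch of just that widening step (buffer names here are illustrative, not taken from the patch):

    #include <immintrin.h>  /* AVX-512F intrinsics */

    /* Widen 16 bf16 values (handled as raw uint16) into 16 (value, 0)
     * pairs laid out in one 64-byte vector. */
    static void widen_odd_k(const void *src, void *dst)
    {
        __m256i ymm = _mm256_loadu_si256((const __m256i *)src);
        __m512i zmm = _mm512_cvtepu16_epi32(ymm);
        _mm512_storeu_si512(dst, zmm);
    }
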
+#define LOAD_A(M, N) _tile_loadd(T_A##M, ptr_a##M, lda * 2) +#define LOAD_A_TAIL(M, N) {\ + __m256i ymm = _mm256_loadu_epi16(ptr_a##M); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ + _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ +} +#define MASK_LOAD_A_TAIL(M, N) {\ + __m256i ymm = _mm256_maskz_loadu_epi16(amask, ptr_a##M); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ + _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ +} +#define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2) +#define LOAD_B_TAIL(M, N) {\ + __m256i ymm = _mm256_loadu_epi16(ptr_b##N); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ + _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ +} +#define MASK_LOAD_B_TAIL(M, N) {\ + __m256i ymm = _mm256_maskz_loadu_epi16(bmask, ptr_b##N); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ + _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ +} + +#define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N) +#define MATMUL_TAIL(M, N) _tile_dpbf16ps(T_C00, T_A##M, T_B##N) +#define STORE_C(M, N) _tile_stored(T_C##M##N, ptr_c##M##N, ldc * 4) +#define LOAD_C_F(M, N) _tile_loadd(T_C##M##N, ptr_c##M##N, ldc * 4) + +#endif // end of SBGEMM_KERNEL_SPR + +#ifdef ALPHA_ONE +#undef LOAD_C +#define LOAD_C(M, N) _tile_loadd(T_C##M##N, ptr_c##M##N, ldc * 4) +#else +#undef LOAD_C +#define LOAD_C(M, N) _tile_zero(T_C##M##N) +#define ALPHA_STORE(N) \ + __m512 zmm_d##N = _mm512_loadu_ps(dst##N + noffset); \ + __m512 zmm_s##N = _mm512_loadu_ps(src##N + noffset); \ + zmm_d##N = _mm512_fmadd_ps(alpha_512, zmm_s##N, zmm_d##N); \ + _mm512_storeu_ps(dst##N + noffset, zmm_d##N); +#define MASK_APLPHA_STORE(N) \ + __m512 zmm_d##N = _mm512_maskz_loadu_ps(mask, dst##N + noffset); \ + __m512 zmm_s##N = _mm512_maskz_loadu_ps(mask, src##N + noffset); \ + zmm_d##N = _mm512_fmadd_ps(alpha_512, zmm_s##N, zmm_d##N); \ + _mm512_mask_storeu_ps(dst##N + noffset, mask, zmm_d##N); +#endif // end of ALPHA_ONE + + +#ifdef ALPHA_ONE +int sbgemm_kernel_spr_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#else +int sbgemm_kernel_spr_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#endif +{ + /* Row Major matrix for AMX requirement */ + IFLOAT *ptr_a = A, *ptr_b = B; + IFLOAT *ptr_b0, *ptr_b1; + IFLOAT *ptr_a0, *ptr_a1; + FLOAT *ptr_c = C; + FLOAT *ptr_c00, *ptr_c01, *ptr_c10, *ptr_c11; + + BLASLONG lda, ldb; + BLASLONG m_count = m; + BLASLONG n_count, k_count; + +#ifndef ALPHA_ONE + FLOAT *tmp_c = malloc(sizeof(FLOAT) * m * n); + memset(tmp_c, 0, sizeof(FLOAT) * m * n); + ptr_c = tmp_c; + BLASLONG ldc_o = ldc; + ldc = n; +#endif + IFLOAT tail_a[32 * 2] __attribute__ ((aligned (64))); + IFLOAT tail_b[32 * 2] __attribute__ ((aligned (64))); + tilecfg cfg; + + if (k > 31) { + for (; m_count > 31; m_count -= 32) { + ptr_b = B; + + ptr_c00 = ptr_c; + ptr_c01 = ptr_c00 + 16; + ptr_c10 = ptr_c + 16 * ldc; + ptr_c11 = ptr_c10 + 16; + ptr_c += 32 * ldc; + n_count = n; + TCONF(cfg, 16, 16, 32); + for (; n_count > 31; n_count -= 32) { + ptr_a0 = ptr_a; + ptr_a1 = ptr_a + 16 * k; + + ptr_b0 = ptr_b; + ptr_b1 = ptr_b + 16 * k; + ptr_b += 32 * k; + + lda = 32; + ldb = 32; + LOAD_C(0, 0); LOAD_C(0, 1); + LOAD_C(1, 0); LOAD_C(1, 1); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); LOAD_A(1, x); + ptr_a0 += 
16 * 32; + ptr_a1 += 16 * 32; + LOAD_B(x, 0); LOAD_B(x, 1); + ptr_b0 += 16 * 32; + ptr_b1 += 16 * 32; + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + } + STORE_C(0, 0); STORE_C(0, 1); + STORE_C(1, 0); STORE_C(1, 1); + ptr_c00 += 32; + ptr_c01 += 32; + ptr_c10 += 32; + ptr_c11 += 32; + } + for (; n_count > 0; n_count -= 16) { + int tail_n = (n_count > 16) ? 16: n_count; + ptr_a0 = ptr_a; + ptr_a1 = ptr_a + 16 * k; + + ptr_b0 = ptr_b; + ptr_b += tail_n * k; + + lda = 32; + ldb = 2 * tail_n; + TCONF(cfg, 16, tail_n, 32); + LOAD_C(0, 0); + LOAD_C(1, 0); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); LOAD_A(1, x); + ptr_a0 += 16 * 32; + ptr_a1 += 16 * 32; + LOAD_B(x, 0); + ptr_b0 += tail_n * 32; + + MATMUL(0, 0); + MATMUL(1, 0); + } + STORE_C(0, 0); + STORE_C(1, 0); + ptr_c00 += tail_n; + ptr_c10 += tail_n; + } + ptr_a += 32 * k; + } + for (; m_count > 0; m_count -= 16) { + // process at most 16 m at a time + int tail_m = (m_count > 16) ? 16: m_count; + + ptr_b = B; + + ptr_c00 = ptr_c; + ptr_c01 = ptr_c00 + 16; + ptr_c += tail_m * ldc; + n_count = n; + TCONF(cfg, tail_m, 16, 32); + for (; n_count > 31; n_count -= 32) { + ptr_a0 = ptr_a; + + ptr_b0 = ptr_b; + ptr_b1 = ptr_b + 16 * k; + ptr_b += 32 * k; + + lda = 32; + ldb = 32; + LOAD_C(0, 0); LOAD_C(0, 1); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); + ptr_a0 += tail_m * 32; + LOAD_B(x, 0); LOAD_B(x, 1); + ptr_b0 += 16 * 32; + ptr_b1 += 16 * 32; + + MATMUL(0, 0); MATMUL(0, 1); + } + STORE_C(0, 0); STORE_C(0, 1); + ptr_c00 += 32; + ptr_c01 += 32; + } + for (; n_count > 0; n_count -= 16) { + int tail_n = (n_count > 16) ? 16: n_count; + ptr_a0 = ptr_a; + + ptr_b0 = ptr_b; + ptr_b += tail_n * k; + + lda = 32; + ldb = 2 * tail_n; + TCONF(cfg, tail_m, tail_n, 32); + LOAD_C(0, 0); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); + ptr_a0 += tail_m * 32; + LOAD_B(x, 0); + ptr_b0 += tail_n * 32; + + MATMUL(0, 0); + } + STORE_C(0, 0); + ptr_c00 += tail_n; + } + ptr_a += tail_m * k; + } + } + + // process for k < 32 + BLASLONG k32 = k & ~31; + BLASLONG k2 = k & ~1; + if (k32 != k) { + int remain_k2 = k2 - k32; + m_count = m; + ptr_a = A; +#ifndef ALPHA_ONE + ptr_c = tmp_c; +#else + ptr_c = C; +#endif + if (remain_k2 > 0 && k2 != k) { // k%32 = 2x + 1 (x != 0) + for (; m_count > 0; m_count -= 16) { + int tail_m = (m_count > 16) ? 16: m_count; + __mmask16 amask = (1UL << tail_m) - 1; + + ptr_a0 = ptr_a + tail_m * k32; + ptr_a1 = ptr_a + tail_m * k2; + ptr_a += tail_m * k; + ptr_b = B; + ptr_c00 = ptr_c; + ptr_c += tail_m * ldc; + n_count = n; + lda = remain_k2; + ldb = 32; + if (n_count > 15) { + TCONF_TAIL(cfg, tail_m, 16, remain_k2); + LOAD_A(0, x); MASK_LOAD_A_TAIL(1, x); + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k32; + ptr_b1 = ptr_b + 16 * k2; + LOAD_C_F(0, 0); + LOAD_B(x, 0); LOAD_B_TAIL(x, 1); + MATMUL(0, 0); MATMUL_TAIL(1, 1); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } + } + if (n_count > 0) { + int tail_n = (n_count > 16) ? 16: n_count; + __mmask16 bmask = (1UL << tail_n) - 1; + ptr_b0 = ptr_b + tail_n * k32; + ptr_b1 = ptr_b + tail_n * k2; + ldb = 2 * tail_n; + TCONF_TAIL(cfg, tail_m, tail_n, remain_k2); + LOAD_C_F(0, 0); + LOAD_A(0, x); MASK_LOAD_A_TAIL(1, x); + LOAD_B(x, 0); MASK_LOAD_B_TAIL(x, 1); + MATMUL(0, 0); MATMUL_TAIL(1, 1); + STORE_C(0, 0); + } + } + + } else if (remain_k2 > 0) { // k%32 = 2x + for (; m_count > 0; m_count -= 16) { + int tail_m = (m_count > 16) ? 
16: m_count; + + ptr_a0 = ptr_a + tail_m * k32; + ptr_a += tail_m * k; + ptr_b = B; + ptr_c00 = ptr_c; + ptr_c += tail_m * ldc; + n_count = n; + lda = remain_k2; + ldb = 32; + if (n_count > 15) { + TCONF(cfg, tail_m, 16, remain_k2); + LOAD_A(0, x); + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k32; + LOAD_C_F(0, 0); + LOAD_B(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } + } + if (n_count > 0) { + int tail_n = (n_count > 16) ? 16: n_count; + ptr_b0 = ptr_b + tail_n * k32; + ldb = 2 * tail_n; + TCONF(cfg, tail_m, tail_n, remain_k2); + LOAD_C_F(0, 0); + LOAD_A(0, x); + LOAD_B(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + } + } + } else { // k%32 = 1 + for (; m_count > 0; m_count -= 16) { + int tail_m = (m_count > 16) ? 16: m_count; + __mmask16 amask = (1UL << tail_m) - 1; + + ptr_a0 = ptr_a + tail_m * k2; + ptr_a += tail_m * k; + ptr_b = B; + ptr_c00 = ptr_c; + ptr_c += tail_m * ldc; + n_count = n; + if (n_count > 15) { + TCONF(cfg, tail_m, 16, 2); + MASK_LOAD_A_TAIL(0, x); + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k2; + LOAD_C_F(0, 0); + LOAD_B_TAIL(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } + } + if (n_count > 0) { + int tail_n = (n_count > 16) ? 16: n_count; + __mmask16 bmask = (1UL << tail_n) - 1; + ptr_b0 = ptr_b + tail_n * k2; + TCONF(cfg, tail_m, tail_n, 2); + LOAD_C_F(0, 0); + MASK_LOAD_A_TAIL(0, x); + MASK_LOAD_B_TAIL(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + } + } + + } + } +#ifndef ALPHA_ONE + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); + BLASLONG n16 = n & ~15; + BLASLONG noffset; + FLOAT *src0, *src1, *src2, *src3; + FLOAT *dst0, *dst1, *dst2, *dst3; + FLOAT *src = tmp_c; + FLOAT *dst = C; + m_count = m; + for (; m_count > 3; m_count -= 4) { + src0 = src; + src1 = src0 + ldc; + src2 = src1 + ldc; + src3 = src2 + ldc; + src += 4 * ldc; + + dst0 = dst; + dst1 = dst0 + ldc_o; + dst2 = dst1 + ldc_o; + dst3 = dst2 + ldc_o; + dst += 4 * ldc_o; + + noffset = 0; + for (; noffset < n16; noffset += 16) { + ALPHA_STORE(0); + ALPHA_STORE(1); + ALPHA_STORE(2); + ALPHA_STORE(3); + } + if (noffset < n) { + __mmask16 mask = (1UL << (n - noffset)) - 1; + MASK_APLPHA_STORE(0); + MASK_APLPHA_STORE(1); + MASK_APLPHA_STORE(2); + MASK_APLPHA_STORE(3); + } + } + for (; m_count > 1; m_count -= 2) { + src0 = src; + src1 = src0 + ldc; + src += 2 * ldc; + + dst0 = dst; + dst1 = dst0 + ldc_o; + dst += 2 * ldc_o; + + noffset = 0; + for (; noffset < n16; noffset += 16) { + ALPHA_STORE(0); + ALPHA_STORE(1); + } + if (noffset < n) { + __mmask16 mask = (1UL << (n - noffset)) - 1; + MASK_APLPHA_STORE(0); + MASK_APLPHA_STORE(1); + } + } + for (; m_count > 0; m_count -= 1) { + src0 = src; + dst0 = dst; + noffset = 0; + for (; noffset < n16; noffset += 16) { + ALPHA_STORE(0); + } + if (noffset < n) { + __mmask16 mask = (1UL << (n - noffset)) - 1; + MASK_APLPHA_STORE(0); + } + } + free(tmp_c); +#endif + return 0; +} From 6bc8204ce5c137cc18c2580eabc30264d4b8b2fe Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Fri, 17 Sep 2021 23:59:32 -0700 Subject: [PATCH 508/681] sbgemm: spr: optimization for tmp_c buffer --- kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c index 465b9eb75..90e0a32c7 100644 --- a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c +++ b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c @@ -170,11 +170,20 @@ int 
sbgemm_kernel_spr_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFL BLASLONG n_count, k_count; #ifndef ALPHA_ONE - FLOAT *tmp_c = malloc(sizeof(FLOAT) * m * n); - memset(tmp_c, 0, sizeof(FLOAT) * m * n); + // make sure each row is 64 bytes aligned + BLASLONG cn = (n & 31) ? (n & ~31) + 32 : n; + FLOAT *raw_tmp_c; + if (k < 32) { + // only need to zero buff in this situation + raw_tmp_c = (FLOAT *)calloc(1, sizeof(FLOAT) * m * cn + 64); + } else { + raw_tmp_c = (FLOAT *)malloc(sizeof(FLOAT) * m * cn + 64); + } + // align buf to 64 byte boundary + FLOAT *tmp_c = (FLOAT *)(((uintptr_t) raw_tmp_c + 63) & ~(uintptr_t)63); ptr_c = tmp_c; BLASLONG ldc_o = ldc; - ldc = n; + ldc = cn; #endif IFLOAT tail_a[32 * 2] __attribute__ ((aligned (64))); IFLOAT tail_b[32 * 2] __attribute__ ((aligned (64))); @@ -515,7 +524,7 @@ int sbgemm_kernel_spr_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFL MASK_APLPHA_STORE(0); } } - free(tmp_c); + free(raw_tmp_c); #endif return 0; } From 8632380a96f9172a6bba4610d9014faf9fb0cd74 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Sat, 18 Sep 2021 01:11:31 -0700 Subject: [PATCH 509/681] sbgemm: spr: reuse ncopy_16 from cooperlake as incopy --- kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 2 +- kernel/x86_64/sbgemm_incopy_16_spr.c | 32 ---------------------------- 2 files changed, 1 insertion(+), 33 deletions(-) delete mode 100644 kernel/x86_64/sbgemm_incopy_16_spr.c diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS index 3f67640cb..e061b913d 100644 --- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS +++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS @@ -2,7 +2,7 @@ include $(KERNELDIR)/KERNEL.COOPERLAKE SBGEMM_BETA = sgemm_beta_skylakex.c SBGEMMKERNEL = sbgemm_kernel_16x16_spr.c -SBGEMMINCOPY = sbgemm_incopy_16_spr.c +SBGEMMINCOPY = sbgemm_ncopy_16_cooperlake.c SBGEMMITCOPY = sbgemm_tcopy_16_cooperlake.c SBGEMMONCOPY = sbgemm_oncopy_16_spr.c SBGEMMOTCOPY = sbgemm_otcopy_16_spr.c diff --git a/kernel/x86_64/sbgemm_incopy_16_spr.c b/kernel/x86_64/sbgemm_incopy_16_spr.c deleted file mode 100644 index 2f57ae7b6..000000000 --- a/kernel/x86_64/sbgemm_incopy_16_spr.c +++ /dev/null @@ -1,32 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2021, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE - * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * *****************************************************************************/ - -#include "common.h" - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { - return 0; -} From 82194ea9d2c8bc2f3e8521421904eeb3419c3ab3 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Thu, 23 Sep 2021 01:08:40 -0700 Subject: [PATCH 510/681] sbgemm: spr: implement otcopy_16 --- kernel/x86_64/sbgemm_otcopy_16_spr.c | 270 +++++++++++++++++++++++++++ 1 file changed, 270 insertions(+) diff --git a/kernel/x86_64/sbgemm_otcopy_16_spr.c b/kernel/x86_64/sbgemm_otcopy_16_spr.c index 2f57ae7b6..b5d5d38fb 100644 --- a/kernel/x86_64/sbgemm_otcopy_16_spr.c +++ b/kernel/x86_64/sbgemm_otcopy_16_spr.c @@ -25,8 +25,278 @@ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * *****************************************************************************/ +#include #include "common.h" +#define LOAD_A_8VEC(aptr) \ + r0 = _mm256_loadu_si256((__m256i *)(aptr + lda*0)); \ + r1 = _mm256_loadu_si256((__m256i *)(aptr + lda*1)); \ + r2 = _mm256_loadu_si256((__m256i *)(aptr + lda*2)); \ + r3 = _mm256_loadu_si256((__m256i *)(aptr + lda*3)); \ + r4 = _mm256_loadu_si256((__m256i *)(aptr + lda*4)); \ + r5 = _mm256_loadu_si256((__m256i *)(aptr + lda*5)); \ + r6 = _mm256_loadu_si256((__m256i *)(aptr + lda*6)); \ + r7 = _mm256_loadu_si256((__m256i *)(aptr + lda*7)); + +#define MASK_LOAD_A_8VEC(aptr) \ + r0 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*0)); \ + r1 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*1)); \ + r2 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*2)); \ + r3 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*3)); \ + r4 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*4)); \ + r5 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*5)); \ + r6 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*6)); \ + r7 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*7)); + +#define SWITCH_LOAD_A_8VEC(aptr, cond) \ + switch((cond)) { \ + case 8: r7 = _mm256_loadu_si256((__m256i *)(aptr + lda*7)); \ + case 7: r6 = _mm256_loadu_si256((__m256i *)(aptr + lda*6)); \ + case 6: r5 = _mm256_loadu_si256((__m256i *)(aptr + lda*5)); \ + case 5: r4 = _mm256_loadu_si256((__m256i *)(aptr + lda*4)); \ + case 4: r3 = _mm256_loadu_si256((__m256i *)(aptr + lda*3)); \ + case 3: r2 = _mm256_loadu_si256((__m256i *)(aptr + lda*2)); \ + case 2: r1 = _mm256_loadu_si256((__m256i *)(aptr + lda*1)); \ + case 1: r0 = _mm256_loadu_si256((__m256i *)(aptr + lda*0)); \ + } + +#define SWITCH_MASK_LOAD_A_8VEC(aptr, cond) \ + switch((cond)) { \ + case 8: r7 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*7)); \ + case 7: r6 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*6)); \ + case 6: r5 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*5)); \ + case 5: r4 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*4)); \ + case 4: r3 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*3)); \ + case 3: r2 
= _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*2)); \ + case 2: r1 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*1)); \ + case 1: r0 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*0)); \ + } + +#define REORDER_8x16(t0, t1, t2, t3, t4, t5, t6, t7) \ + t0 = _mm256_unpacklo_epi16(r0, r1); \ + t1 = _mm256_unpackhi_epi16(r0, r1); \ + t2 = _mm256_unpacklo_epi16(r2, r3); \ + t3 = _mm256_unpackhi_epi16(r2, r3); \ + t4 = _mm256_unpacklo_epi16(r4, r5); \ + t5 = _mm256_unpackhi_epi16(r4, r5); \ + t6 = _mm256_unpacklo_epi16(r6, r7); \ + t7 = _mm256_unpackhi_epi16(r6, r7); \ + r0 = _mm256_unpacklo_epi32(t0, t2); \ + r1 = _mm256_unpacklo_epi32(t1, t3); \ + r2 = _mm256_unpacklo_epi32(t4, t6); \ + r3 = _mm256_unpacklo_epi32(t5, t7); \ + r4 = _mm256_unpackhi_epi32(t0, t2); \ + r5 = _mm256_unpackhi_epi32(t1, t3); \ + r6 = _mm256_unpackhi_epi32(t4, t6); \ + r7 = _mm256_unpackhi_epi32(t5, t7); \ + t0 = _mm256_unpacklo_epi64(r0, r2); \ + t1 = _mm256_unpackhi_epi64(r0, r2); \ + t2 = _mm256_unpacklo_epi64(r4, r6); \ + t3 = _mm256_unpackhi_epi64(r4, r6); \ + t4 = _mm256_unpacklo_epi64(r1, r3); \ + t5 = _mm256_unpackhi_epi64(r1, r3); \ + t6 = _mm256_unpacklo_epi64(r5, r7); \ + t7 = _mm256_unpackhi_epi64(r5, r7); + +#define STORE_256_LO(x) \ + v = _mm256_permute2x128_si256(t0##x, t1##x, 0x20); \ + _mm256_storeu_si256((__m256i *)(boffset + x*32), v); + +#define STORE_256_HI(x) \ + v = _mm256_permute2x128_si256(t0##x, t1##x, 0x31); \ + _mm256_storeu_si256((__m256i *)(boffset + (x + 8)*32), v); + +#define MASK_STORE_256_LO(x) \ + v = _mm256_permute2x128_si256(t0##x, t1##x, 0x20); \ + _mm256_mask_storeu_epi16(boffset + x*m_load, mmask, v); + +#define MASK_STORE_256_HI(x) \ + v = _mm256_permute2x128_si256(t0##x, t1##x, 0x31); \ + _mm256_mask_storeu_epi16(boffset + (x + 8)*m_load, mmask, v); + +#define STORE_256(x, y) {\ + __m256i v; \ + if (x == 0) { STORE_256_LO(y); } \ + else { STORE_256_HI(y); } \ +} + +#define MASK_STORE_256(x, y) {\ + __m256i v; \ + if (x == 0) { MASK_STORE_256_LO(y); } \ + else { MASK_STORE_256_HI(y); } \ +} + +#define SWITCH_STORE_16x(cond, func) \ + switch((cond)) {\ + case 15: func(1, 6); \ + case 14: func(1, 5); \ + case 13: func(1, 4); \ + case 12: func(1, 3); \ + case 11: func(1, 2); \ + case 10: func(1, 1); \ + case 9: func(1, 0); \ + case 8: func(0, 7); \ + case 7: func(0, 6); \ + case 6: func(0, 5); \ + case 5: func(0, 4); \ + case 4: func(0, 3); \ + case 3: func(0, 2); \ + case 2: func(0, 1); \ + case 1: func(0, 0); \ + } + + int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *aoffset, *boffset; + IFLOAT *aoffset00, *aoffset01, *aoffset10, *aoffset11; + IFLOAT *boffset0; + + __m256i r0, r1, r2, r3, r4, r5, r6, r7; + __m256i t00, t01, t02, t03, t04, t05, t06, t07; + __m256i t10, t11, t12, t13, t14, t15, t16, t17; + + aoffset = a; + boffset = b; + BLASLONG n_count = n; + BLASLONG m_count = m; + for (; n_count > 15; n_count -= 16) { + aoffset00 = aoffset; + aoffset01 = aoffset00 + 8 * lda; + aoffset10 = aoffset01 + 8 * lda; + aoffset11 = aoffset10 + 8 * lda; + aoffset += 16; + m_count = m; + for (; m_count > 31; m_count -= 32) { + // first 16 rows + LOAD_A_8VEC(aoffset00); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + LOAD_A_8VEC(aoffset01); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + STORE_256(0, 0); STORE_256(0, 1); STORE_256(0, 2); STORE_256(0, 3); + STORE_256(0, 4); STORE_256(0, 5); STORE_256(0, 6); STORE_256(0, 7); + STORE_256(1, 0); STORE_256(1, 1); STORE_256(1, 2); STORE_256(1, 3); + STORE_256(1, 
4); STORE_256(1, 5); STORE_256(1, 6); STORE_256(1, 7); + // last 16 rows + boffset += 16; + LOAD_A_8VEC(aoffset10); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + LOAD_A_8VEC(aoffset11); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + STORE_256(0, 0); STORE_256(0, 1); STORE_256(0, 2); STORE_256(0, 3); + STORE_256(0, 4); STORE_256(0, 5); STORE_256(0, 6); STORE_256(0, 7); + STORE_256(1, 0); STORE_256(1, 1); STORE_256(1, 2); STORE_256(1, 3); + STORE_256(1, 4); STORE_256(1, 5); STORE_256(1, 6); STORE_256(1, 7); + aoffset00 += 32 * lda; + aoffset01 += 32 * lda; + aoffset10 += 32 * lda; + aoffset11 += 32 * lda; + boffset += 31 * 16; + } + if (m_count > 1) { + int m_load = m_count & ~1; + m_count -= m_load; + __mmask16 mmask; + SWITCH_LOAD_A_8VEC(aoffset00, m_load > 8 ? 8: m_load); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + if (m_load > 8) { + SWITCH_LOAD_A_8VEC(aoffset01, m_load > 16 ? 8: m_load - 8); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + } + int this_load = m_load > 16 ? 16 : m_load; + mmask = (1UL << this_load) - 1; + MASK_STORE_256(0, 0); MASK_STORE_256(0, 1); MASK_STORE_256(0, 2); MASK_STORE_256(0, 3); + MASK_STORE_256(0, 4); MASK_STORE_256(0, 5); MASK_STORE_256(0, 6); MASK_STORE_256(0, 7); + MASK_STORE_256(1, 0); MASK_STORE_256(1, 1); MASK_STORE_256(1, 2); MASK_STORE_256(1, 3); + MASK_STORE_256(1, 4); MASK_STORE_256(1, 5); MASK_STORE_256(1, 6); MASK_STORE_256(1, 7); + boffset0 = boffset; + if (m_load > 16) { + boffset += this_load; + SWITCH_LOAD_A_8VEC(aoffset10, m_load > 24 ? 8: m_load - 16); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + if (m_load > 24) { + SWITCH_LOAD_A_8VEC(aoffset11, m_load - 24); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + } + this_load = m_load - 16; + mmask = (1UL << this_load) - 1; + MASK_STORE_256(0, 0); MASK_STORE_256(0, 1); MASK_STORE_256(0, 2); MASK_STORE_256(0, 3); + MASK_STORE_256(0, 4); MASK_STORE_256(0, 5); MASK_STORE_256(0, 6); MASK_STORE_256(0, 7); + MASK_STORE_256(1, 0); MASK_STORE_256(1, 1); MASK_STORE_256(1, 2); MASK_STORE_256(1, 3); + MASK_STORE_256(1, 4); MASK_STORE_256(1, 5); MASK_STORE_256(1, 6); MASK_STORE_256(1, 7); + } + boffset = boffset0 + 16 * m_load; + aoffset00 += m_load * lda; + } + if (m_count > 0) { + // just copy lask K to B directly + r0 = _mm256_loadu_si256((__m256i *)(aoffset00)); + _mm256_storeu_si256((__m256i *)(boffset), r0); + boffset += 16; + } + } + if (n_count > 0) { + __mmask16 nmask = (1UL << n_count) - 1; + aoffset00 = aoffset; + aoffset01 = aoffset00 + 8 * lda; + aoffset10 = aoffset01 + 8 * lda; + aoffset11 = aoffset10 + 8 * lda; + m_count = m; + for (; m_count > 31; m_count -= 32) { + // first 16 rows + MASK_LOAD_A_8VEC(aoffset00); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + MASK_LOAD_A_8VEC(aoffset01); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + SWITCH_STORE_16x(n_count, STORE_256); + // last 16 rows + boffset0 = boffset; + boffset += 16; + MASK_LOAD_A_8VEC(aoffset10); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + MASK_LOAD_A_8VEC(aoffset11); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + SWITCH_STORE_16x(n_count, STORE_256); + aoffset00 += 32 * lda; + aoffset01 += 32 * lda; + aoffset10 += 32 * lda; + aoffset11 += 32 * lda; + boffset = 32 * n_count + boffset0; + } + if (m_count > 1) { + int m_load = m_count & ~1; + m_count -= m_load; + __mmask16 mmask; + SWITCH_MASK_LOAD_A_8VEC(aoffset00, m_load > 8 ? 
8: m_load); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + if (m_load > 8) { + SWITCH_MASK_LOAD_A_8VEC(aoffset01, m_load > 16 ? 8: m_load - 8); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + } + int this_load = m_load > 16 ? 16 : m_load; + mmask = (1UL << this_load) - 1; + SWITCH_STORE_16x(n_count, MASK_STORE_256); + boffset0 = boffset; + if (m_load > 16) { + boffset += this_load; + SWITCH_MASK_LOAD_A_8VEC(aoffset10, m_load > 24 ? 8: m_load - 16); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + if (m_load > 24) { + SWITCH_MASK_LOAD_A_8VEC(aoffset11, m_load - 24); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + } + this_load = m_load - 16; + mmask = (1UL << this_load) - 1; + SWITCH_STORE_16x(n_count, MASK_STORE_256); + } + boffset = boffset0 + n_count * m_load; + aoffset00 += m_load * lda; + } + if (m_count > 0) { + // just copy lask K to B directly + r0 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aoffset00)); + _mm256_mask_storeu_epi16((__m256i *)(boffset), nmask, r0); + boffset += 16; + } + } return 0; } From 63a103ba6e8a55c4f117f99716d71ef341a03fa1 Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 12 Oct 2021 01:18:37 -0700 Subject: [PATCH 511/681] sbgemm: spr: disable small matrix path by default --- kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 2 + .../x86_64/sbgemm_small_kernel_permit_spr.c | 42 +++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 kernel/x86_64/sbgemm_small_kernel_permit_spr.c diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS index e061b913d..88f574668 100644 --- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS +++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS @@ -1,5 +1,7 @@ include $(KERNELDIR)/KERNEL.COOPERLAKE +SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_spr.c + SBGEMM_BETA = sgemm_beta_skylakex.c SBGEMMKERNEL = sbgemm_kernel_16x16_spr.c SBGEMMINCOPY = sbgemm_ncopy_16_cooperlake.c diff --git a/kernel/x86_64/sbgemm_small_kernel_permit_spr.c b/kernel/x86_64/sbgemm_small_kernel_permit_spr.c new file mode 100644 index 000000000..98d8ca06a --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_permit_spr.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include "sbgemm_block_microk_cooperlake.c" +// Define micro kernels for ALPHA not ONE scenarios +#undef ONE_ALPHA +#include "sbgemm_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA as ONE scenarios +#define ONE_ALPHA 1 +#include "sbgemm_microk_cooperlake_template.c" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + return 0; +} From 22bf5c27ba14e94f3b65e54cb17d86d56baab09c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 18 Oct 2021 15:00:19 +0200 Subject: [PATCH 512/681] Add basic support for the Fujitsu A64FX (#3415) * Add initial support for Fujitsu A64FX as generic ARMV8 --- Makefile.arm64 | 9 ++ cpuid_arm64.c | 305 ++++++++++++++++++++------------------ getarch.c | 18 +++ kernel/arm64/KERNEL.A64FX | 198 +++++++++++++++++++++++++ 4 files changed, 386 insertions(+), 144 deletions(-) create mode 100644 kernel/arm64/KERNEL.A64FX diff --git a/Makefile.arm64 b/Makefile.arm64 index 3e3466de8..e9ae23366 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -153,6 +153,15 @@ endif endif endif +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) +ifeq ($(CORE), A64FX) +CCOMMON_OPT += -march=armv8.2-a -mtune=a64fx +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=a64fx +endif +endif +endif + endif endif \ No newline at end of file diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 73a82d188..958e94abc 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -55,6 +55,8 @@ size_t length64=sizeof(value64); #define CPU_EMAG8180 10 // Apple #define CPU_VORTEX 13 +// Fujitsu +#define CPU_A64FX 15 static char *cpuname[] = { "UNKNOWN", @@ -71,7 +73,8 @@ static char *cpuname[] = { "NEOVERSEN1", "THUNDERX3T110", "VORTEX", - "CORTEXA55" + "CORTEXA55", + "A64FX" }; static char *cpuname_lower[] = { @@ -89,7 +92,8 @@ static char *cpuname_lower[] = { "neoversen1", "thunderx3t110", "vortex", - "cortexa55" + "cortexa55", + "a64fx" }; int get_feature(char *search) @@ -185,6 +189,9 @@ int detect(void) // Ampere else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000")) return CPU_EMAG8180; + // Fujitsu + else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001")) + return CPU_A64FX; } p = (char *) NULL ; @@ -287,156 +294,166 @@ void get_cpuconfig(void) switch (d) { - case CPU_CORTEXA53: - case CPU_CORTEXA55: - printf("#define %s\n", cpuname[d]); - // Fall-through - case CPU_ARMV8: - // Minimum parameters for ARMv8 (based on A53) - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L2_SIZE 262144\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 4\n"); + case CPU_CORTEXA53: + case CPU_CORTEXA55: + printf("#define %s\n", cpuname[d]); + // Fall-through + case CPU_ARMV8: + // Minimum parameters for ARMv8 (based on A53) + printf("#define 
L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 262144\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); break; - case CPU_CORTEXA57: - case CPU_CORTEXA72: - case CPU_CORTEXA73: + case CPU_CORTEXA57: + case CPU_CORTEXA72: + case CPU_CORTEXA73: // Common minimum settings for these Arm cores // Can change a lot, but we need to be conservative // TODO: detect info from /sys if possible - printf("#define %s\n", cpuname[d]); - printf("#define L1_CODE_SIZE 49152\n"); - printf("#define L1_CODE_LINESIZE 64\n"); - printf("#define L1_CODE_ASSOCIATIVE 3\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L1_DATA_ASSOCIATIVE 2\n"); - printf("#define L2_SIZE 524288\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - break; - case CPU_NEOVERSEN1: - printf("#define %s\n", cpuname[d]); - printf("#define L1_CODE_SIZE 65536\n"); - printf("#define L1_CODE_LINESIZE 64\n"); - printf("#define L1_CODE_ASSOCIATIVE 4\n"); - printf("#define L1_DATA_SIZE 65536\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L1_DATA_ASSOCIATIVE 4\n"); - printf("#define L2_SIZE 1048576\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - break; - - case CPU_FALKOR: - printf("#define FALKOR\n"); - printf("#define L1_CODE_SIZE 65536\n"); - printf("#define L1_CODE_LINESIZE 64\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 128\n"); - printf("#define L2_SIZE 524288\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - break; - - case CPU_THUNDERX: - printf("#define THUNDERX\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 128\n"); - printf("#define L2_SIZE 16777216\n"); - printf("#define L2_LINESIZE 128\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - break; - - case CPU_THUNDERX2T99: - printf("#define THUNDERX2T99 \n"); - printf("#define L1_CODE_SIZE 32768 \n"); - printf("#define L1_CODE_LINESIZE 64 \n"); - printf("#define L1_CODE_ASSOCIATIVE 8 \n"); - printf("#define L1_DATA_SIZE 32768 \n"); - printf("#define L1_DATA_LINESIZE 64 \n"); - printf("#define L1_DATA_ASSOCIATIVE 8 \n"); - printf("#define L2_SIZE 262144 \n"); - printf("#define L2_LINESIZE 64 \n"); - printf("#define L2_ASSOCIATIVE 8 \n"); - printf("#define L3_SIZE 33554432 \n"); - printf("#define L3_LINESIZE 64 \n"); - printf("#define L3_ASSOCIATIVE 32 \n"); - printf("#define DTB_DEFAULT_ENTRIES 64 \n"); - printf("#define DTB_SIZE 4096 \n"); - break; + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 49152\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 3\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 2\n"); + printf("#define L2_SIZE 524288\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; + case 
CPU_NEOVERSEN1: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; + + case CPU_FALKOR: + printf("#define FALKOR\n"); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 128\n"); + printf("#define L2_SIZE 524288\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + break; + + case CPU_THUNDERX: + printf("#define THUNDERX\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 128\n"); + printf("#define L2_SIZE 16777216\n"); + printf("#define L2_LINESIZE 128\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + break; + + case CPU_THUNDERX2T99: + printf("#define THUNDERX2T99 \n"); + printf("#define L1_CODE_SIZE 32768 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 8 \n"); + printf("#define L1_DATA_SIZE 32768 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 8 \n"); + printf("#define L2_SIZE 262144 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define L3_SIZE 33554432 \n"); + printf("#define L3_LINESIZE 64 \n"); + printf("#define L3_ASSOCIATIVE 32 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; - case CPU_TSV110: - printf("#define TSV110 \n"); - printf("#define L1_CODE_SIZE 65536 \n"); - printf("#define L1_CODE_LINESIZE 64 \n"); - printf("#define L1_CODE_ASSOCIATIVE 4 \n"); - printf("#define L1_DATA_SIZE 65536 \n"); - printf("#define L1_DATA_LINESIZE 64 \n"); - printf("#define L1_DATA_ASSOCIATIVE 4 \n"); - printf("#define L2_SIZE 524228 \n"); - printf("#define L2_LINESIZE 64 \n"); - printf("#define L2_ASSOCIATIVE 8 \n"); - printf("#define DTB_DEFAULT_ENTRIES 64 \n"); - printf("#define DTB_SIZE 4096 \n"); - break; - - case CPU_EMAG8180: - // Minimum parameters for ARMv8 (based on A53) - printf("#define EMAG8180\n"); - printf("#define L1_CODE_SIZE 32768\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L2_SIZE 262144\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - break; - - case CPU_THUNDERX3T110: - printf("#define THUNDERX3T110 \n"); - printf("#define L1_CODE_SIZE 65536 \n"); - printf("#define L1_CODE_LINESIZE 64 \n"); - printf("#define L1_CODE_ASSOCIATIVE 8 \n"); - printf("#define L1_DATA_SIZE 32768 \n"); - printf("#define L1_DATA_LINESIZE 64 \n"); - printf("#define L1_DATA_ASSOCIATIVE 8 \n"); - printf("#define L2_SIZE 524288 \n"); - printf("#define L2_LINESIZE 64 \n"); - printf("#define L2_ASSOCIATIVE 8 \n"); - printf("#define L3_SIZE 94371840 \n"); - printf("#define L3_LINESIZE 64 \n"); - printf("#define L3_ASSOCIATIVE 32 \n"); - printf("#define DTB_DEFAULT_ENTRIES 64 \n"); - printf("#define DTB_SIZE 4096 
\n"); - break; + case CPU_TSV110: + printf("#define TSV110 \n"); + printf("#define L1_CODE_SIZE 65536 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 4 \n"); + printf("#define L1_DATA_SIZE 65536 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 4 \n"); + printf("#define L2_SIZE 524228 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; + + case CPU_EMAG8180: + // Minimum parameters for ARMv8 (based on A53) + printf("#define EMAG8180\n"); + printf("#define L1_CODE_SIZE 32768\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 262144\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; + + case CPU_THUNDERX3T110: + printf("#define THUNDERX3T110 \n"); + printf("#define L1_CODE_SIZE 65536 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 8 \n"); + printf("#define L1_DATA_SIZE 32768 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 8 \n"); + printf("#define L2_SIZE 524288 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define L3_SIZE 94371840 \n"); + printf("#define L3_LINESIZE 64 \n"); + printf("#define L3_ASSOCIATIVE 32 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; #ifdef __APPLE__ - case CPU_VORTEX: - printf("#define VORTEX \n"); - sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); - printf("#define L1_CODE_SIZE %lld \n",value64); - sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); - printf("#define L1_CODE_LINESIZE %lld \n",value64); - sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); - printf("#define L1_DATA_SIZE %lld \n",value64); - sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); - printf("#define L2_SIZE %lld \n",value64); - printf("#define DTB_DEFAULT_ENTRIES 64 \n"); - printf("#define DTB_SIZE 4096 \n"); - break; + case CPU_VORTEX: + printf("#define VORTEX \n"); + sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); + printf("#define L1_CODE_SIZE %lld \n",value64); + sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); + printf("#define L1_CODE_LINESIZE %lld \n",value64); + sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); + printf("#define L1_DATA_SIZE %lld \n",value64); + sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); + printf("#define L2_SIZE %lld \n",value64); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; #endif + case CPU_A64FX: + printf("#define A64FX\n"); + printf("#define L1_CODE_SIZE 65535\n"); + printf("#define L1_DATA_SIZE 65535\n"); + printf("#define L1_DATA_LINESIZE 256\n"); + printf("#define L2_SIZE 8388608\n"); + printf("#define L2_LINESIZE 256\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; } get_cpucount(); } diff --git a/getarch.c b/getarch.c index d095472a6..60bfe05ce 100644 --- a/getarch.c +++ b/getarch.c @@ -1424,6 +1424,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "VORTEX" #endif +#ifdef FORCE_A64FX +#define ARMV8 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "A64FX" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DA64FX " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=256 -DL1_CODE_ASSOCIATIVE=8 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=256 -DL1_DATA_ASSOCIATIVE=8 " \ + "-DL2_SIZE=8388608 -DL2_LINESIZE=256 -DL2_ASSOCIATIVE=8 " \ + "-DL3_SIZE=0 -DL3_LINESIZE=0 -DL3_ASSOCIATIVE=0 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "a64fx" +#define CORENAME "A64FX" +#else +#endif + #ifdef FORCE_ZARCH_GENERIC #define FORCE #define ARCHITECTURE "ZARCH" diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX new file mode 100644 index 000000000..c8a53c86b --- /dev/null +++ b/kernel/arm64/KERNEL.A64FX @@ -0,0 +1,198 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S 
+STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) From 6975cbe1f02531ef3ba58b5ea4fdf2150771e2cc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Oct 2021 23:23:40 +0200 Subject: [PATCH 513/681] Enable SVE for A64FX --- Makefile.arm64 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index e9ae23366..a07d0892b 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -155,13 +155,13 @@ endif ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) ifeq ($(CORE), A64FX) -CCOMMON_OPT += 
-march=armv8.2-a -mtune=a64fx +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=armv8.2-a -mtune=a64fx +FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx endif endif endif endif -endif \ No newline at end of file +endif From 2845f54eb8ad4c79d0374b52cceac434fc93d8ac Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 20 Oct 2021 10:50:02 +0200 Subject: [PATCH 514/681] Remove dangerous optimization from previous #3252 - buffer is never unused here --- interface/ztrsv.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/interface/ztrsv.c b/interface/ztrsv.c index cf750b0b0..cbb7bba13 100644 --- a/interface/ztrsv.c +++ b/interface/ztrsv.c @@ -199,12 +199,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (n == 0) return; - if (incx == 1 && trans == 0 && n < 50) { - buffer = NULL; - (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); - return; - } - IDEBUG_START; FUNCTION_PROFILE_START(); From 4b3769823ac865e66eafe7724f95873cca236751 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 24 Oct 2021 23:57:06 +0200 Subject: [PATCH 515/681] Revert #3252 --- interface/trsv.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/interface/trsv.c b/interface/trsv.c index 6a6e8f8ba..a054d8eeb 100644 --- a/interface/trsv.c +++ b/interface/trsv.c @@ -188,12 +188,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (n == 0) return; - if (incx == 1 && trans == 0 && n < 50) { - buffer = NULL; - (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); - return; - } - IDEBUG_START; FUNCTION_PROFILE_START(); From 44950ca173176d696f0c6e952a0b4aa7876dbd36 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 26 Oct 2021 15:19:49 +0200 Subject: [PATCH 516/681] s390x: use DYNAMIC_ARCH's cpu detection for compile-time choice On s390x, the run-time detection for DYNAMIC_ARCH and the compile-time choice in cpuid_zarch use different methods for identifying the supported CPU features. To make cpuid_zarch future-proof and both easier to maintain, switch cpuid_zarch to the same mechanism as DYNAMIC_ZARCH (i.e., derive the supported CPU features from hwcap flags) and share code between both (in a new header cpuid_zarch.h). 
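For reference, a minimal standalone sketch of the shared hwcap-based
detection (illustrative only, not part of this patch; assumes glibc >= 2.16
for getauxval() and assumes the bit value HWCAP_S390_VX = 2048, while
HWCAP_S390_VXE = 8192 matches the fallback already present in
dynamic_zarch.c):

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/auxv.h>

    #ifndef HWCAP_S390_VX
    #define HWCAP_S390_VX 2048    /* Vector Facility (z13), assumed bit value */
    #endif
    #ifndef HWCAP_S390_VXE
    #define HWCAP_S390_VXE 8192   /* Vector-Enhancements Facility 1 (z14) */
    #endif

    int main(void) {
        unsigned long hwcap = getauxval(AT_HWCAP);
        char *mask = getenv("LD_HWCAP_MASK");  /* honor user masking, as below */
        if (mask) hwcap &= strtoul(mask, NULL, 0);
        if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
            puts("Z14");
        else if (hwcap & HWCAP_S390_VX)
            puts("Z13");
        else
            puts("ZARCH_GENERIC");
        return 0;
    }

Running this with LD_HWCAP_MASK=0 should print ZARCH_GENERIC on any s390x
machine, which is a quick way to exercise the masking path.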
Signed-off-by: Marius Hillenbrand --- cpuid_zarch.c | 48 +---------------------- cpuid_zarch.h | 67 +++++++++++++++++++++++++++++++++ driver/others/dynamic_zarch.c | 71 +++++++++-------------------------- 3 files changed, 86 insertions(+), 100 deletions(-) create mode 100644 cpuid_zarch.h diff --git a/cpuid_zarch.c b/cpuid_zarch.c index df3b7898f..a6b953dd9 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -27,57 +27,11 @@ #include -#define CPU_GENERIC 0 -#define CPU_Z13 1 -#define CPU_Z14 2 -#define CPU_Z15 3 +#include "cpuid_zarch.h" -static char *cpuname[] = { - "ZARCH_GENERIC", - "Z13", - "Z14", - "Z15" -}; - -static char *cpuname_lower[] = { - "zarch_generic", - "z13", - "z14", - "z15" -}; - -int detect(void) -{ - FILE *infile; - char buffer[512], *p; - - p = (char *)NULL; - infile = fopen("/proc/sysinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("Type", buffer, 4)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if (strstr(p, "2964")) return CPU_Z13; - if (strstr(p, "2965")) return CPU_Z13; - if (strstr(p, "3906")) return CPU_Z14; - if (strstr(p, "3907")) return CPU_Z14; - if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14 - if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14 - - return CPU_GENERIC; -} void get_libname(void) { - int d = detect(); printf("%s", cpuname_lower[d]); } diff --git a/cpuid_zarch.h b/cpuid_zarch.h new file mode 100644 index 000000000..404d90e86 --- /dev/null +++ b/cpuid_zarch.h @@ -0,0 +1,67 @@ +#include + +#define CPU_GENERIC 0 +#define CPU_Z13 1 +#define CPU_Z14 2 +#define CPU_Z15 3 + +static char *cpuname[] = { + "ZARCH_GENERIC", + "Z13", + "Z14", + "Z15" +}; + +static char *cpuname_lower[] = { + "zarch_generic", + "z13", + "z14", + "z15" +}; + +// Guard the use of getauxval() on glibc version >= 2.16 +#ifdef __GLIBC__ +#include +#if __GLIBC_PREREQ(2, 16) +#include +#define HAVE_GETAUXVAL 1 + +static unsigned long get_hwcap(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + char *maskenv; + + // honor requests for not using specific CPU features in LD_HWCAP_MASK + maskenv = getenv("LD_HWCAP_MASK"); + if (maskenv) + hwcap &= strtoul(maskenv, NULL, 0); + + return hwcap; + // note that a missing auxval is interpreted as no capabilities + // available, which is safe. +} + +#else // __GLIBC_PREREQ(2, 16) +#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" + +static unsigned long get_hwcap(void) { + // treat missing support for getauxval() as no capabilities available, + // which is safe. 
+ return 0; +} +#endif // __GLIBC_PREREQ(2, 16) +#endif // __GLIBC + +static int detect(void) +{ + unsigned long hwcap = get_hwcap(); + + if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) + return CPU_Z14; + + if (hwcap & HWCAP_S390_VX) + return CPU_Z13; + + return CPU_GENERIC; +} + diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index bf5eab9b2..ad748f14c 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -1,38 +1,7 @@ #include "common.h" +#include "cpuid_zarch.h" #include -// Guard the use of getauxval() on glibc version >= 2.16 -#ifdef __GLIBC__ -#include -#if __GLIBC_PREREQ(2, 16) -#include -#define HAVE_GETAUXVAL 1 - -static unsigned long get_hwcap(void) -{ - unsigned long hwcap = getauxval(AT_HWCAP); - char *maskenv; - - // honor requests for not using specific CPU features in LD_HWCAP_MASK - maskenv = getenv("LD_HWCAP_MASK"); - if (maskenv) - hwcap &= strtoul(maskenv, NULL, 0); - - return hwcap; - // note that a missing auxval is interpreted as no capabilities - // available, which is safe. -} - -#else // __GLIBC_PREREQ(2, 16) -#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" - -static unsigned long get_hwcap(void) { - // treat missing support for getauxval() as no capabilities available, - // which is safe. - return 0; -} -#endif // __GLIBC_PREREQ(2, 16) -#endif // __GLIBC extern gotoblas_t gotoblas_ZARCH_GENERIC; #ifdef DYN_Z13 @@ -46,23 +15,16 @@ extern gotoblas_t gotoblas_Z14; extern void openblas_warning(int verbose, const char* msg); -static char* corename[] = { - "unknown", - "Z13", - "Z14", - "ZARCH_GENERIC", -}; - char* gotoblas_corename(void) { #ifdef DYN_Z13 - if (gotoblas == &gotoblas_Z13) return corename[1]; + if (gotoblas == &gotoblas_Z13) return cpuname[CPU_Z13]; #endif #ifdef DYN_Z14 - if (gotoblas == &gotoblas_Z14) return corename[2]; + if (gotoblas == &gotoblas_Z14) return cpuname[CPU_Z14]; #endif - if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; + if (gotoblas == &gotoblas_ZARCH_GENERIC) return cpuname[CPU_GENERIC]; - return corename[0]; + return "unknown"; } #ifndef HWCAP_S390_VXE @@ -79,25 +41,28 @@ char* gotoblas_corename(void) { */ static gotoblas_t* get_coretype(void) { - unsigned long hwcap __attribute__((unused)) = get_hwcap(); + int cpu = detect(); -#ifdef DYN_Z14 + switch(cpu) { // z14 and z15 systems: exploit Vector Facility (SIMD) and // Vector-Enhancements Facility 1 (float SIMD instructions), if present. 
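/* A condensed view of the restructured get_coretype() below: the switch
   relies on deliberate fall-through, so a level that is not compiled in
   (DYN_Z14 or DYN_Z13 undefined) degrades to the next-best case and,
   ultimately, to the generic fallback:

       switch (cpu) {
       case CPU_Z14:
       #ifdef DYN_Z14
           return &gotoblas_Z14;
       #endif                      // not built: fall through
       case CPU_Z13:
       #ifdef DYN_Z13
           return &gotoblas_Z13;
       #endif                      // not built: fall through
       default:
           return &gotoblas_ZARCH_GENERIC;
       }
*/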
- if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) + case CPU_Z14: +#ifdef DYN_Z14 return &gotoblas_Z14; #endif -#ifdef DYN_Z13 // z13: Vector Facility (SIMD for double) - if (hwcap & HWCAP_S390_VX) + case CPU_Z13: +#ifdef DYN_Z13 return &gotoblas_Z13; #endif + default: // fallback in case of missing compiler support, systems before z13, or // when the OS does not advertise support for the Vector Facility (e.g., // missing support in the OS kernel) - return &gotoblas_ZARCH_GENERIC; + return &gotoblas_ZARCH_GENERIC; + } } static gotoblas_t* force_coretype(char* coretype) { @@ -108,28 +73,28 @@ static gotoblas_t* force_coretype(char* coretype) { for (i = 0; i < NUM_CORETYPES; i++) { - if (!strncasecmp(coretype, corename[i], 20)) + if (!strncasecmp(coretype, cpuname[i], 20)) { found = i; break; } } - if (found == 1) { + if (found == CPU_Z13) { #ifdef DYN_Z13 return &gotoblas_Z13; #else openblas_warning(1, "Z13 support not compiled in"); return NULL; #endif - } else if (found == 2) { + } else if (found == CPU_Z14) { #ifdef DYN_Z14 return &gotoblas_Z14; #else openblas_warning(1, "Z14 support not compiled in"); return NULL; #endif - } else if (found == 3) { + } else if (found == CPU_GENERIC) { return &gotoblas_ZARCH_GENERIC; } From ead476025df7cd160e9941364e90fe5b5d1c5113 Mon Sep 17 00:00:00 2001 From: Neutron3529 Date: Wed, 27 Oct 2021 14:16:37 +0800 Subject: [PATCH 517/681] auto-detect for Intel i7-11800H --- cpuid_x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index bb9d779bd..874acea80 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1459,6 +1459,7 @@ int get_cpuname(void){ case 8: switch (model) { case 12: // Tiger Lake + case 13: // Tiger Lake (11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz) if(support_avx512()) return CPUTYPE_SKYLAKEX; if(support_avx2()) @@ -2196,7 +2197,7 @@ int get_coretype(void){ break; case 9: case 8: - if (model == 12) { // Tiger Lake + if (model == 12 || model == 13) { // Tiger Lake if(support_avx512()) return CORE_SKYLAKEX; if(support_avx2()) From 1a10d3e09ded85be92a7b4860113c864d16e8172 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 27 Oct 2021 16:37:18 +0200 Subject: [PATCH 518/681] add sve dgemm prototype --- kernel/arm64/dgemm_kernel_sve_v1x8.S | 851 +++++++++++++++++++++++++++ 1 file changed, 851 insertions(+) create mode 100644 kernel/arm64/dgemm_kernel_sve_v1x8.S diff --git a/kernel/arm64/dgemm_kernel_sve_v1x8.S b/kernel/arm64/dgemm_kernel_sve_v1x8.S new file mode 100644 index 000000000..c2bbbee25 --- /dev/null +++ b/kernel/arm64/dgemm_kernel_sve_v1x8.S @@ -0,0 +1,851 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define alpha x17 + +#define alpha0 d10 +#define alphaZ z10.d +#define alphaV0 v10.d[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 pA0_2 +//v03 pA0_3 +//v04 pA0_4 +//v05 pA0_5 +//v06 pA0_6 +//v07 pA0_7 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 --> ALPHA0 +//v11 must save pB0_3 +//v12 must save pB1_0 +//v13 must save pB1_1 +//v14 must save pB1_2 +//v15 must save pB1_3 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x8_I + ld1d z0.d, p1/z, [pA] + ld1d z1.d, p1/z, [pA, x18, lsl #3] // next one + //incb pA, all, mul #2 + add pA, pA, x18, lsl #4 // pA = pA + cnt_active * 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, 
z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M1 + ld1d z1.d, p1/z, [pA] + add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M2 + ld1d z0.d, p1/z, [pA] + add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + + fmla z16.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_E + fmla z16.d, p1/m, z1.d, z8.d + fmla z17.d, p1/m, z1.d, z9.d + fmla z18.d, p1/m, z1.d, z10.d + fmla z19.d, p1/m, z1.d, z11.d + fmla z20.d, p1/m, z1.d, z12.d + fmla z21.d, p1/m, z1.d, z13.d + fmla z22.d, p1/m, z1.d, z14.d + fmla z23.d, p1/m, z1.d, z15.d +.endm + +.macro KERNELv1x8_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + fmla z20.d, p1/m, z0.d, z12.d + fmla z21.d, p1/m, z0.d, z13.d + fmla z22.d, p1/m, z0.d, z14.d + fmla z23.d, p1/m, z0.d, z15.d + +.endm + +.macro SAVEv1x8 + dup alphaZ, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + + add pCRow2, pCRow1, LDC + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + + add pCRow1, pCRow2, LDC + ld1d z28.d, p1/z, [pCRow2] + fmla z28.d, p1/m, z20.d, alphaZ + st1d z28.d, p1, [pCRow2] + + add pCRow2, pCRow1, LDC + ld1d z29.d, p1/z, [pCRow1] + fmla z29.d, p1/m, z21.d, alphaZ + st1d z29.d, p1, [pCRow1] + + add pCRow1, pCRow2, LDC + ld1d z30.d, p1/z, [pCRow2] + fmla z30.d, p1/m, z22.d, alphaZ + st1d z30.d, p1, [pCRow2] + + add pCRow2, pCRow1, LDC + ld1d z31.d, p1/z, [pCRow1] + fmla z31.d, p1/m, z23.d, alphaZ + st1d z31.d, p1, [pCRow1] + + add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup 
z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x4_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + +.endm + +.macro SAVEv1x4 + dup alphaZ, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + + add pCRow2, pCRow1, LDC + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + + add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + +.endm + +.macro SAVEv1x2 + dup alphaZ, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.d, #0 +.endm + +.macro KERNELv1x1_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p1/m, z0.d, z8.d + +.endm + +.macro SAVEv1x1 + dup alphaZ, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + + add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + ptrue p0.d // create true predicate + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldgemm_kernel_L4_BEGIN + +/******************************************************************************/ + + .align 5 +.Ldgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pCRow0, LDC, lsl #3 // add 8 x LDC + + mov pA, origPA // pA = start of A array + 
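// Two notes on the v1x8 kernel:
//
// 1) The KERNELv1x8_I/_M1/_M2/_E macros above form a two-stage software
//    pipeline: _I preloads two A vectors (z0 and z1), _M1 computes with z0
//    while loading the next vector into z1, _M2 does the converse, and _E
//    drains the pipeline with no further loads, hiding the predicated ld1d
//    latency behind the fmla chain of the unrolled K loop.
//
// 2) The M loop is vector-length agnostic: "whilelt p1.d, counterI, origM"
//    builds a lane predicate for the current strip, "cntp x18, p0, p1.d"
//    counts the active double lanes (x18 then scales the pA/pC strides),
//    and "incd counterI" advances by one vector of doubles, so the same
//    code runs at any SVE vector width. An equivalent C sketch of the idiom
//    with ACLE intrinsics (illustrative only, not part of this patch;
//    assumes arm_sve.h and -march=armv8.2-a+sve):
//
//      #include <arm_sve.h>
//      void daxpy_vla(long n, double alpha, const double *x, double *y) {
//          for (long i = 0; i < n; i += svcntd()) {     /* incd */
//              svbool_t pg = svwhilelt_b64(i, n);       /* whilelt p1.d */
//              svfloat64_t vx = svld1_f64(pg, x + i);   /* ld1d ..., p1/z */
//              svfloat64_t vy = svld1_f64(pg, y + i);
//              vy = svmla_n_f64_m(pg, vy, vx, alpha);   /* fmla ..., p1/m */
//              svst1_f64(pg, y + i, vy);
//          }
//      }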
+.Ldgemm_kernel_L8_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp x18, p0, p1.d + /* mov counterI, origM */ + /* asr counterI, counterI, #3 // counterI = counterI / 8 */ + /* cmp counterI, #0 */ + /* ble .Ldgemm_kernel_L4_M4_BEGIN */ + + .align 5 +.Ldgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Ldgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv1_22a + + .align 5 +.Ldgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv1_22 + + .align 5 +.Ldgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldgemm_kernel_L8_Mv1_44 + + .align 5 +.Ldgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldgemm_kernel_L8_Mv1_44 + +.Ldgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv1_100 + + .align 5 +.Ldgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv1_46 + +.Ldgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Ldgemm_kernel_L8_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp x18, p0, p1.d + b.any .Ldgemm_kernel_L8_Mv1_20 + +.Ldgemm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt .Ldgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pCRow0, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp x18, p0, p1.d + + .align 5 +.Ldgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ blt .Ldgemm_kernel_L4_Mv1_44 + + .align 5 +.Ldgemm_kernel_L4_Mv1_22: + + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv1_22 + +.Ldgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv1_100 + + .align 5 +.Ldgemm_kernel_L4_Mv1_46: + + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv1_46 + +.Ldgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Ldgemm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp x18, p0, p1.d + b.any .Ldgemm_kernel_L4_Mv1_20 + + +.Ldgemm_kernel_L4_END: + add origPB, origPB, origK, lsl #5 // B = B + K * 4 * 8 + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pCRow0, LDC, lsl #1 // add 2 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp x18, p0, p1.d + + .align 5 +.Ldgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + blt .Ldgemm_kernel_L2_Mv1_44 + + .align 5 +.Ldgemm_kernel_L2_Mv1_22: + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv1_22 + +.Ldgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv1_100 + + .align 5 +.Ldgemm_kernel_L2_Mv1_46: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv1_46 + +.Ldgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Ldgemm_kernel_L2_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp x18, p0, p1.d + b.any .Ldgemm_kernel_L2_Mv1_20 + + +.Ldgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pCRow0, LDC, lsl #1 // add 2 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp x18, p0, p1.d + + .align 5 +.Ldgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ blt .Ldgemm_kernel_L1_Mv1_44 + + .align 5 +.Ldgemm_kernel_L1_Mv1_22: + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_22 + +.Ldgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv1_100 + + .align 5 +.Ldgemm_kernel_L1_Mv1_46: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L1_Mv1_46 + +.Ldgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Ldgemm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp x18, p0, p1.d + b.any .Ldgemm_kernel_L1_Mv1_20 + + +.Ldgemm_kernel_L1_END: + +/******************************************************************************/ + +.Ldgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From 22a616bd8f9d89894fcb35dfd90414fea45e36c7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 27 Oct 2021 22:17:58 +0200 Subject: [PATCH 519/681] Add model number for Tiger Lake H (mobile variant) --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 071788a9b..9fa3d2dc2 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -663,7 +663,7 @@ static gotoblas_t *get_coretype(void){ return NULL; case 9: case 8: - if (model == 12) { // Tiger Lake + if (model == 12 || model == 13) { // Tiger Lake if (support_avx512()) return &gotoblas_SKYLAKEX; if(support_avx2()){ From 77747bc5361880a9c5df62131296a9c0d26cfb75 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Wed, 27 Oct 2021 17:26:28 +0200 Subject: [PATCH 520/681] cpuid_zarch/hwcaps: add documentation and dump hwcaps in init Add pointers to the definition of the hardware capability flags in glibc and describe how they relate to the levels CPU_Z13 and CPU_Z14 for optimized kernels. To aid identifying available hardware capabilities and in debugging potential build issues, dump their value in dynamic_arch_init() when OPENBLAS_VERBOSE is set to 2 or higher. Signed-off-by: Marius Hillenbrand --- cpuid_zarch.h | 34 ++++++++++++++++++++++++++++++++++ driver/others/dynamic_zarch.c | 14 +++++++++++--- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/cpuid_zarch.h b/cpuid_zarch.h index 404d90e86..686f2eb17 100644 --- a/cpuid_zarch.h +++ b/cpuid_zarch.h @@ -56,6 +56,40 @@ static int detect(void) { unsigned long hwcap = get_hwcap(); + // Choose the architecture level for optimized kernels based on hardware + // capability bits (just like glibc chooses optimized implementations). + // + // The hardware capability bits that are used here indicate both + // hardware support for a particular ISA extension and the presence of + // software support to enable its use. For example, when HWCAP_S390_VX + // is set then both the CPU can execute SIMD instructions and the Linux + // kernel can manage applications using the vector registers and SIMD + // instructions. 
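+ //
+ // A minimal sketch of such a query, assuming <sys/auxv.h> is available:
+ //
+ //     unsigned long hwcap = getauxval(AT_HWCAP);
+ //     if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
+ //         /* CPU_Z14-level kernels can be used */ ;
+ //
+ // This mirrors the check that detect() below performs on the value
+ // returned by get_hwcap().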
+ // + // See glibc's sysdeps/s390/dl-procinfo.h for an overview (also in + // sysdeps/unix/sysv/linux/s390/bits/hwcap.h) of the defined hardware + // capability bits. They are derived from the information that the + // "store facility list (extended)" instructions provide. + // (https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/s390/dl-procinfo.h;hb=HEAD) + // + // currently used: + // HWCAP_S390_VX - vector facility for z/Architecture (introduced with + // IBM z13), enables level CPU_Z13 (SIMD) + // HWCAP_S390_VXE - vector enhancements facility 1 (introduced with IBM + // z14), together with VX enables level CPU_Z14 + // (single-precision SIMD instructions) + // + // When you add optimized kernels that make use of other ISA extensions + // (e.g., for exploiting the vector-enhancements facility 2 that was introduced + // with IBM z15), then add a new architecture level (e.g., CPU_Z15) and gate + // it on the hwcap that represents it here (e.g., HWCAP_S390_VXRS_EXT2 + // for the z15 vector enhancements). + // + // To learn the value of hwcaps on a given system, set the environment + // variable LD_SHOW_AUXV and let ld.so dump it (e.g., by running + // LD_SHOW_AUXV=1 /bin/true). + // Also, the init function for dynamic arch support will print hwcaps + // when OPENBLAS_VERBOSE is set to 2 or higher. if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) return CPU_Z14; diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index ad748f14c..5b45aae2f 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -13,6 +13,7 @@ extern gotoblas_t gotoblas_Z14; #define NUM_CORETYPES 4 +extern int openblas_verbose(); extern void openblas_warning(int verbose, const char* msg); char* gotoblas_corename(void) { @@ -120,6 +121,11 @@ void gotoblas_dynamic_init(void) { else { gotoblas = get_coretype(); + if (openblas_verbose() >= 2) { + snprintf(coremsg, sizeof(coremsg), "Choosing kernels based on getauxval(AT_HWCAP)=0x%lx\n", + getauxval(AT_HWCAP)); + openblas_warning(2, coremsg); + } } if (gotoblas == NULL) @@ -130,9 +136,11 @@ void gotoblas_dynamic_init(void) { } if (gotoblas && gotoblas->init) { - strncpy(coren, gotoblas_corename(), 20); - sprintf(coremsg, "Core: %s\n", coren); - openblas_warning(2, coremsg); + if (openblas_verbose() >= 2) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + } gotoblas->init(); } else { From bb01e26cfea39d0952dcc467a6e399c534e7cf66 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 29 Oct 2021 16:39:03 +0200 Subject: [PATCH 521/681] Adjust compiler options for nvidia hpc 21.9 (and fix a long-standing typo in dynamic_arch settings) --- Makefile.system | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 833511fad..e76c04106 100644 --- a/Makefile.system +++ b/Makefile.system @@ -104,7 +104,7 @@ GETARCH_FLAGS += -DUSER_TARGET ifeq ($(TARGET), GENERIC) ifeq ($(DYNAMIC_ARCH), 1) override NO_EXPRECISION=1 -export NO_EXPRECiSION +export NO_EXPRECISION endif endif endif @@ -905,10 +905,21 @@ PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0- PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11) ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011)) NEWPGI := 1 +PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." 
-f 1` \> 21)
+PGCVERSIONGTEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 21)
+PGCMINORVERSIONGE9 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` ==9)
+PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE9)
+ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 110 111 011))
+NEWPGI2 := 1
+endif
 endif
 ifdef BINARY64
 ifeq ($(ARCH), x86_64)
+ifneq ($(NEWPGI2),1)
 CCOMMON_OPT += -tp p7-64
+else
+CCOMMON_OPT += -tp px
+endif
 ifneq ($(NEWPGI),1)
 CCOMMON_OPT += -D__MMX__ -Mnollvm
 endif
@@ -923,7 +934,11 @@ endif
 endif
 endif
 else
+ifneq ($(NEWPGI2),1)
 CCOMMON_OPT += -tp p7
+else
+CCOMMON_OPT += -tp px
+endif
 endif
 endif

From 9874cd11cb3c9390ac69f14130a368921d7a37ac Mon Sep 17 00:00:00 2001
From: Mehdi Chinoune
Date: Fri, 29 Oct 2021 21:28:21 +0100
Subject: [PATCH 522/681] Fix exported OpenBLASTargets.cmake

When both BUILD_SHARED_LIBS and BUILD_STATIC_LIBS are enabled, CMake
exports both of them to OpenBLASTargets under the same name
`OpenBLAS::OpenBLAS`, which leads to a fatal error about
OpenBLAS::OpenBLAS being both a static and a shared target. This change
makes CMake export only the shared library in that case.

There is another solution, treating them as components, but I am afraid
that would make it backward incompatible.
---
 CMakeLists.txt | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 656cc36f0..cdbb8c306 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -386,11 +386,22 @@ endif()
 # Install project

 # Install libraries
-install(TARGETS ${OpenBLAS_LIBS}
-        EXPORT "OpenBLAS${SUFFIX64}Targets"
-        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
+if(BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS)
+  install(TARGETS ${OpenBLAS_LIBNAME}_shared
+    EXPORT "OpenBLAS${SUFFIX64}Targets"
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
+  install(TARGETS ${OpenBLAS_LIBNAME}_static
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
+else()
+  install(TARGETS ${OpenBLAS_LIBS}
+    EXPORT "OpenBLAS${SUFFIX64}Targets"
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
+endif()

 # Install headers
 set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})

From 746b4f0f175051b67bfdd5197542871c23790e4b Mon Sep 17 00:00:00 2001
From: Bine Brank
Date: Sat, 30 Oct 2021 12:11:44 +0200
Subject: [PATCH 523/681] added SVE ncopy and tcopy

---
 kernel/arm64/dgemm_ncopy_sve_v1.c | 79 +++++++++++++++++++++++++++++++
 kernel/arm64/dgemm_tcopy_sve_v1.c | 78 ++++++++++++++++++++++++++++++
 2 files changed, 157 insertions(+)
 create mode 100644 kernel/arm64/dgemm_ncopy_sve_v1.c
 create mode 100644 kernel/arm64/dgemm_tcopy_sve_v1.c

diff --git a/kernel/arm64/dgemm_ncopy_sve_v1.c b/kernel/arm64/dgemm_ncopy_sve_v1.c
new file mode 100644
index 000000000..342812107
--- /dev/null
+++ b/kernel/arm64/dgemm_ncopy_sve_v1.c
@@ -0,0 +1,79 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved.
*/
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <arm_sve.h>
+
+// TODO: write in assembly with proper unrolling
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+
+  BLASLONG j;
+  IFLOAT *aoffset, *aoffset1, *boffset;
+
+  svint64_t lda_vec = svindex_s64(0LL, lda);
+  uint64_t sve_size = svcntd();
+
+  aoffset = a;
+  boffset = b;
+
+  j = 0;
+  svbool_t pg = svwhilelt_b64(j, n);
+  uint64_t active = svcntp_b64(svptrue_b64(), pg);
+  do {
+
+    aoffset1 = aoffset;
+
+    uint64_t i_cnt = m;
+    while (i_cnt--) {
+      svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
+      svst1_f64(pg, (double *) boffset, a_vec);
+      aoffset1++;
+      boffset += active;
+    }
+    aoffset += sve_size * lda;
+
+    j += svcntd();
+    pg = svwhilelt_b64(j, n);
+    active = svcntp_b64(svptrue_b64(), pg);
+
+
+  } while (svptest_any(svptrue_b64(), pg));
+
+  return 0;
+}
diff --git a/kernel/arm64/dgemm_tcopy_sve_v1.c b/kernel/arm64/dgemm_tcopy_sve_v1.c
new file mode 100644
index 000000000..21bfdf3db
--- /dev/null
+++ b/kernel/arm64/dgemm_tcopy_sve_v1.c
@@ -0,0 +1,78 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2.
Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <arm_sve.h>
+
+// TODO: write in assembly with proper unrolling
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+
+  BLASLONG j;
+  IFLOAT *aoffset, *aoffset1, *boffset;
+
+  svint64_t lda_vec = svindex_s64(0LL, lda);
+  uint64_t sve_size = svcntd();
+
+  aoffset = a;
+  boffset = b;
+
+  j = 0;
+  svbool_t pg = svwhilelt_b64(j, n);
+  uint64_t active = svcntp_b64(svptrue_b64(), pg);
+  do {
+
+    aoffset1 = aoffset;
+
+    uint64_t i_cnt = m;
+    while (i_cnt--) {
+      svfloat64_t a_vec = svld1(pg, (double *)aoffset1);
+      svst1_f64(pg, (double *) boffset, a_vec);
+      aoffset1 += lda;
+      boffset += active;
+    }
+    aoffset += sve_size;
+
+    j += svcntd();
+    pg = svwhilelt_b64(j, n);
+    active = svcntp_b64(svptrue_b64(), pg);
+
+  } while (svptest_any(svptrue_b64(), pg));
+
+  return 0;
+}

From a6fd497820510d9351b60d5d8b9b4347c92a7c97 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 30 Oct 2021 17:31:19 +0200
Subject: [PATCH 524/681] Fix nvidia HPC version checks

---
 Makefile.system | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index e76c04106..a97cca70c 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -900,16 +900,15 @@ endif

 ifeq ($(C_COMPILER), PGI)
 PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20)
-PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20)
-PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11)
+PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." 
-f 1` == 20) +PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11) PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11) -ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011)) +ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011)) NEWPGI := 1 PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21) -PGCVERSIONGTEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 21) -PGCMINORVERSIONGE9 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` ==9) -PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE9) -ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 110 111 011)) +PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21) +PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11) +ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011)) NEWPGI2 := 1 endif endif From a8fbdbac34f61c06a212876c07e89fb02b1c9dad Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 31 Oct 2021 10:24:25 +0100 Subject: [PATCH 525/681] fix sve dgemm kernel + sve dtrmm --- kernel/arm64/dgemm_kernel_sve_v1x8.S | 140 ++-- kernel/arm64/dtrmm_kernel_sve_v1x8.S | 1007 ++++++++++++++++++++++++++ 2 files changed, 1088 insertions(+), 59 deletions(-) create mode 100644 kernel/arm64/dtrmm_kernel_sve_v1x8.S diff --git a/kernel/arm64/dgemm_kernel_sve_v1x8.S b/kernel/arm64/dgemm_kernel_sve_v1x8.S index c2bbbee25..94682aea9 100644 --- a/kernel/arm64/dgemm_kernel_sve_v1x8.S +++ b/kernel/arm64/dgemm_kernel_sve_v1x8.S @@ -46,16 +46,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pCRow3 x15 + +#define lanes x15 #define pA x16 #define alpha x17 #define alpha0 d10 -#define alphaZ z10.d -#define alphaV0 v10.d[0] +#define alphaZ z2.d #define A_PRE_SIZE 2560 -#define B_PRE_SIZE 448 +#define B_PRE_SIZE 512 #define C_PRE_SIZE 128 // 00 origM @@ -73,9 +73,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pCRow3 +// 15 lanes // 16 pA -// 17 +// 17 // 18 must save // 19 must save // 20 must save @@ -93,20 +93,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v00 ALPHA -> pA0_0 //v01 pA0_1 -//v02 pA0_2 -//v03 pA0_3 -//v04 pA0_4 -//v05 pA0_5 -//v06 pA0_6 -//v07 pA0_7 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 //v08 must save pB0_0 //v09 must save pB0_1 -//v10 must save pB0_2 --> ALPHA0 +//v10 must save pB0_2 //v11 must save pB0_3 -//v12 must save pB1_0 -//v13 must save pB1_1 -//v14 must save pB1_2 -//v15 must save pB1_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 //v16 must save C0 //v17 must save C1 //v18 must save C2 @@ -133,9 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_I ld1d z0.d, p1/z, [pA] - ld1d z1.d, p1/z, [pA, x18, lsl #3] // next one + ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one //incb pA, all, mul #2 - add pA, pA, x18, lsl #4 // pA = pA + cnt_active * 2 * 8 + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 ld1rd z8.d, p0/z, [pB] ld1rd z9.d, p0/z, [pB, 8] @@ -157,12 +157,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmla z19.d, p1/m, z0.d, z11.d ld1rd z11.d, p0/z, [pB, 24] fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] ld1rd z12.d, p0/z, [pB, 32] fmla z21.d, p1/m, z0.d, z13.d ld1rd z13.d, p0/z, [pB, 40] fmla z22.d, p1/m, z0.d, z14.d ld1rd z14.d, p0/z, [pB, 48] fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] ld1rd z15.d, p0/z, [pB, 56] add pB, pB, 64 @@ -170,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_M1 ld1d z1.d, p1/z, [pA] - add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 fmla z16.d, p1/m, z0.d, z8.d ld1rd z8.d, p0/z, [pB] @@ -181,12 +183,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla z19.d, p1/m, z0.d, z11.d ld1rd z11.d, p0/z, [pB, 24] fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] ld1rd z12.d, p0/z, [pB, 32] fmla z21.d, p1/m, z0.d, z13.d ld1rd z13.d, p0/z, [pB, 40] fmla z22.d, p1/m, z0.d, z14.d ld1rd z14.d, p0/z, [pB, 48] fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] ld1rd z15.d, p0/z, [pB, 56] add pB, pB, 64 @@ -194,7 +198,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_M2 ld1d z0.d, p1/z, [pA] - add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 fmla z16.d, p1/m, z1.d, z8.d ld1rd z8.d, p0/z, [pB] @@ -206,6 +210,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1rd z11.d, p0/z, [pB, 24] fmla z20.d, p1/m, z1.d, z12.d ld1rd z12.d, p0/z, [pB, 32] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla z21.d, p1/m, z1.d, z13.d ld1rd z13.d, p0/z, [pB, 40] fmla z22.d, p1/m, z1.d, z14.d @@ -222,6 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla z18.d, p1/m, z1.d, z10.d fmla z19.d, p1/m, z1.d, z11.d fmla z20.d, p1/m, z1.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla z21.d, p1/m, z1.d, z13.d fmla z22.d, p1/m, z1.d, z14.d fmla z23.d, p1/m, z1.d, z15.d @@ -229,7 +235,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_SUB ld1d z0.d, p1/z, [pA] - add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 ld1rd z8.d, p0/z, [pB] ld1rd z9.d, p0/z, [pB, 8] @@ -245,16 +251,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla z16.d, p1/m, z0.d, z8.d fmla z17.d, p1/m, z0.d, z9.d fmla z18.d, p1/m, z0.d, z10.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla z19.d, p1/m, z0.d, z11.d fmla z20.d, p1/m, z0.d, z12.d fmla z21.d, p1/m, z0.d, z13.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] fmla z22.d, p1/m, z0.d, z14.d fmla z23.d, p1/m, z0.d, z15.d .endm .macro SAVEv1x8 - dup alphaZ, alpha prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] @@ -262,43 +269,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1d z24.d, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaZ st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1d z25.d, p1/z, [pCRow1] fmla z25.d, p1/m, z17.d, alphaZ st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1d z26.d, p1/z, [pCRow2] fmla z26.d, p1/m, z18.d, alphaZ st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1d z27.d, p1/z, [pCRow1] fmla z27.d, p1/m, z19.d, alphaZ st1d z27.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1d z28.d, p1/z, [pCRow2] fmla z28.d, p1/m, z20.d, alphaZ st1d z28.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1d z29.d, p1/z, [pCRow1] fmla z29.d, p1/m, z21.d, alphaZ st1d z29.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1d z30.d, p1/z, [pCRow2] fmla z30.d, p1/m, z22.d, alphaZ st1d z30.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - add pCRow2, pCRow1, LDC ld1d z31.d, p1/z, [pCRow1] fmla z31.d, p1/m, z23.d, alphaZ st1d z31.d, p1, [pCRow1] - add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 .endm @@ -313,7 +326,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x4_SUB ld1d z0.d, p1/z, [pA] - add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 ld1rd z8.d, p0/z, [pB] ld1rd z9.d, p0/z, [pB, 8] @@ -324,13 +337,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla z16.d, p1/m, z0.d, z8.d fmla z17.d, p1/m, z0.d, z9.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla z18.d, p1/m, z0.d, z10.d fmla z19.d, p1/m, z0.d, z11.d .endm .macro SAVEv1x4 - dup alphaZ, alpha prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] @@ -338,23 +351,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1d z24.d, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaZ st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC ld1d z25.d, p1/z, [pCRow1] fmla z25.d, p1/m, z17.d, alphaZ st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC ld1d z26.d, p1/z, [pCRow2] fmla z26.d, p1/m, z18.d, alphaZ st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - add pCRow2, pCRow1, LDC ld1d z27.d, p1/z, [pCRow1] fmla z27.d, p1/m, z19.d, alphaZ st1d z27.d, p1, [pCRow1] - add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 .endm @@ -367,7 +382,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x2_SUB ld1d z0.d, p1/z, [pA] - add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 ld1rd z8.d, p0/z, [pB] ld1rd z9.d, p0/z, [pB, 8] @@ -375,12 +390,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pB, pB, 16 fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] fmla z17.d, p1/m, z0.d, z9.d .endm .macro SAVEv1x2 - dup alphaZ, alpha prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] @@ -388,13 +403,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1d z24.d, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaZ st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - add pCRow2, pCRow1, LDC ld1d z25.d, p1/z, [pCRow1] fmla z25.d, p1/m, z17.d, alphaZ st1d z25.d, p1, [pCRow1] - add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 .endm @@ -406,28 +421,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x1_SUB ld1d z0.d, p1/z, [pA] - add pA, pA, x18, lsl #3 // pA = pA + cnt_active * 8 + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 ld1rd z8.d, p0/z, [pB] add pB, pB, 8 fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] .endm .macro SAVEv1x1 - dup alphaZ, alpha prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - add pCRow1, pCRow0, LDC ld1d z24.d, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaZ st1d z24.d, p1, [pCRow0] - add pCRow0, pCRow0, x18, lsl #3 // pC = pC + cnt_active * 8 + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 .endm @@ -456,6 +470,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prfm PLDL1KEEP, [origPA] fmov alpha, d0 + dup alphaZ, alpha lsl LDC, LDC, #3 // ldc = ldc * 8 ptrue p0.d // create true predicate @@ -473,7 +488,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Ldgemm_kernel_L8_BEGIN: mov pCRow0, pC - add pC, pCRow0, LDC, lsl #3 // add 8 x LDC + add pC, pC, LDC, lsl #3 // add 8 x LDC mov pA, origPA // pA = start of A array @@ -481,11 +496,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterI, #0 whilelt p1.d, counterI, origM //SVE instruction - cntp x18, p0, p1.d - /* mov counterI, origM */ - /* asr counterI, counterI, #3 // counterI = counterI / 8 */ - /* cmp counterI, #0 */ - /* ble .Ldgemm_kernel_L4_M4_BEGIN */ + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension .align 5 .Ldgemm_kernel_L8_Mv1_20: @@ -584,7 +595,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. incd counterI whilelt p1.d, counterI, origM //SVE instruction - cntp x18, p0, p1.d + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension b.any .Ldgemm_kernel_L8_Mv1_20 .Ldgemm_kernel_L8_END: @@ -608,7 +619,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pCRow0, pC - add pC, pCRow0, LDC, lsl #2 // add 4 x LDC + add pC, pC, LDC, lsl #2 // add 4 x LDC mov pA, origPA // pA = start of A array @@ -616,7 +627,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterI, #0 whilelt p1.d, counterI, origM //SVE instruction - cntp x18, p0, p1.d + cntp lanes, p0, p1.d .align 5 .Ldgemm_kernel_L4_Mv1_20: @@ -626,17 +637,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. asr counterL , origK, #3 // L = K / 8 cmp counterL , #0 // is there at least 4 to do? - blt .Ldgemm_kernel_L4_Mv1_44 + ble .Ldgemm_kernel_L4_Mv1_44 .align 5 .Ldgemm_kernel_L4_Mv1_22: + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB KERNELv1x4_SUB @@ -651,6 +666,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 5 .Ldgemm_kernel_L4_Mv1_46: + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x4_SUB subs counterL, counterL, #1 @@ -667,12 +683,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. incd counterI whilelt p1.d, counterI, origM //SVE instruction - cntp x18, p0, p1.d + cntp lanes, p0, p1.d b.any .Ldgemm_kernel_L4_Mv1_20 .Ldgemm_kernel_L4_END: - add origPB, origPB, origK, lsl #5 // B = B + K * 4 * 8 + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 /******************************************************************************/ /******************************************************************************/ @@ -686,7 +703,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pCRow0, pC - add pC, pCRow0, LDC, lsl #1 // add 2 x LDC + add pC, pC, LDC, lsl #1 // add 2 x LDC mov pA, origPA // pA = start of A array @@ -694,7 +711,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterI, #0 whilelt p1.d, counterI, origM //SVE instruction - cntp x18, p0, p1.d + cntp lanes, p0, p1.d .align 5 .Ldgemm_kernel_L2_Mv1_20: @@ -704,15 +721,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. asr counterL , origK, #3 // L = K / 8 cmp counterL , #0 // is there at least 4 to do? - blt .Ldgemm_kernel_L2_Mv1_44 + ble .Ldgemm_kernel_L2_Mv1_44 .align 5 .Ldgemm_kernel_L2_Mv1_22: + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x2_SUB KERNELv1x2_SUB KERNELv1x2_SUB KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x2_SUB KERNELv1x2_SUB KERNELv1x2_SUB @@ -729,6 +748,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 .Ldgemm_kernel_L2_Mv1_46: + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x2_SUB subs counterL, counterL, #1 @@ -745,7 +765,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. incd counterI whilelt p1.d, counterI, origM //SVE instruction - cntp x18, p0, p1.d + cntp lanes, p0, p1.d b.any .Ldgemm_kernel_L2_Mv1_20 @@ -764,7 +784,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pCRow0, pC - add pC, pCRow0, LDC, lsl #1 // add 2 x LDC + add pC, pC, LDC // add 1 x LDC mov pA, origPA // pA = start of A array @@ -772,7 +792,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterI, #0 whilelt p1.d, counterI, origM //SVE instruction - cntp x18, p0, p1.d + cntp lanes, p0, p1.d .align 5 .Ldgemm_kernel_L1_Mv1_20: @@ -781,12 +801,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. INITv1x1 // fill with zeros asr counterL , origK, #3 // L = K / 8 - cmp counterL , #0 // is there at least 4 to do? - blt .Ldgemm_kernel_L1_Mv1_44 + cmp counterL , #0 // is there at least 8 to do? + ble .Ldgemm_kernel_L1_Mv1_44 .align 5 .Ldgemm_kernel_L1_Mv1_22: + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x1_SUB KERNELv1x1_SUB KERNELv1x1_SUB @@ -807,10 +828,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 .Ldgemm_kernel_L1_Mv1_46: + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNELv1x1_SUB subs counterL, counterL, #1 - bne .Ldgemm_kernel_L1_Mv1_46 + bgt .Ldgemm_kernel_L1_Mv1_46 .Ldgemm_kernel_L1_Mv1_100: prfm PLDL1KEEP, [pA] @@ -823,7 +845,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
incd counterI whilelt p1.d, counterI, origM //SVE instruction - cntp x18, p0, p1.d + cntp lanes, p0, p1.d b.any .Ldgemm_kernel_L1_Mv1_20 diff --git a/kernel/arm64/dtrmm_kernel_sve_v1x8.S b/kernel/arm64/dtrmm_kernel_sve_v1x8.S new file mode 100644 index 000000000..458090411 --- /dev/null +++ b/kernel/arm64/dtrmm_kernel_sve_v1x8.S @@ -0,0 +1,1007 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA x16 +#define alpha x17 +//#define temp x18 +#define tempOffset x19 +#define tempK x20 +#define temp x21 + +#define alpha0 d10 +#define alphaZ z2.d + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x8_I + ld1d z0.d, p1/z, [pA] + ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one + //incb pA, all, mul #2 + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M1 + ld1d z1.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, 
[pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M2 + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + fmla z16.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_E + fmla z16.d, p1/m, z1.d, z8.d + fmla z17.d, p1/m, z1.d, z9.d + fmla z18.d, p1/m, z1.d, z10.d + fmla z19.d, p1/m, z1.d, z11.d + fmla z20.d, p1/m, z1.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + fmla z22.d, p1/m, z1.d, z14.d + fmla z23.d, p1/m, z1.d, z15.d +.endm + +.macro KERNELv1x8_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z19.d, p1/m, z0.d, z11.d + fmla z20.d, p1/m, z0.d, z12.d + fmla z21.d, p1/m, z0.d, z13.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.d, p1/m, z0.d, z14.d + fmla z23.d, p1/m, z0.d, z15.d + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z17.d, p1/m, z17.d, alphaZ + st1d z17.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z18.d, p1/m, z18.d, alphaZ + st1d z18.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z19.d, p1/m, z19.d, alphaZ + st1d z19.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z20.d, p1/m, z20.d, alphaZ + st1d z20.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z21.d, p1/m, z21.d, alphaZ + st1d z21.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z22.d, p1/m, z22.d, alphaZ + st1d z22.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z23.d, p1/m, z23.d, alphaZ + st1d z23.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x4_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + +.endm + +.macro 
SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmla z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmla z17.d, p1/m, z17.d, alphaZ + st1d z17.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmla z18.d, p1/m, z18.d, alphaZ + st1d z18.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmla z19.d, p1/m, z19.d, alphaZ + st1d z19.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z17.d, p1/m, z0.d, z9.d + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmla z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmla z17.d, p1/m, z17.d, alphaZ + st1d z17.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.d, #0 +.endm + +.macro KERNELv1x1_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + fmla z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 + dup alphaZ, alpha + + lsl LDC, LDC, #3 // ldc = ldc * 8 + ptrue p0.d // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldtrmm_kernel_L4_BEGIN + +/******************************************************************************/ + + .align 5 +.Ldtrmm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L8_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L8_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #8 +#endif + + INITv1x8 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Ldtrmm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldtrmm_kernel_L8_Mv1_22a + + .align 5 +.Ldtrmm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L8_Mv1_22 + + .align 5 +.Ldtrmm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldtrmm_kernel_L8_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldtrmm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldtrmm_kernel_L8_Mv1_44 + +.Ldtrmm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldtrmm_kernel_L8_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L8_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldtrmm_kernel_L8_Mv1_46 + +.Ldtrmm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #8 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Ldtrmm_kernel_L8_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L8_Mv1_20 + +.Ldtrmm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Ldtrmm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldtrmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldtrmm_kernel_L2_BEGIN + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is 
there at least 4 to do? + ble .Ldtrmm_kernel_L4_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L4_Mv1_22: + + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L4_Mv1_22 + +.Ldtrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L4_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L4_Mv1_46: + + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldtrmm_kernel_L4_Mv1_46 + +.Ldtrmm_kernel_L4_Mv1_100: + + SAVEv1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Ldtrmm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L4_Mv1_20 + + +.Ldtrmm_kernel_L4_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldtrmm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldtrmm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L2_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + INITv1x2 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Ldtrmm_kernel_L2_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L2_Mv1_22: + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L2_Mv1_22 + +.Ldtrmm_kernel_L2_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L2_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L2_Mv1_46: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldtrmm_kernel_L2_Mv1_46 + +.Ldtrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + +.Ldtrmm_kernel_L2_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L2_Mv1_20 + + +.Ldtrmm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldtrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldtrmm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L1_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + INITv1x1 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
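
The #if ladder repeated before each of these micro-kernels picks how much of K actually contributes to the current triangular tile: the first form trims the part masked off by the triangle, while the LEFT and right-side forms extend the offset by the current M tile (lanes) or by the N unroll width. A hedged C sketch of that selection, with illustrative parameter names:

/* mk = active SVE lanes (M tile), nk = N unroll factor (1/2/4/8 above). */
static long trmm_k_extent(long K, long off, long mk, long nk,
                          int left, int transa)
{
    if ((left && !transa) || (!left && transa))
        return K - off;      /* sub tempK, origK, tempOffset */
    else if (left)
        return off + mk;     /* add tempK, tempOffset, lanes */
    else
        return off + nk;     /* add tempK, tempOffset, #nk   */
}
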
+ ble .Ldtrmm_kernel_L1_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L1_Mv1_22: + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L1_Mv1_22 + +.Ldtrmm_kernel_L1_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L1_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L1_Mv1_46: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L1_Mv1_46 + +.Ldtrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + + +.Ldtrmm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L1_Mv1_20 + + +.Ldtrmm_kernel_L1_END: + +/******************************************************************************/ + +.Ldtrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From 25f99fa9f80641cf50eca38ab56cdc4dd99684cd Mon Sep 17 00:00:00 2001 From: gxw Date: Mon, 1 Nov 2021 20:15:42 +0800 Subject: [PATCH 526/681] Add cblas_{c/z}srot cblas_{c/z}rotg support --- interface/zrot.c | 8 +++++++- interface/zrotg.c | 8 ++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/interface/zrot.c b/interface/zrot.c index 1c45f685b..228c5ee45 100644 --- a/interface/zrot.c +++ b/interface/zrot.c @@ -42,14 +42,20 @@ #include "functable.h" #endif +#ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){ - BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; FLOAT c = *C; FLOAT s = *S; +#else +void CNAME(blasint n, void *VX, blasint incx, void *VY, blasint incy, FLOAT c, FLOAT s) { + FLOAT *x = (FLOAT*) VX; + FLOAT *y = (FLOAT*) VY; +#endif /* CBLAS */ + PRINT_DEBUG_NAME; if (n <= 0) return; diff --git a/interface/zrotg.c b/interface/zrotg.c index bc4f06492..123f4da85 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -4,8 +4,16 @@ #include "functable.h" #endif +#ifndef CBLAS void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ +#else +void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { + FLOAT *DA = (FLOAT*) VDA; + FLOAT *DB = (FLOAT*) VDB; + FLOAT *S = (FLOAT*) VS; +#endif /* CBLAS */ + #if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) long double da_r = *(DA + 0); From 7093372e3283e221e6598bc7ed93abf5f8e8a523 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 1 Nov 2021 22:53:21 +0100 Subject: [PATCH 527/681] add ARMV8SVE target --- Makefile.arm64 | 7 ++ getarch.c | 14 +++ kernel/arm64/KERNEL.ARMV8SVE | 183 ++++++++++++++++++++++++++++++ kernel/arm64/dgemm_tcopy_sve_v1.c | 1 - param.h | 30 +++++ 5 files changed, 234 insertions(+), 1 deletion(-) create mode 100644 kernel/arm64/KERNEL.ARMV8SVE diff --git a/Makefile.arm64 b/Makefile.arm64 
index a07d0892b..801601030 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -20,6 +20,13 @@ FCOMMON_OPT += -march=armv8-a endif endif +ifeq ($(CORE), ARMV8SVE) +CCOMMON_OPT += -march=armv8-a+sve +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a+sve +endif +endif + ifeq ($(CORE), CORTEXA53) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 ifneq ($(F_COMPILER), NAG) diff --git a/getarch.c b/getarch.c index 60bfe05ce..7ae7591c5 100644 --- a/getarch.c +++ b/getarch.c @@ -1198,6 +1198,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_ARMV8SVE +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "ARMV8SVE" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DARMV8SVE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "armv8sve" +#define CORENAME "ARMV8SVE" +#endif + #ifdef FORCE_ARMV8 #define FORCE diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE new file mode 100644 index 000000000..572c96fac --- /dev/null +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -0,0 +1,183 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +ifneq 
($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_sve_v1x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S + +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/dgemm_tcopy_sve_v1.c b/kernel/arm64/dgemm_tcopy_sve_v1.c index 21bfdf3db..33e69bf0c 100644 --- a/kernel/arm64/dgemm_tcopy_sve_v1.c +++ b/kernel/arm64/dgemm_tcopy_sve_v1.c @@ -46,7 +46,6 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG j; IFLOAT *aoffset, *aoffset1, *boffset; - svint64_t lda_vec = svindex_s64(0LL, lda); uint64_t sve_size = svcntd(); aoffset = a; diff --git a/param.h b/param.h index 23f406d74..8c2061931 100644 --- a/param.h +++ b/param.h @@ 
-3294,6 +3294,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(ARMV8SVE) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 8 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #else /* Other/undetected ARMv8 cores */ #define SGEMM_DEFAULT_UNROLL_M 16 @@ -3325,6 +3354,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #endif /* Cores */ + #endif /* ARMv8 */ #if defined(ARMV5) From f119e2635487bc9f7cbd113042084e39d311f68e Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Wed, 3 Nov 2021 12:45:09 +0100 Subject: [PATCH 528/681] Fix flipped indices in benchmark for gemv Fixes #3439 --- benchmark/gemv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/gemv.c b/benchmark/gemv.c index a0001277a..fc39f3f3d 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -125,7 +125,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ - a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } @@ -162,7 +162,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ - a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } From efb16fafb0148c6e2c1dbe2b482ef394e262281c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 4 Nov 2021 12:11:16 +0100 Subject: [PATCH 529/681] Fix miscounting of threadpool size on Linux with OMP_PROC_BIND=TRUE (#3437) * return OMP places (if available, or SC_NPROCESSORS_CONF) for maximum thread count when built with OpenMP --- driver/others/memory.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 0185fa683..8e601099c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -246,6 +246,14 @@ int get_num_procs(void) { #endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + +#if defined(USE_OPENMP) +#if _OPENMP >= 201511 + nums = omp_get_num_places(); +#endif + return nums; +#endif + #if !defined(OS_LINUX) return nums; #endif @@ -1806,10 +1814,19 @@ int get_num_procs(void) { #endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + +#if defined(USE_OPENMP) +/* if (omp_get_proc_bind() != omp_proc_bind_false) */ +#if _OPENMP >= 201511 + nums = omp_get_num_places(); +#endif + return nums; +#endif + #if !defined(OS_LINUX) return nums; #endif - + #if !defined(__GLIBC_PREREQ) return nums; #else From faae86fba2c13ce617cc1a3f0351221643b4b438 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 4 Nov 2021 20:35:41 +0100 Subject: [PATCH 530/681] Add CPUIDs for 
Alder Lake and some other recent Intel cpus --- cpuid_x86.c | 180 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 134 insertions(+), 46 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 874acea80..72e95214e 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1,3 +1,4 @@ +//{ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ @@ -1455,7 +1456,6 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 9: case 8: switch (model) { case 12: // Tiger Lake @@ -1475,30 +1475,70 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; - } - case 10: //family 6 exmodel 10 + case 15: // Sapphire Rapids + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; + case 9: switch (model) { - case 5: // Comet Lake H and S - case 6: // Comet Lake U + case 7: // Alder Lake desktop + case 10: // Alder Lake mobile if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) - return CPUTYPE_SANDYBRIDGE; + return CPUTYPE_SANDYBRIDGE; else - return CPUTYPE_NEHALEM; - case 7: // Rocket Lake - if(support_avx512()) + return CPUTYPE_NEHALEM; + case 13: // Ice Lake NNPI + if(support_avx512()) return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + case 14: // Kaby Lake and refreshes if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) return CPUTYPE_SANDYBRIDGE; - else - return CPUTYPE_NEHALEM; - } - break; - } + else + return CPUTYPE_NEHALEM; + } break; + case 10: //family 6 exmodel 10 + switch (model) { + case 5: // Comet Lake H and S + case 6: // Comet Lake U + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + case 7: // Rocket Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; + } + break; case 0x7: return CPUTYPE_ITANIUM; case 0xf: @@ -2069,32 +2109,7 @@ int get_coretype(void){ return CORE_NEHALEM; } break; - case 10: - switch (model) { - case 5: // Comet Lake H and S - case 6: // Comet Lake U - if(support_avx()) - #ifndef NO_AVX2 - return CORE_HASWELL; - #else - return CORE_SANDYBRIDGE; - #endif - else - return CORE_NEHALEM; - case 7:// Rocket Lake -#ifndef NO_AVX512 - if(support_avx512()) - return CORE_SKYLAKEX; -#endif -#ifndef NO_AVX2 - if(support_avx2()) - return CORE_HASWELL; -#endif - if(support_avx()) - return CORE_SANDYBRIDGE; - else - return CORE_NEHALEM; - } + case 5: switch (model) { case 6: @@ -2148,6 +2163,7 @@ int get_coretype(void){ return CORE_NEHALEM; } break; + case 6: if (model == 6) #ifndef NO_AVX512 @@ -2162,7 +2178,7 @@ int get_coretype(void){ else return CORE_NEHALEM; #endif - if (model == 10) + if (model == 10 || model == 12) #ifndef NO_AVX512 if(support_avx512_bf16()) return CORE_COOPERLAKE; @@ -2178,10 +2194,11 @@ int get_coretype(void){ return CORE_NEHALEM; #endif break; + case 7: if (model == 10) return CORE_NEHALEM; - if (model == 14) + if (model == 13 || model == 14) // Ice Lake #ifndef NO_AVX512 return CORE_SKYLAKEX; #else @@ -2195,9 +2212,9 @@ int get_coretype(void){ return CORE_NEHALEM; #endif 
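
For reference, the exmodel/model pairs these switch statements test are the two model nibbles of CPUID leaf 1 EAX: Alder Lake desktop is model 0x97, i.e. exmodel 9 and model 7, which is why it lands in the new "case 9"/"case 7" branch. A standalone sketch of the decoding, using GCC's <cpuid.h> rather than the cpuid() helper this file defines:

#include <cpuid.h>   /* GCC/Clang builtin header; illustrative only */

static void decode_fms(unsigned *family, unsigned *exmodel, unsigned *model)
{
    unsigned eax, ebx, ecx, edx;
    __cpuid(1, eax, ebx, ecx, edx);
    *family  = (eax >> 8)  & 0xf;   /* 6 for all CPUs handled here */
    *model   = (eax >> 4)  & 0xf;   /* low model nibble            */
    *exmodel = (eax >> 16) & 0xf;   /* extended model nibble       */
}
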
break; - case 9: + case 8: - if (model == 12 || model == 13) { // Tiger Lake + if (model == 12 || model == 13) { // Tiger Lake if(support_avx512()) return CORE_SKYLAKEX; if(support_avx2()) @@ -2207,7 +2224,50 @@ int get_coretype(void){ else return CORE_NEHALEM; } - if (model == 14) { // Kaby Lake + if (model == 14) { // Kaby Lake mobile + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; + } + if (model == 15) { // Sapphire Rapids + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; + + case 9: + if (model == 7 || model == 10) { // Alder Lake + if(support_avx2()) + return CORE_HASWELL; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } + if (model == 13) { // Ice Lake NNPI + if(support_avx512()) + return CORE_SKYLAKEX; + if(support_avx2()) + return CORE_HASWELL; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } + if (model == 14) { // Kaby Lake desktop if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; @@ -2217,12 +2277,39 @@ int get_coretype(void){ else return CORE_NEHALEM; } - } break; + case 10: + switch (model) { + case 5: // Comet Lake H and S + case 6: // Comet Lake U + if(support_avx()) + #ifndef NO_AVX2 + return CORE_HASWELL; + #else + return CORE_SANDYBRIDGE; + #endif + else + return CORE_NEHALEM; + case 7:// Rocket Lake +#ifndef NO_AVX512 + if(support_avx512()) + return CORE_SKYLAKEX; +#endif +#ifndef NO_AVX2 + if(support_avx2()) + return CORE_HASWELL; +#endif + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } + case 15: if (model <= 0x2) return CORE_NORTHWOOD; else return CORE_PRESCOTT; + } } } @@ -2495,3 +2582,4 @@ void get_sse(void){ if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n"); } +//} \ No newline at end of file From 08f8bb66c0bf13d28eefac27bfdc7bb7e0a3b46f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 4 Nov 2021 20:36:39 +0100 Subject: [PATCH 531/681] Add CPUIDs for Alder Lake and other recent Intel cpus --- driver/others/dynamic.c | 51 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 9fa3d2dc2..653915aab 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -624,7 +624,7 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; } } - if (model == 10) { + if (model == 10 || model == 12){ // Ice Lake SP if(support_avx512_bf16()) return &gotoblas_COOPERLAKE; @@ -639,12 +639,12 @@ static gotoblas_t *get_coretype(void){ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; } - } + } return NULL; case 7: if (model == 10) // Goldmont Plus return &gotoblas_NEHALEM; - if (model == 14) { + if (model == 13 || model == 14) { // Ice Lake if (support_avx512()) return &gotoblas_SKYLAKEX; @@ -661,7 +661,6 @@ static gotoblas_t *get_coretype(void){ } } return NULL; - case 9: case 8: if (model == 12 || model == 13) { // Tiger Lake if (support_avx512()) @@ -689,6 +688,50 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
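
Each new model case in dynamic.c walks the same capability ladder, from the widest ISA the dispatch tables support down to the pre-AVX fallback; only the entry point differs per CPU. Condensed into one C sketch, reusing the support_* probes and gotoblas_* tables this file already defines:

static gotoblas_t *capability_ladder(void)
{
    if (support_avx512_bf16()) return &gotoblas_COOPERLAKE;
    if (support_avx512())      return &gotoblas_SKYLAKEX;
    if (support_avx2())        return &gotoblas_HASWELL;
    if (support_avx())         return &gotoblas_SANDYBRIDGE;
    return &gotoblas_NEHALEM;  /* CPU or OS without AVX support */
}
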
} } + if (model == 15){ // Sapphire Rapids + if(support_avx512_bf16()) + return &gotoblas_COOPERLAKE; + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; + + + case 9: + if (model == 7 || model == 10) { // Alder Lake + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + if (model == 14 ) { // Kaby Lake, Coffee Lake + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + return NULL; case 10: if (model == 5 || model == 6) { if(support_avx2()) From 4f057bffd635f2f1d5d17397c59158360a880787 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 5 Nov 2021 10:43:17 +0100 Subject: [PATCH 532/681] Fix NULL pointer checks in blas_memory_alloc --- driver/others/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 8e601099c..0a0b0eb3d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2921,7 +2921,7 @@ void *blas_memory_alloc(int procpos){ func = &memoryalloc[0]; - while ((func != NULL) && (map_address == (void *) -1)) { + while ((*func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); @@ -3032,7 +3032,7 @@ allocation2: func = &memoryalloc[0]; - while ((func != NULL) && (map_address == (void *) -1)) { + while ((*func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); From ab7917910d05c9d55f7511e440c0b0e4178f4511 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 7 Nov 2021 20:37:51 +0100 Subject: [PATCH 533/681] add v2x8 kernel + fix sve dtrmm --- kernel/arm64/KERNEL.A64FX | 28 +- kernel/arm64/dgemm_kernel_sve_v2x8.S | 1665 ++++++++++++++++++++++++++ kernel/arm64/dtrmm_kernel_sve_v1x8.S | 14 +- param.h | 4 +- 4 files changed, 1682 insertions(+), 29 deletions(-) create mode 100644 kernel/arm64/dgemm_kernel_sve_v2x8.S diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index c8a53c86b..4c2921e03 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -143,34 +143,22 @@ endif SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = 
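
The blas_memory_alloc fix above matters because memoryalloc[] is a NULL-terminated table of allocator function pointers: testing func (the iterator) instead of *func (the table entry) never becomes false and can call through the NULL terminator. A reduced sketch of the corrected walk, with hypothetical names standing in for the memory.c ones:

#include <stddef.h>

typedef void *(*alloc_fn)(void *);

static void *first_successful(alloc_fn *table, void *base)
{
    void *addr = (void *)-1;                 /* -1 = not yet mapped  */
    for (alloc_fn *func = table; *func != NULL && addr == (void *)-1; func++)
        addr = (*func)(base);                /* try the next backend */
    return addr;
}
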
dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + + CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) diff --git a/kernel/arm64/dgemm_kernel_sve_v2x8.S b/kernel/arm64/dgemm_kernel_sve_v2x8.S new file mode 100644 index 000000000..59e41559f --- /dev/null +++ b/kernel/arm64/dgemm_kernel_sve_v2x8.S @@ -0,0 +1,1665 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA1 x16 +#define pA2 x17 +#define alpha x18 +#define vec_len x19 +#define vec_lenx2 x20 + +#define alpha0 d10 +#define alphaZ z7.d + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA1 +// 17 pA1 +// 18 must save alpha +// 19 must save vec_len +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA10_0 +//v01 pA10_1 +//v02 +//v03 +//v04 +//v05 +//v06 +//v07 ALPHA0 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv2x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 + dup z24.d, #0 + dup z25.d, #0 + dup z26.d, #0 + dup z27.d, #0 + dup z28.d, #0 + dup z29.d, #0 + dup z30.d, #0 + dup z31.d, #0 +.endm + +.macro KERNELv2x8_I + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + ld1d z2.d, p0/z, [pA1, vec_len, lsl #3] + ld1d z3.d, p0/z, [pA2, vec_len, lsl #3] + add pA1, pA1, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 *2 + add pA2, pA2, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 *2 + + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z0.d, z10.d + fmla z21.d, p0/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z0.d, z12.d + fmla z25.d, p0/m, z1.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z0.d, z15.d + fmla z31.d, p0/m, z1.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, 
[pB, 56] + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_M1 + ld1d z2.d, p0/z, [pA1] + ld1d z3.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z0.d, z10.d + fmla z21.d, p0/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z0.d, z12.d + fmla z25.d, p0/m, z1.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z0.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + fmla z31.d, p0/m, z1.d, z15.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_M2 + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 2 * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 2 * 8 + + fmla z16.d, p0/m, z2.d, z8.d + fmla z17.d, p0/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z2.d, z9.d + fmla z19.d, p0/m, z3.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z2.d, z10.d + fmla z21.d, p0/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z2.d, z11.d + fmla z23.d, p0/m, z3.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z2.d, z12.d + fmla z25.d, p0/m, z3.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z2.d, z13.d + fmla z27.d, p0/m, z3.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z2.d, z14.d + fmla z29.d, p0/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z2.d, z15.d + fmla z31.d, p0/m, z3.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_E + fmla z16.d, p0/m, z2.d, z8.d + fmla z17.d, p0/m, z3.d, z8.d + fmla z18.d, p0/m, z2.d, z9.d + fmla z19.d, p0/m, z3.d, z9.d + fmla z20.d, p0/m, z2.d, z10.d + fmla z21.d, p0/m, z3.d, z10.d + fmla z22.d, p0/m, z2.d, z11.d + fmla z23.d, p0/m, z3.d, z11.d + fmla z24.d, p0/m, z2.d, z12.d + fmla z25.d, p0/m, z3.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z26.d, p0/m, z2.d, z13.d + fmla z27.d, p0/m, z3.d, z13.d + fmla z28.d, p0/m, z2.d, z14.d + fmla z29.d, p0/m, z3.d, z14.d + fmla z30.d, p0/m, z2.d, z15.d + fmla z31.d, p0/m, z3.d, z15.d +.endm + +.macro KERNELv2x8_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + fmla z20.d, p0/m, z0.d, z10.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z21.d, p0/m, z1.d, z10.d + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + fmla z24.d, p0/m, z0.d, z12.d + prfm 
PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z25.d, p0/m, z1.d, z12.d + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z30.d, p0/m, z0.d, z15.d + fmla z31.d, p0/m, z1.d, z15.d +.endm + +.macro SAVEv2x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z20.d, alphaZ + fmla z13.d, p0/m, z21.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z22.d, alphaZ + fmla z15.d, p0/m, z23.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z8.d, p0/z, [pCRow2] + ld1d z9.d, p0/z, [pCRow2, #1, mul vl] + fmla z8.d, p0/m, z24.d, alphaZ + fmla z9.d, p0/m, z25.d, alphaZ + st1d z8.d, p0, [pCRow2] + st1d z9.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z26.d, alphaZ + fmla z11.d, p0/m, z27.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z28.d, alphaZ + fmla z13.d, p0/m, z29.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z30.d, alphaZ + fmla z15.d, p0/m, z31.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv2x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv2x4_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + fmla z18.d, p0/m, z0.d, z9.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.d, p0/m, z1.d, z9.d + fmla z20.d, p0/m, z0.d, z10.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z21.d, p0/m, z1.d, z10.d + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d +.endm + +.macro SAVEv2x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, 
p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z20.d, alphaZ + fmla z13.d, p0/m, z21.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z22.d, alphaZ + fmla z15.d, p0/m, z23.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv2x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv2x2_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] +.endm + +.macro SAVEv2x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 +.endm + +.macro INITv2x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv2x1_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] +.endm + +.macro SAVEv2x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv1x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x8_I + ld1d z0.d, p1/z, [pA1] + ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one + //incb pA1, all, mul #2 + add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, 
z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M1 + ld1d z1.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M2 + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + fmla z16.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_E + fmla z16.d, p1/m, z1.d, z8.d + fmla z17.d, p1/m, z1.d, z9.d + fmla z18.d, p1/m, z1.d, z10.d + fmla z19.d, p1/m, z1.d, z11.d + fmla z20.d, p1/m, z1.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + fmla z22.d, p1/m, z1.d, z14.d + fmla z23.d, p1/m, z1.d, z15.d +.endm + +.macro KERNELv1x8_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.d, p1/m, z0.d, z11.d + fmla z20.d, p1/m, z0.d, z12.d + fmla z21.d, p1/m, z0.d, z13.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.d, p1/m, z0.d, z14.d + fmla z23.d, p1/m, z0.d, z15.d + + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, 
z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z28.d, p1/z, [pCRow2] + fmla z28.d, p1/m, z20.d, alphaZ + st1d z28.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z29.d, p1/z, [pCRow1] + fmla z29.d, p1/m, z21.d, alphaZ + st1d z29.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z30.d, p1/z, [pCRow2] + fmla z30.d, p1/m, z22.d, alphaZ + st1d z30.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z31.d, p1/z, [pCRow1] + fmla z31.d, p1/m, z23.d, alphaZ + st1d z31.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x4_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z17.d, p1/m, z0.d, z9.d + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.d, #0 +.endm + +.macro KERNELv1x1_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + + +/******************************************************************************* +* End of macro definitions 
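
In ACLE C intrinsics, the predicated load/multiply-accumulate/store that every SAVE macro above hand-schedules is a three-instruction pattern per output column; a hedged sketch of one SAVEv1x8 column (a sketch only, not the kernel itself):

#include <arm_sve.h>
#include <stdint.h>

/* C[:] += alpha * acc[:] on the "lanes" doubles selected by whilelt. */
static void save_column(double *c, svfloat64_t acc, double alpha,
                        int64_t i, int64_t m)
{
    svbool_t p1 = svwhilelt_b64(i, m);        /* whilelt p1.d, ...   */
    svfloat64_t cv = svld1_f64(p1, c);        /* ld1d z24.d, p1/z    */
    cv = svmla_n_f64_m(p1, cv, acc, alpha);   /* fmla ..., alphaZ    */
    svst1_f64(p1, c, cv);                     /* st1d z24.d, p1      */
}
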
+*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 + dup alphaZ, alpha + cntd vec_len + lsl vec_lenx2, vec_len, #1 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + ptrue p0.d // create true predicate + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldgemm_kernel_L4_BEGIN + +/******************************************************************************/ + + .align 5 +.Ldgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L8_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L8_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + prfm PLDL1KEEP, [pA2] + + .align 5 +.Ldgemm_kernel_L8_Mv2_20: + + mov pB, origPB + INITv2x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Ldgemm_kernel_L8_Mv2_32 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv2_22a + + .align 5 +.Ldgemm_kernel_L8_Mv2_22: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv2_22 + + .align 5 +.Ldgemm_kernel_L8_Mv2_22a: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + b .Ldgemm_kernel_L8_Mv2_44 + + .align 5 +.Ldgemm_kernel_L8_Mv2_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv2_40 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + + b .Ldgemm_kernel_L8_Mv2_44 + +.Ldgemm_kernel_L8_Mv2_40: + + INITv2x8 + +.Ldgemm_kernel_L8_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv2_100 + + .align 5 +.Ldgemm_kernel_L8_Mv2_46: + + KERNELv2x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv2_46 + +.Ldgemm_kernel_L8_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x8 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L8_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L8_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L8_END + +////////////////////////////////// +.Ldgemm_kernel_L8_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at 
least 4 to do? + blt .Ldgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv1_22a + + .align 5 +.Ldgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv1_22 + + .align 5 +.Ldgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldgemm_kernel_L8_Mv1_44 + + .align 5 +.Ldgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldgemm_kernel_L8_Mv1_44 + +.Ldgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv1_100 + + .align 5 +.Ldgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv1_46 + +.Ldgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Ldgemm_kernel_L8_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Ldgemm_kernel_L8_Mv1_20 + +.Ldgemm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt .Ldgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L4_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L4_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + + .align 5 +.Ldgemm_kernel_L4_Mv2_20: + + mov pB, origPB + INITv2x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
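
The prfm PLDL1KEEP instructions threaded through these loops keep the packed A panels about A_PRE_SIZE (2560) bytes and B about B_PRE_SIZE (512) bytes ahead of the running loads. The portable equivalent of that hint, as a sketch using the GCC/Clang builtin:

/* Read prefetch (rw = 0) with high temporal locality (locality = 3),
   offsets as defined at the top of this file. */
#define A_PRE_SIZE 2560
#define B_PRE_SIZE  512

static inline void prefetch_ahead(const double *pA1, const double *pA2,
                                  const double *pB)
{
    __builtin_prefetch((const char *)pA1 + A_PRE_SIZE, 0, 3);
    __builtin_prefetch((const char *)pA2 + A_PRE_SIZE, 0, 3);
    __builtin_prefetch((const char *)pB  + B_PRE_SIZE, 0, 3);
}
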
+ ble .Ldgemm_kernel_L4_Mv2_44 + + .align 5 +.Ldgemm_kernel_L4_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv2_22 + +.Ldgemm_kernel_L4_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv2_100 + + .align 5 +.Ldgemm_kernel_L4_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv2_46 + +.Ldgemm_kernel_L4_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x4 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L4_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L4_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L4_END + +////////////////////////////////// +.Ldgemm_kernel_L4_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L4_Mv1_44 + + .align 5 +.Ldgemm_kernel_L4_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv1_22 + +.Ldgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv1_100 + + .align 5 +.Ldgemm_kernel_L4_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv1_46 + +.Ldgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Ldgemm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L4_Mv1_20 + + +.Ldgemm_kernel_L4_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L2_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L2_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + + .align 5 +.Ldgemm_kernel_L2_Mv2_20: + + mov pB, origPB + INITv2x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
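+	// M-remainder pattern (same in the L8/L4/L2/L1 paths): once fewer than
+	// 2*vec_len rows are left, counterI is rewound to the number of rows
+	// already processed and `whilelt p1.d, counterI, origM` builds a partial
+	// predicate, so the Mv1 kernels handle the ragged tail of M without a
+	// scalar cleanup loop.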
+ ble .Ldgemm_kernel_L2_Mv2_44 + + .align 5 +.Ldgemm_kernel_L2_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv2_22 + +.Ldgemm_kernel_L2_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv2_100 + + .align 5 +.Ldgemm_kernel_L2_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv2_46 + +.Ldgemm_kernel_L2_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x2 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L2_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L2_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L2_END + + +////////////////////////////////// +.Ldgemm_kernel_L2_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L2_Mv1_44 + + .align 5 +.Ldgemm_kernel_L2_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv1_22 + +.Ldgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv1_100 + + .align 5 +.Ldgemm_kernel_L2_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv1_46 + +.Ldgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Ldgemm_kernel_L2_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L2_Mv1_20 + + +.Ldgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ +/******************************************************************************/ + + .align 5 +.Ldgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L1_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L1_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + + + .align 5 +.Ldgemm_kernel_L1_Mv2_20: + + mov pB, origPB + INITv2x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
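+	// Pointer bookkeeping: A is packed in panels of vec_len rows by origK
+	// columns, so pA2 = pA1 + vec_len*origK*8 bytes is always the second
+	// panel consumed by the v2 kernels; B advances by origK*n*8 bytes per
+	// column block (lsl #6, #5, #4 for n = 8, 4, 2 at the *_END labels).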
+ ble .Ldgemm_kernel_L1_Mv2_44 + + .align 5 +.Ldgemm_kernel_L1_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv2_22 + +.Ldgemm_kernel_L1_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv2_100 + + .align 5 +.Ldgemm_kernel_L1_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv2_46 + +.Ldgemm_kernel_L1_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x1 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L1_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L1_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L1_END + + +////////////////////////////////// +.Ldgemm_kernel_L1_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? + ble .Ldgemm_kernel_L1_Mv1_44 + + .align 5 +.Ldgemm_kernel_L1_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_22 + +.Ldgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv1_100 + + .align 5 +.Ldgemm_kernel_L1_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_46 + +.Ldgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Ldgemm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L1_Mv1_20 + + +.Ldgemm_kernel_L1_END: + +/******************************************************************************/ + +.Ldgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dtrmm_kernel_sve_v1x8.S b/kernel/arm64/dtrmm_kernel_sve_v1x8.S index 458090411..1d4df08fb 100644 --- a/kernel/arm64/dtrmm_kernel_sve_v1x8.S +++ b/kernel/arm64/dtrmm_kernel_sve_v1x8.S @@ -344,21 +344,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
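The dtrmm hunks below correct the alpha scaling in the save path: `fmla z16.d, p1/m, z16.d, alphaZ` adds alpha times the accumulator back onto itself, so the stored value would be acc * (1 + alpha), while TRMM must overwrite C with alpha * (A*B) outright; the plain predicated `fmul` is what is wanted. A scalar sketch of the corrected store, with illustrative names only (`trmm_save_col`, `acc`, `lanes` are not part of the source):

#include <stddef.h>

/* Scalar model of the SAVE step being fixed: `acc` stands in for one of
 * the accumulator vectors (z16..z19), `lanes` for the active SVE lanes.
 * TRMM overwrites C; it never reads the old contents. */
static void trmm_save_col(double *c, const double *acc, size_t lanes,
                          double alpha)
{
    for (size_t i = 0; i < lanes; i++)
        c[i] = acc[i] * alpha;   /* fmul + st1d; the old fmla would have
                                    stored acc[i] * (1 + alpha) */
}
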
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC - fmla z16.d, p1/m, z16.d, alphaZ + fmul z16.d, p1/m, z16.d, alphaZ st1d z16.d, p1, [pCRow0] prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] add pCRow2, pCRow1, LDC - fmla z17.d, p1/m, z17.d, alphaZ + fmul z17.d, p1/m, z17.d, alphaZ st1d z17.d, p1, [pCRow1] prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] add pCRow1, pCRow2, LDC - fmla z18.d, p1/m, z18.d, alphaZ + fmul z18.d, p1/m, z18.d, alphaZ st1d z18.d, p1, [pCRow2] prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - fmla z19.d, p1/m, z19.d, alphaZ + fmul z19.d, p1/m, z19.d, alphaZ st1d z19.d, p1, [pCRow1] add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 @@ -392,11 +392,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] add pCRow1, pCRow0, LDC - fmla z16.d, p1/m, z16.d, alphaZ + fmul z16.d, p1/m, z16.d, alphaZ st1d z16.d, p1, [pCRow0] prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - fmla z17.d, p1/m, z17.d, alphaZ + fmul z17.d, p1/m, z17.d, alphaZ st1d z17.d, p1, [pCRow1] add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 @@ -426,7 +426,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - fmla z16.d, p1/m, z16.d, alphaZ + fmul z16.d, p1/m, z16.d, alphaZ st1d z16.d, p1, [pCRow0] diff --git a/param.h b/param.h index 8c2061931..ad0cecda7 100644 --- a/param.h +++ b/param.h @@ -3328,8 +3328,8 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 8 #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 From 9cc0098ce2886644e37ccf044e05940485cd3f83 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 10 Nov 2021 22:27:14 +0100 Subject: [PATCH 534/681] Fix potentially wrong HOSTARCH definition in cross-compilation --- Makefile.system | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Makefile.system b/Makefile.system index a97cca70c..16e8fcbd6 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,11 +9,10 @@ ifndef TOPDIR TOPDIR = . endif - # If ARCH is not set, we use the host system's architecture for getarch compile options. -ifndef ARCH +# we need to use the host system's architecture for getarch compile options even especially when cross-compiling HOSTARCH := $(shell uname -m) -else -HOSTARCH = $(ARCH) +ifeq ($(HOSTARCH), amd64) +HOSTARCH=x86_64 endif HAVE_GAS := $(shell as -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null) From 7d996b1c365f43fe37fd2127d95c2a82d76f3e2e Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sat, 13 Nov 2021 18:48:53 +0100 Subject: [PATCH 535/681] dtrmm_utcopy sve function --- kernel/arm64/trmm_utcopy_sve_v1.c | 128 ++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 kernel/arm64/trmm_utcopy_sve_v1.c diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c new file mode 100644 index 000000000..e44e67373 --- /dev/null +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -0,0 +1,128 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MAX(a,b) (((a)>(b))?(a):(b)) + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, j; + BLASLONG X; + + int sve_len = svcntd(); + + FLOAT *ao; + js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX + (posY + j) * lda; + } else { + ao = a + posY + (posX + j) * lda; + } + + i = 0; + /* svbool_t pm = svwhilelt_b64(i, m); */ + /* int m_active = svcntp_b64(svptrue_b64(), pm); */ + do + { + if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X > posY) { + svfloat64_t aj_vec = svld1(pn, ao); + svst1(pn, b, aj_vec); + ao += lda; + b += n_active; + X ++; + i ++; + } else { +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+j*lda+k); + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+j*lda+k); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#endif + ao += n_active * lda; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + //printf("\n"); + + + posY += n_active; + js += n_active; + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); + + return 0; +} From d6194d6a0c437354d13de4ab09a9851ac816fcd0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 13 Nov 2021 23:25:34 +0100 Subject: [PATCH 536/681] get MSA capability from feature flags --- cpuid_mips.c | 36 ++++++++++++++++++++++++++++++++++++ cpuid_mips64.c | 36 
++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/cpuid_mips.c b/cpuid_mips.c index e6e837f73..1946455d8 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -165,6 +165,7 @@ void get_cpuconfig(void){ }else{ printf("#define UNKNOWN\n"); } + if (!get_feature(msa)) printf("#define NO_MSA\n"); } void get_libname(void){ @@ -178,3 +179,38 @@ void get_libname(void){ printf("mips\n"); } } + +int get_feature(char *search) +{ + +#ifdef __linux + FILE *infile; + char buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("Features", buffer, 8)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + if( p == NULL ) return 0; + + t = strtok(p," "); + while( t = strtok(NULL," ")) + { + if (!strcmp(t, search)) { return(1); } + } + +#endif + return(0); +} + diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 674b65908..8d6a1d93d 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -201,6 +201,7 @@ void get_cpuconfig(void){ printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); } + if (!get_feature(msa)) printf("#define NO_MSA\n"); } void get_libname(void){ @@ -218,3 +219,38 @@ void get_libname(void){ printf("mips64\n"); } } + +int get_feature(char *search) +{ + +#ifdef __linux + FILE *infile; + char buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("Features", buffer, 8)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + if( p == NULL ) return 0; + + t = strtok(p," "); + while( t = strtok(NULL," ")) + { + if (!strcmp(t, search)) { return(1); } + } + +#endif + return(0); +} + From a569fa15406439caec9762f49cd4556c35d3a5e5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 13 Nov 2021 23:26:48 +0100 Subject: [PATCH 537/681] MIPS P5600 and 24KC,1004K cpus do not support MSA --- getarch.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/getarch.c b/getarch.c index 60bfe05ce..8187a28fe 100644 --- a/getarch.c +++ b/getarch.c @@ -1013,7 +1013,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DP5600 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" #define LIBNAME "p5600" #define CORENAME "P5600" #else @@ -1027,7 +1027,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DMIPS1004K " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" #define LIBNAME "mips1004K" #define CORENAME "MIPS1004K" #else @@ -1041,7 +1041,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
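For reference, the /proc/cpuinfo probe added to cpuid_mips.c and cpuid_mips64.c above reduces to the sketch below. Two details are worth flagging: the call sites read `get_feature(msa)` with a bare identifier rather than the string literal `"msa"`, which will not compile unless `msa` is declared somewhere else, and the first token after the `Features :` colon is never compared, because the result of the initial strtok() is discarded before the while loop's condition reassigns it. A quoted, first-token-inclusive variant (hypothetical standalone name `has_feature`):

#include <stdio.h>
#include <string.h>

/* Returns 1 when `search` appears in the "Features" line of
 * /proc/cpuinfo, comparing every token including the first. */
static int has_feature(const char *search)
{
    char buffer[2048], *p = NULL, *t;
    FILE *infile = fopen("/proc/cpuinfo", "r");
    if (infile == NULL) return 0;
    while (fgets(buffer, sizeof(buffer), infile)) {
        if (!strncmp("Features", buffer, 8)) {
            p = strchr(buffer, ':');
            break;
        }
    }
    fclose(infile);
    if (p == NULL) return 0;
    for (t = strtok(p + 1, " \n"); t != NULL; t = strtok(NULL, " \n"))
        if (!strcmp(t, search)) return 1;
    return 0;
}

/* usage: if (!has_feature("msa")) printf("#define NO_MSA\n"); */
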
#define ARCHCONFIG "-DMIPS24K " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=32768 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" #define LIBNAME "mips24K" #define CORENAME "MIPS24K" #else From 46947efb83b7302fba1db8740eee49e47fa7932b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 13 Nov 2021 23:32:26 +0100 Subject: [PATCH 538/681] Ignore compiler support for MIPS MSA if the cpu lacks this capability --- Makefile.system | 6 ++++-- param.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Makefile.system b/Makefile.system index 16e8fcbd6..3b55fb104 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1664,8 +1664,10 @@ export HAVE_VFP export HAVE_VFPV3 export HAVE_VFPV4 export HAVE_NEON -export HAVE_MSA -export MSA_FLAGS +ifndef NO_MSA + export HAVE_MSA + export MSA_FLAGS +endif export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE diff --git a/param.h b/param.h index 23f406d74..22f7b83ad 100644 --- a/param.h +++ b/param.h @@ -2884,7 +2884,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL -#ifdef HAVE_MSA +#if defined(HAVE_MSA) && !defined(NO_MSA) #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 From 3c7eed0e53c4ed8bd5169946fbd06854e193a2b2 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 14 Nov 2021 16:00:10 +0100 Subject: [PATCH 539/681] add remaining trmm copy rutines for SVE --- kernel/arm64/trmm_lncopy_sve_v1.c | 127 +++++++++++++++++++++++++++++ kernel/arm64/trmm_ltcopy_sve_v1.c | 128 +++++++++++++++++++++++++++++ kernel/arm64/trmm_uncopy_sve_v1.c | 130 ++++++++++++++++++++++++++++++ kernel/arm64/trmm_utcopy_sve_v1.c | 10 +-- 4 files changed, 389 insertions(+), 6 deletions(-) create mode 100644 kernel/arm64/trmm_lncopy_sve_v1.c create mode 100644 kernel/arm64/trmm_ltcopy_sve_v1.c create mode 100644 kernel/arm64/trmm_uncopy_sve_v1.c diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c new file mode 100644 index 000000000..e454e28d4 --- /dev/null +++ b/kernel/arm64/trmm_lncopy_sve_v1.c @@ -0,0 +1,127 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + //printf("Using trmm_ln.\n"); + + int sve_len = svcntd(); + svint64_t index = svindex_s64(0LL, lda); + + FLOAT *ao; + js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY + posX * lda; + } else { + ao = a + posX + posY * lda; + } + + i = 0; + /* svbool_t pm = svwhilelt_b64(i, m); */ + /* int m_active = svcntp_b64(svptrue_b64(), pm); */ + do + { + if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); + svst1(pn, b, aj_vec); + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X < posY) { + ao += lda; + b += n_active; + X ++; + i ++; + } else { +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+k*lda+j); + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+k*lda+j); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#endif + ao += n_active; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + //printf("\n"); + + + posY += n_active; + js += n_active; + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); + + return 0; +} diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c new file mode 100644 index 000000000..86433f230 --- /dev/null +++ b/kernel/arm64/trmm_ltcopy_sve_v1.c @@ -0,0 +1,128 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + //printf("Using trmm_lt.\n"); + + int sve_len = svcntd(); + + FLOAT *ao; + js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY + posX * lda; + } else { + ao = a + posX + posY * lda; + } + + i = 0; + /* svbool_t pm = svwhilelt_b64(i, m); */ + /* int m_active = svcntp_b64(svptrue_b64(), pm); */ + do + { + if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X < posY) { + svfloat64_t aj_vec = svld1(pn, ao); + svst1(pn, b, aj_vec); + ao += lda; + b += n_active; + X ++; + i ++; + } else { +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k); + } + } +#endif + ao += n_active * lda; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + //printf("\n"); + + + posY += n_active; + js += n_active; + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); + + + return 0; +} diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c new file mode 100644 index 000000000..21f392b62 --- /dev/null +++ b/kernel/arm64/trmm_uncopy_sve_v1.c @@ -0,0 +1,130 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + //printf("Using trmm_un.\n"); + //printf("Using m %ld, n %ld.\n", m, n); + //printf("Using lda %ld.\n", lda); + //printf("Using posX %ld, posY %ld.\n", posX, posY); + + int sve_len = svcntd(); + svint64_t index = svindex_s64(0LL, lda); + + FLOAT *ao; + js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX + posY * lda; + } else { + ao = a + posY + posX * lda; + } + + i = 0; + /* svbool_t pm = svwhilelt_b64(i, m); */ + /* int m_active = svcntp_b64(svptrue_b64(), pm); */ + do + { + if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); + svst1(pn, b, aj_vec); + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X > posY) { + ao += lda; + b += n_active; + X ++; + i ++; + } else { +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j); + } + } +#endif + ao += n_active; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + //printf("\n"); + + + posY += n_active; + js += n_active; + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); + + return 0; +} diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c index e44e67373..38b88dc8c 100644 --- a/kernel/arm64/trmm_utcopy_sve_v1.c +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -43,13 +43,11 @@ #include #endif -#define MIN(a,b) (((a)<(b))?(a):(b)) -#define MAX(a,b) (((a)>(b))?(a):(b)) - int 
CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js, j; + BLASLONG i, js; BLASLONG X; + //printf("Using trmm_ut.\n"); int sve_len = svcntd(); @@ -62,9 +60,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON X = posX; if (posX <= posY) { - ao = a + posX + (posY + j) * lda; + ao = a + posX + posY * lda; } else { - ao = a + posY + (posX + j) * lda; + ao = a + posY + posX * lda; } i = 0; From de2ed6659685576aa7f0652b85070febadba05b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Mon, 15 Nov 2021 08:53:52 +0100 Subject: [PATCH 540/681] cmake: Set SUFFIX64 also for NOFORTRAN --- cmake/fc.cmake | 5 ----- cmake/system.cmake | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index f7aa4c5c9..9feda9be3 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -3,11 +3,6 @@ ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets Fortran related variables. -if (INTERFACE64) - set(SUFFIX64 64) - set(SUFFIX64_UNDERSCORE _64) -endif() - if (${F_COMPILER} STREQUAL "FLANG") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") if (BINARY64 AND INTERFACE64) diff --git a/cmake/system.cmake b/cmake/system.cmake index bcca91c25..410cf01e5 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -239,6 +239,11 @@ include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake") # C Compiler dependent settings include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") +if (INTERFACE64) + set(SUFFIX64 64) + set(SUFFIX64_UNDERSCORE _64) +endif() + if (NOT NOFORTRAN) # Fortran Compiler dependent settings include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") From 302f22693a6dabe0d04d22e2e0e17d80c9242b6a Mon Sep 17 00:00:00 2001 From: Jia-Chen Date: Thu, 18 Nov 2021 21:14:43 +0800 Subject: [PATCH 541/681] MOD: optimize normal DGEMM on ARMV8 cortex-A53 & cortex-A55 --- kernel/arm64/KERNEL.CORTEXA53 | 2 +- kernel/arm64/KERNEL.CORTEXA55 | 2 +- kernel/arm64/dgemm_kernel_4x4_cortexa53.c | 890 ++++++++++++++++++++++ param.h | 2 +- 4 files changed, 893 insertions(+), 3 deletions(-) create mode 100644 kernel/arm64/dgemm_kernel_4x4_cortexa53.c diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index db322dd0d..aebd2e94c 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -141,7 +141,7 @@ SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55 index db322dd0d..aebd2e94c 100644 --- a/kernel/arm64/KERNEL.CORTEXA55 +++ b/kernel/arm64/KERNEL.CORTEXA55 @@ -141,7 +141,7 @@ SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) diff --git a/kernel/arm64/dgemm_kernel_4x4_cortexa53.c b/kernel/arm64/dgemm_kernel_4x4_cortexa53.c new file mode 100644 index 000000000..5a9d284df --- /dev/null 
+++ b/kernel/arm64/dgemm_kernel_4x4_cortexa53.c @@ -0,0 +1,890 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +/********************************************************** + * Function: dgemm_kernel_arm_cortex_a53_4x4_m4n12 + * Operation: C[4][12] += alpha * sa[4][K] * sb[K][12] + * Matrix orders: + * sa: column-major (leading dimension == 4) + * sb: 3 concatenated row-major 4-column submatrices + * C: column-major (leading dimension == LDC) + *********************************************************/ +static inline void dgemm_kernel_arm_cortex_a53_4x4_m4n12( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + /** prefetch 4x12 elements from matrix C for RW purpose */ + __asm__ __volatile__( + "mov x0,%[C]\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]\n\t" + ::[C]"r"(C), [LDC]"r"(LDC):"x0"); + + /** 3 pointers to 3 submatrices of sb respectively */ + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + K * 4; + const FLOAT *b3_ = sb + K * 8; + + /** register mapping of 4x12 
elements of C, row-id ==> coordinate-M, column-id ==> coordinate-N */ + /** v8.d[0] v10.d[0] v12.d[0] v14.d[0] v16.d[0] v18.d[0] v20.d[0] v22.d[0] v24.d[0] v26.d[0] v28.d[0] v30.d[0] */ + /** v8.d[1] v10.d[1] v12.d[1] v14.d[1] v16.d[1] v18.d[1] v20.d[1] v22.d[1] v24.d[1] v26.d[1] v28.d[1] v30.d[1] */ + /** v9.d[0] v11.d[0] v13.d[0] v15.d[0] v17.d[0] v19.d[0] v21.d[0] v23.d[0] v25.d[0] v27.d[0] v29.d[0] v31.d[0] */ + /** v9.d[1] v11.d[1] v13.d[1] v15.d[1] v17.d[1] v19.d[1] v21.d[1] v23.d[1] v25.d[1] v27.d[1] v29.d[1] v31.d[1] */ + + __asm__ __volatile__( + "cmp %[K],#0\n\t" + /** fill registers holding elements of C with 0.0 */ + "movi v8.16b,#0; movi v9.16b,#0; movi v10.16b,#0; movi v11.16b,#0\n\t" + "movi v12.16b,#0; movi v13.16b,#0; movi v14.16b,#0; movi v15.16b,#0\n\t" + "movi v16.16b,#0; movi v17.16b,#0; movi v18.16b,#0; movi v19.16b,#0\n\t" + "movi v20.16b,#0; movi v21.16b,#0; movi v22.16b,#0; movi v23.16b,#0\n\t" + "movi v24.16b,#0; movi v25.16b,#0; movi v26.16b,#0; movi v27.16b,#0\n\t" + "movi v28.16b,#0; movi v29.16b,#0; movi v30.16b,#0; movi v31.16b,#0\n\t" + "beq 4f; cmp %[K],#2\n\t" + /** register v0-v3 for loading A, v4-v7 for loading B, x0 for transporting data */ + "ldp q0,q1,[%[sa]]; ldp q4,q5,[%[b1_]]\n\t" + "ldr d6,[%[b2_]]; ldr x0,[%[b2_],#8]\n\t" + "blt 3f; beq 2f\n\t" + "1:\n\t" + /** main loop with unroll_k = 2, specially designed for cortex-A53 NEON pipeline */ + "ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" + "fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" + "fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t" + "fmla v10.2d,v0.2d,v4.d[1]\n\t" + "ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t" + "fmla v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t" + "fmla v12.2d,v0.2d,v5.d[0]\n\t" + "fmla v13.2d,v1.2d,v5.d[0]\n\t" + "ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t" + "fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" + "fmla v15.2d,v1.2d,v5.d[1]\n\t" + "fmla v16.2d,v0.2d,v6.d[0]\n\t" + "ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" + "fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" + "fmla v18.2d,v0.2d,v6.d[1]\n\t" + "fmla v19.2d,v1.2d,v6.d[1]\n\t" + "ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t" + "fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t" + "fmla v21.2d,v1.2d,v7.d[0]; add %[sa],%[sa],#64\n\t" + "fmla v22.2d,v0.2d,v7.d[1]\n\t" + "ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t" + "fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t" + "fmla v24.2d,v0.2d,v4.d[0]; prfm pldl1keep,[%[b1_],#128]\n\t" + "fmla v25.2d,v1.2d,v4.d[0]\n\t" + "ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t" + "fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t" + "fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t" + "fmla v28.2d,v0.2d,v5.d[0]\n\t" + "ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t" + "fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t" + "fmla v30.2d,v0.2d,v5.d[1]; prfm pldl1keep,[%[b2_],#128]\n\t" + "fmla v31.2d,v1.2d,v5.d[1]\n\t" + "ldr d0,[%[sa]]; fmov v4.d[1],x0\n\t" + "fmla v8.2d,v2.2d,v6.d[0]; ldr x0,[%[sa],#8]\n\t" + "fmla v9.2d,v3.2d,v6.d[0]\n\t" + "fmla v10.2d,v2.2d,v6.d[1]\n\t" + "ldr d5,[%[b2_],#48]; fmov v0.d[1],x0\n\t" + "fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t" + "fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t" + "fmla v13.2d,v3.2d,v7.d[0]\n\t" + "ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t" + "fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t" + "fmla v15.2d,v3.2d,v7.d[1]; prfm pldl1keep,[%[b3_],#128]\n\t" + "fmla v16.2d,v2.2d,v4.d[0]\n\t" + "ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t" + "fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t" + "fmla v18.2d,v2.2d,v4.d[1]; add 
%[b3_],%[b3_],#64\n\t" + "fmla v19.2d,v3.2d,v4.d[1]\n\t" + "ldr d1,[%[sa],#16]; fmov v7.d[1],x0\n\t" + "fmla v20.2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#24]\n\t" + "fmla v21.2d,v3.2d,v5.d[0]\n\t" + "fmla v22.2d,v2.2d,v5.d[1]\n\t" + "ldr d4,[%[b1_]]; fmov v1.d[1],x0\n\t" + "fmla v23.2d,v3.2d,v5.d[1]; ldr x0,[%[b1_],#8]\n\t" + "fmla v24.2d,v2.2d,v6.d[0]\n\t" + "fmla v25.2d,v3.2d,v6.d[0]\n\t" + "ldr d5,[%[b1_],#16]; fmov v4.d[1],x0\n\t" + "fmla v26.2d,v2.2d,v6.d[1]; ldr x0,[%[b1_],#24]\n\t" + "fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t" + "fmla v28.2d,v2.2d,v7.d[0]\n\t" + "ldr d6,[%[b2_]]; fmov v5.d[1],x0\n\t" + "fmla v29.2d,v3.2d,v7.d[0]; ldr x0,[%[b2_],#8]\n\t" + "fmla v30.2d,v2.2d,v7.d[1]; cmp %[K],#2\n\t" + "fmla v31.2d,v3.2d,v7.d[1]\n\t" + "bgt 1b; blt 3f\n\t" + "2:\n\t" + /** tail part with k = 2 */ + "ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" + "fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" + "fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t" + "fmla v10.2d,v0.2d,v4.d[1]\n\t" + "ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t" + "fmla v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t" + "fmla v12.2d,v0.2d,v5.d[0]\n\t" + "fmla v13.2d,v1.2d,v5.d[0]\n\t" + "ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t" + "fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" + "fmla v15.2d,v1.2d,v5.d[1]\n\t" + "fmla v16.2d,v0.2d,v6.d[0]\n\t" + "ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" + "fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" + "fmla v18.2d,v0.2d,v6.d[1]\n\t" + "fmla v19.2d,v1.2d,v6.d[1]\n\t" + "ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t" + "fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t" + "fmla v21.2d,v1.2d,v7.d[0]; add %[sa],%[sa],#64\n\t" + "fmla v22.2d,v0.2d,v7.d[1]\n\t" + "ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t" + "fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t" + "fmla v24.2d,v0.2d,v4.d[0]\n\t" + "fmla v25.2d,v1.2d,v4.d[0]\n\t" + "ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t" + "fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t" + "fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t" + "fmla v28.2d,v0.2d,v5.d[0]\n\t" + "ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t" + "fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t" + "fmla v30.2d,v0.2d,v5.d[1]\n\t" + "fmla v31.2d,v1.2d,v5.d[1]\n\t" + "fmov v4.d[1],x0\n\t" + "fmla v8.2d,v2.2d,v6.d[0]\n\t" + "fmla v9.2d,v3.2d,v6.d[0]\n\t" + "fmla v10.2d,v2.2d,v6.d[1]\n\t" + "ldr d5,[%[b2_],#48]\n\t" + "fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t" + "fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t" + "fmla v13.2d,v3.2d,v7.d[0]\n\t" + "ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t" + "fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t" + "fmla v15.2d,v3.2d,v7.d[1]\n\t" + "fmla v16.2d,v2.2d,v4.d[0]\n\t" + "ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t" + "fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t" + "fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t" + "fmla v19.2d,v3.2d,v4.d[1]\n\t" + "fmov v7.d[1],x0\n\t" + "fmla v20.2d,v2.2d,v5.d[0]\n\t" + "fmla v21.2d,v3.2d,v5.d[0]\n\t" + "fmla v22.2d,v2.2d,v5.d[1]\n\t" + "fmla v23.2d,v3.2d,v5.d[1]\n\t" + "fmla v24.2d,v2.2d,v6.d[0]\n\t" + "fmla v25.2d,v3.2d,v6.d[0]\n\t" + "fmla v26.2d,v2.2d,v6.d[1]\n\t" + "fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t" + "fmla v28.2d,v2.2d,v7.d[0]\n\t" + "fmla v29.2d,v3.2d,v7.d[0]\n\t" + "fmla v30.2d,v2.2d,v7.d[1]\n\t" + "fmla v31.2d,v3.2d,v7.d[1]\n\t" + "b 4f\n\t" + "3:\n\t" + /** tail part with k = 1 */ + "ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" + "fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" + "fmla v9.2d,v1.2d,v4.d[0]; add %[b2_],%[b2_],#32\n\t" + "fmla 
v10.2d,v0.2d,v4.d[1]\n\t" + "fmov v7.d[1],x0\n\t" + "fmla v11.2d,v1.2d,v4.d[1]; add %[sa],%[sa],#32\n\t" + "fmla v12.2d,v0.2d,v5.d[0]; add %[b1_],%[b1_],#32\n\t" + "fmla v13.2d,v1.2d,v5.d[0]; sub %[K],%[K],#1\n\t" + "ldr d4,[%[b3_]]\n\t" + "fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" + "fmla v15.2d,v1.2d,v5.d[1]\n\t" + "fmla v16.2d,v0.2d,v6.d[0]\n\t" + "ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" + "fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" + "fmla v18.2d,v0.2d,v6.d[1]; add %[b3_],%[b3_],#32\n\t" + "fmla v19.2d,v1.2d,v6.d[1]\n\t" + "fmov v5.d[1],x0\n\t" + "fmla v20.2d,v0.2d,v7.d[0]\n\t" + "fmla v21.2d,v1.2d,v7.d[0]\n\t" + "fmla v22.2d,v0.2d,v7.d[1]\n\t" + "fmla v23.2d,v1.2d,v7.d[1]\n\t" + "fmla v24.2d,v0.2d,v4.d[0]\n\t" + "fmla v25.2d,v1.2d,v4.d[0]\n\t" + "fmla v26.2d,v0.2d,v4.d[1]\n\t" + "fmla v27.2d,v1.2d,v4.d[1]\n\t" + "fmla v28.2d,v0.2d,v5.d[0]\n\t" + "fmla v29.2d,v1.2d,v5.d[0]\n\t" + "fmla v30.2d,v0.2d,v5.d[1]\n\t" + "fmla v31.2d,v1.2d,v5.d[1]\n\t" + /** store 4x12 elements to C */ + "4:\n\t" + "ldr d0,%[alpha]; add x0,%[C],%[LDC],LSL #3\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v8.2d,v0.d[0]; fmla v2.2d,v9.2d,v0.d[0]\n\t" + "fmla v3.2d,v10.2d,v0.d[0]; fmla v4.2d,v11.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v12.2d,v0.d[0]; fmla v2.2d,v13.2d,v0.d[0]\n\t" + "fmla v3.2d,v14.2d,v0.d[0]; fmla v4.2d,v15.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v16.2d,v0.d[0]; fmla v2.2d,v17.2d,v0.d[0]\n\t" + "fmla v3.2d,v18.2d,v0.d[0]; fmla v4.2d,v19.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v20.2d,v0.d[0]; fmla v2.2d,v21.2d,v0.d[0]\n\t" + "fmla v3.2d,v22.2d,v0.d[0]; fmla v4.2d,v23.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v24.2d,v0.d[0]; fmla v2.2d,v25.2d,v0.d[0]\n\t" + "fmla v3.2d,v26.2d,v0.d[0]; fmla v4.2d,v27.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v28.2d,v0.d[0]; fmla v2.2d,v29.2d,v0.d[0]\n\t" + "fmla v3.2d,v30.2d,v0.d[0]; fmla v4.2d,v31.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; stp q3,q4,[x0]\n\t" + :[sa]"+r"(sa), [b1_]"+r"(b1_), [b2_]"+r"(b2_), [b3_]"+r"(b3_), [C]"+r"(C), [K]"+r"(K) + :[LDC]"r"(LDC), [alpha]"m"(alpha) + :"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); +} + +/********************************************************** + * Operation: + C[0] += alpha * up[0]; C[1] += alpha * up[1]; + C[2] += alpha * down[0]; C[3] += alpha * down[1]; + *********************************************************/ +static inline void dgemm_store_m4n1(FLOAT *C, float64x2_t up, float64x2_t down, FLOAT alpha) { + float64x2_t t1 = vld1q_f64(C), t2 = vld1q_f64(C + 2); + t1 = vfmaq_n_f64(t1, up, alpha); + t2 = vfmaq_n_f64(t2, down, alpha); + vst1q_f64(C, t1); + vst1q_f64(C + 2, t2); +} + +/********************************************************** + * 
Function: dgemm_kernel_arm64_4x4_m4n8 + * Operation: C[4][8] += alpha * sa[4][K] * sb[K][8] + * Matrix orders: + * sa: column-major (leading dimension == 4) + * sb: 2 concatenated row-major 4-column submatrices + * C: column-major (leading dimension == LDC) + *********************************************************/ +static inline void dgemm_kernel_arm64_4x4_m4n8( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + K * 4; + + /** register naming: c + m_id + n_id, m_id=1~2, n_id=1~8 */ + float64x2_t c11, c12, c13, c14, c15, c16, c17, c18; + float64x2_t c21, c22, c23, c24, c25, c26, c27, c28; + c11 = c12 = c13 = c14 = c15 = c16 = c17 = c18 = vdupq_n_f64(0); + c21 = c22 = c23 = c24 = c25 = c26 = c27 = c28 = vdupq_n_f64(0); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa); + float64x2_t a2 = vld1q_f64(sa + 2); sa += 4; + + float64x2_t b1 = vld1q_f64(b1_); + c11 = vfmaq_laneq_f64(c11, a1, b1, 0); + c21 = vfmaq_laneq_f64(c21, a2, b1, 0); + c12 = vfmaq_laneq_f64(c12, a1, b1, 1); + c22 = vfmaq_laneq_f64(c22, a2, b1, 1); + + float64x2_t b2 = vld1q_f64(b1_ + 2); b1_ += 4; + c13 = vfmaq_laneq_f64(c13, a1, b2, 0); + c23 = vfmaq_laneq_f64(c23, a2, b2, 0); + c14 = vfmaq_laneq_f64(c14, a1, b2, 1); + c24 = vfmaq_laneq_f64(c24, a2, b2, 1); + + float64x2_t b3 = vld1q_f64(b2_); + c15 = vfmaq_laneq_f64(c15, a1, b3, 0); + c25 = vfmaq_laneq_f64(c25, a2, b3, 0); + c16 = vfmaq_laneq_f64(c16, a1, b3, 1); + c26 = vfmaq_laneq_f64(c26, a2, b3, 1); + + float64x2_t b4 = vld1q_f64(b2_ + 2); b2_ += 4; + c17 = vfmaq_laneq_f64(c17, a1, b4, 0); + c27 = vfmaq_laneq_f64(c27, a2, b4, 0); + c18 = vfmaq_laneq_f64(c18, a1, b4, 1); + c28 = vfmaq_laneq_f64(c28, a2, b4, 1); + } + + dgemm_store_m4n1(C, c11, c21, alpha); C += LDC; + dgemm_store_m4n1(C, c12, c22, alpha); C += LDC; + dgemm_store_m4n1(C, c13, c23, alpha); C += LDC; + dgemm_store_m4n1(C, c14, c24, alpha); C += LDC; + dgemm_store_m4n1(C, c15, c25, alpha); C += LDC; + dgemm_store_m4n1(C, c16, c26, alpha); C += LDC; + dgemm_store_m4n1(C, c17, c27, alpha); C += LDC; + dgemm_store_m4n1(C, c18, c28, alpha); +} + +/********************************************************** + * Function: dgemm_kernel_arm64_4x4_m4n4 + * Operation: C[4][4] += alpha * sa[4][K] * sb[K][4] + * Matrix orders: + * sa: column-major (leading dimension == 4) + * sb: row-major (leading dimension == 4) + * C: column-major (leading dimension == LDC) + *********************************************************/ +static inline void dgemm_kernel_arm64_4x4_m4n4( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c11, c21, c12, c22, c13, c23, c14, c24; + c11 = c21 = c12 = c22 = c13 = c23 = c14 = c24 = vdupq_n_f64(0); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa); + float64x2_t a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb); + float64x2_t b2 = vld1q_f64(sb + 2); sb += 4; + c11 = vfmaq_laneq_f64(c11, a1, b1, 0); + c21 = vfmaq_laneq_f64(c21, a2, b1, 0); + c12 = vfmaq_laneq_f64(c12, a1, b1, 1); + c22 = vfmaq_laneq_f64(c22, a2, b1, 1); + c13 = vfmaq_laneq_f64(c13, a1, b2, 0); + c23 = vfmaq_laneq_f64(c23, a2, b2, 0); + c14 = vfmaq_laneq_f64(c14, a1, b2, 1); + c24 = vfmaq_laneq_f64(c24, a2, b2, 1); + } + + dgemm_store_m4n1(C, c11, c21, alpha); C += LDC; + dgemm_store_m4n1(C, c12, c22, alpha); C += LDC; + dgemm_store_m4n1(C, c13, c23, alpha); C += LDC; + dgemm_store_m4n1(C, c14, c24, alpha); +} + +static inline void dgemm_kernel_arm64_4x4_m4n2( + 
const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c11_1, c11_2, c21_1, c21_2, c12_1, c12_2, c22_1, c22_2; + c11_1 = c11_2 = c21_1 = c21_2 = c12_1 = c12_2 = c22_1 = c22_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + float64x2_t a1_1 = vld1q_f64(sa), a2_1 = vld1q_f64(sa + 2), + a1_2 = vld1q_f64(sa + 4), a2_2 = vld1q_f64(sa + 6); sa += 8; + c11_1 = vfmaq_laneq_f64(c11_1, a1_1, b1, 0); + c21_1 = vfmaq_laneq_f64(c21_1, a2_1, b1, 0); + c12_1 = vfmaq_laneq_f64(c12_1, a1_1, b1, 1); + c22_1 = vfmaq_laneq_f64(c22_1, a2_1, b1, 1); + c11_2 = vfmaq_laneq_f64(c11_2, a1_2, b2, 0); + c21_2 = vfmaq_laneq_f64(c21_2, a2_2, b2, 0); + c12_2 = vfmaq_laneq_f64(c12_2, a1_2, b2, 1); + c22_2 = vfmaq_laneq_f64(c22_2, a2_2, b2, 1); + } + c11_1 = vaddq_f64(c11_1, c11_2); + c21_1 = vaddq_f64(c21_1, c21_2); + c12_1 = vaddq_f64(c12_1, c12_2); + c22_1 = vaddq_f64(c22_1, c22_2); + if (K) { + float64x2_t b1 = vld1q_f64(sb); sb += 2; + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + c11_1 = vfmaq_laneq_f64(c11_1, a1, b1, 0); + c21_1 = vfmaq_laneq_f64(c21_1, a2, b1, 0); + c12_1 = vfmaq_laneq_f64(c12_1, a1, b1, 1); + c22_1 = vfmaq_laneq_f64(c22_1, a2, b1, 1); + } + + dgemm_store_m4n1(C, c11_1, c21_1, alpha); C += LDC; + dgemm_store_m4n1(C, c12_1, c22_1, alpha); +} + +static inline void dgemm_kernel_arm64_4x4_m4n1( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c11_1, c11_2, c21_1, c21_2; + c11_1 = c11_2 = c21_1 = c21_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t b1 = vld1q_f64(sb); sb += 2; + c11_1 = vfmaq_laneq_f64(c11_1, vld1q_f64(sa), b1, 0); + c21_1 = vfmaq_laneq_f64(c21_1, vld1q_f64(sa + 2), b1, 0); + c11_2 = vfmaq_laneq_f64(c11_2, vld1q_f64(sa + 4), b1, 1); + c21_2 = vfmaq_laneq_f64(c21_2, vld1q_f64(sa + 6), b1, 1); + sa += 8; + } + c11_1 = vaddq_f64(c11_1, c11_2); + c21_1 = vaddq_f64(c21_1, c21_2); + if (K) { + double b1 = *sb++; + c11_1 = vfmaq_n_f64(c11_1, vld1q_f64(sa), b1); + c21_1 = vfmaq_n_f64(c21_1, vld1q_f64(sa + 2), b1); + sa += 4; + } + + dgemm_store_m4n1(C, c11_1, c21_1, alpha); +} + +static inline void dgemm_kernel_arm64_4x4_m2n12( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c01, c02, c03, c04, c11, c12, c13, c14, c21, c22, c23, c24; + c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 = + c21 = c22 = c23 = c24 = vdupq_n_f64(0); + + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + 4 * K; + const FLOAT *b3_ = b2_ + 4 * K; + + for (; K; K--) { + const float64x2_t a1 = vld1q_f64(sa); sa += 2; + + float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4; + c01 = vfmaq_laneq_f64(c01, a1, b1, 0); + c02 = vfmaq_laneq_f64(c02, a1, b1, 1); + c03 = vfmaq_laneq_f64(c03, a1, b2, 0); + c04 = vfmaq_laneq_f64(c04, a1, b2, 1); + + b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4; + c11 = vfmaq_laneq_f64(c11, a1, b1, 0); + c12 = vfmaq_laneq_f64(c12, a1, b1, 1); + c13 = vfmaq_laneq_f64(c13, a1, b2, 0); + c14 = vfmaq_laneq_f64(c14, a1, b2, 1); + + b1 = vld1q_f64(b3_); b2 = vld1q_f64(b3_ + 2); b3_ += 4; + c21 = vfmaq_laneq_f64(c21, a1, b1, 0); + c22 = vfmaq_laneq_f64(c22, a1, b1, 1); + c23 = vfmaq_laneq_f64(c23, a1, b2, 0); + c24 = vfmaq_laneq_f64(c24, a1, b2, 1); + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, 
alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c21, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c22, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c23, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c24, alpha)); +} + +static inline void dgemm_kernel_arm64_4x4_m2n8( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c01, c02, c03, c04, c11, c12, c13, c14; + c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 = vdupq_n_f64(0); + + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + 4 * K; + + for (; K; K--) { + const float64x2_t a1 = vld1q_f64(sa); sa += 2; + + float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4; + c01 = vfmaq_laneq_f64(c01, a1, b1, 0); + c02 = vfmaq_laneq_f64(c02, a1, b1, 1); + c03 = vfmaq_laneq_f64(c03, a1, b2, 0); + c04 = vfmaq_laneq_f64(c04, a1, b2, 1); + + b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4; + c11 = vfmaq_laneq_f64(c11, a1, b1, 0); + c12 = vfmaq_laneq_f64(c12, a1, b1, 1); + c13 = vfmaq_laneq_f64(c13, a1, b2, 0); + c14 = vfmaq_laneq_f64(c14, a1, b2, 1); + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha)); +} + +static inline void dgemm_kernel_arm64_4x4_m2n4( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1_1, c1_2, c2_1, c2_2, c3_1, c3_2, c4_1, c4_2; + c1_1 = c1_2 = c2_1 = c2_2 = c3_1 = c3_2 = c4_1 = c4_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1_1 = vld1q_f64(sb), b2_1 = vld1q_f64(sb + 2); + float64x2_t b1_2 = vld1q_f64(sb + 4), b2_2 = vld1q_f64(sb + 6); sb += 8; + + c1_1 = vfmaq_laneq_f64(c1_1, a1, b1_1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, a1, b1_1, 1); + c3_1 = vfmaq_laneq_f64(c3_1, a1, b2_1, 0); + c4_1 = vfmaq_laneq_f64(c4_1, a1, b2_1, 1); + + c1_2 = vfmaq_laneq_f64(c1_2, a2, b1_2, 0); + c2_2 = vfmaq_laneq_f64(c2_2, a2, b1_2, 1); + c3_2 = vfmaq_laneq_f64(c3_2, a2, b2_2, 0); + c4_2 = vfmaq_laneq_f64(c4_2, a2, b2_2, 1); + } + c1_1 = vaddq_f64(c1_1, c1_2); + c2_1 = vaddq_f64(c2_1, c2_2); + c3_1 = vaddq_f64(c3_1, c3_2); + c4_1 = vaddq_f64(c4_1, c4_2); + if (K) { + float64x2_t a1 = vld1q_f64(sa); sa += 2; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); + c3_1 = vfmaq_laneq_f64(c3_1, a1, b2, 0); + c4_1 = vfmaq_laneq_f64(c4_1, a1, b2, 1); + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c3_1, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c4_1, alpha)); +} + +static inline void 
dgemm_kernel_arm64_4x4_m2n2( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1_1, c1_2, c2_1, c2_2; + c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + + c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); + c1_2 = vfmaq_laneq_f64(c1_2, a2, b2, 0); + c2_2 = vfmaq_laneq_f64(c2_2, a2, b2, 1); + } + c1_1 = vaddq_f64(c1_1, c1_2); + c2_1 = vaddq_f64(c2_1, c2_2); + if (K) { + float64x2_t a1 = vld1q_f64(sa); sa += 2; + float64x2_t b1 = vld1q_f64(sb); sb += 2; + c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha)); +} + +static inline void dgemm_kernel_arm64_4x4_m2n1( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = vdupq_n_f64(0); + + for (; K > 3; K -= 4) { + float64x2_t b12 = vld1q_f64(sb), b34 = vld1q_f64(sb + 2); sb += 4; + c1 = vfmaq_laneq_f64(c1, vld1q_f64(sa), b12, 0); + c2 = vfmaq_laneq_f64(c2, vld1q_f64(sa + 2), b12, 1); + c3 = vfmaq_laneq_f64(c3, vld1q_f64(sa + 4), b34, 0); + c4 = vfmaq_laneq_f64(c4, vld1q_f64(sa + 6), b34, 1); + sa += 8; + } + c1 = vaddq_f64(c1, c2); + c3 = vaddq_f64(c3, c4); + c1 = vaddq_f64(c1, c3); + for (; K; K--) { + c1 = vfmaq_n_f64(c1, vld1q_f64(sa), *sb++); + sa += 2; + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1, alpha)); +} + +static inline void dgemm_store_m1n2(double *C, float64x2_t vc, + double alpha, BLASLONG LDC) { + double c0 = vgetq_lane_f64(vc, 0); + double c1 = vgetq_lane_f64(vc, 1); + C[0] += c0 * alpha; + C[LDC] += c1 * alpha; +} + +static inline void dgemm_kernel_arm64_4x4_m1n12( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4, c5, c6; + c1 = c2 = c3 = c4 = c5 = c6 = vdupq_n_f64(0); + + const double *b1_ = sb; + const double *b2_ = sb + 4 * K; + const double *b3_ = b2_ + 4 * K; + + for (; K; K--) { + const double a1 = *sa++; + c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1); + c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4; + c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1); + c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4; + c5 = vfmaq_n_f64(c5, vld1q_f64(b3_), a1); + c6 = vfmaq_n_f64(c6, vld1q_f64(b3_ + 2), a1); b3_ += 4; + } + + dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c4, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c5, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c6, alpha, LDC); +} + +static inline void dgemm_kernel_arm64_4x4_m1n8( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = vdupq_n_f64(0); + + const double *b1_ = sb; + const double *b2_ = sb + 4 * K; + + for (; K; K--) { + const double a1 = *sa++; + c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1); + c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4; + c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1); + c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4; + } + + dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c3, 
alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c4, alpha, LDC); +} + +static inline void dgemm_kernel_arm64_4x4_m1n4( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1_1, c1_2, c2_1, c2_2; + c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa); sa += 2; + c1_1 = vfmaq_laneq_f64(c1_1, vld1q_f64(sb), a1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, vld1q_f64(sb + 2), a1, 0); + c1_2 = vfmaq_laneq_f64(c1_2, vld1q_f64(sb + 4), a1, 1); + c2_2 = vfmaq_laneq_f64(c2_2, vld1q_f64(sb + 6), a1, 1); sb += 8; + } + c1_1 = vaddq_f64(c1_1, c1_2); + c2_1 = vaddq_f64(c2_1, c2_2); + if (K) { + double a1 = *sa++; + c1_1 = vfmaq_n_f64(c1_1, vld1q_f64(sb), a1); + c2_1 = vfmaq_n_f64(c2_1, vld1q_f64(sb + 2), a1); + sb += 4; + } + + dgemm_store_m1n2(C, c1_1, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c2_1, alpha, LDC); +} + +static inline void dgemm_kernel_arm64_4x4_m1n2( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = vdupq_n_f64(0); + + for (; K > 3; K -= 4) { + float64x2_t a12 = vld1q_f64(sa), a34 = vld1q_f64(sa + 2); sa += 4; + c1 = vfmaq_laneq_f64(c1, vld1q_f64(sb), a12, 0); + c2 = vfmaq_laneq_f64(c2, vld1q_f64(sb + 2), a12, 1); + c3 = vfmaq_laneq_f64(c3, vld1q_f64(sb + 4), a34, 0); + c4 = vfmaq_laneq_f64(c4, vld1q_f64(sb + 6), a34, 1); sb += 8; + } + c1 = vaddq_f64(c1, c2); + c3 = vaddq_f64(c3, c4); + c1 = vaddq_f64(c1, c3); + for (; K; K--) { + c1 = vfmaq_n_f64(c1, vld1q_f64(sb), *sa++); + sb += 2; + } + + dgemm_store_m1n2(C, c1, alpha, LDC); +} + +static inline void dgemm_kernel_arm64_4x4_m1n1( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = vdupq_n_f64(0); + + for (; K > 7; K -= 8) { + c1 = vfmaq_f64(c1, vld1q_f64(sb), vld1q_f64(sa)); + c2 = vfmaq_f64(c2, vld1q_f64(sb + 2), vld1q_f64(sa + 2)); + c3 = vfmaq_f64(c3, vld1q_f64(sb + 4), vld1q_f64(sa + 4)); + c4 = vfmaq_f64(c4, vld1q_f64(sb + 6), vld1q_f64(sa + 6)); + sa += 8; sb += 8; + } + c1 = vaddq_f64(c1, c2); + c3 = vaddq_f64(c3, c4); + c1 = vaddq_f64(c1, c3); + double cs1 = vpaddd_f64(c1); + for (; K; K--) { + cs1 += (*sa++) * (*sb++); + } + + C[0] += cs1 * alpha; +} + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, + FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { + + for (; N >= 12; N -= 12) { + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm_cortex_a53_4x4_m4n12(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n12(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n12(a_, sb, c_, K, LDC, alpha); + } + sb += 12 * K; + C += 12 * LDC; + } + + if (N >= 8) { + N -= 8; + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm64_4x4_m4n8(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n8(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n8(a_, sb, c_, K, LDC, alpha); + } + sb += 8 * K; + C += 8 * LDC; + } else if (N >= 4) { + N -= 4; + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm64_4x4_m4n4(a_, sb, c_, K, LDC, 
alpha);
+      c_ += 4;
+      a_ += 4 * K;
+    }
+    if (m_left >= 2) {
+      m_left -= 2;
+      dgemm_kernel_arm64_4x4_m2n4(a_, sb, c_, K, LDC, alpha);
+      c_ += 2;
+      a_ += 2 * K;
+    }
+    if (m_left) {
+      dgemm_kernel_arm64_4x4_m1n4(a_, sb, c_, K, LDC, alpha);
+    }
+    sb += 4 * K;
+    C += 4 * LDC;
+  }
+
+  if (N >= 2) {
+    N -= 2;
+    BLASLONG m_left = M;
+    const FLOAT *a_ = sa;
+    FLOAT *c_ = C;
+    for (; m_left >= 4; m_left -= 4) {
+      dgemm_kernel_arm64_4x4_m4n2(a_, sb, c_, K, LDC, alpha);
+      c_ += 4;
+      a_ += 4 * K;
+    }
+    if (m_left >= 2) {
+      m_left -= 2;
+      dgemm_kernel_arm64_4x4_m2n2(a_, sb, c_, K, LDC, alpha);
+      c_ += 2;
+      a_ += 2 * K;
+    }
+    if (m_left) {
+      dgemm_kernel_arm64_4x4_m1n2(a_, sb, c_, K, LDC, alpha);
+    }
+    sb += 2 * K;
+    C += 2 * LDC;
+  }
+
+  if (N) {
+    BLASLONG m_left = M;
+    const FLOAT *a_ = sa;
+    FLOAT *c_ = C;
+    for (; m_left >= 4; m_left -= 4) {
+      dgemm_kernel_arm64_4x4_m4n1(a_, sb, c_, K, LDC, alpha);
+      c_ += 4;
+      a_ += 4 * K;
+    }
+    if (m_left >= 2) {
+      m_left -= 2;
+      dgemm_kernel_arm64_4x4_m2n1(a_, sb, c_, K, LDC, alpha);
+      c_ += 2;
+      a_ += 2 * K;
+    }
+    if (m_left) {
+      dgemm_kernel_arm64_4x4_m1n1(a_, sb, c_, K, LDC, alpha);
+    }
+  }
+  return 0;
+}
+
diff --git a/param.h b/param.h
index 22f7b83ad..ab85a5aac 100644
--- a/param.h
+++ b/param.h
@@ -3154,7 +3154,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define SGEMM_DEFAULT_UNROLL_M 8
 #define SGEMM_DEFAULT_UNROLL_N 8
 
-#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_M 4
 #define DGEMM_DEFAULT_UNROLL_N 4
 
 #define CGEMM_DEFAULT_UNROLL_M 8
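For reference, the operation that dgemm_kernel_arm64_4x4_m4n8 and its narrower m2*/m1* companions implement can be written as a plain C loop nest. The following is an illustrative sketch of the packed layouts described in the kernels' header comments (column-major A panel with leading dimension 4; B packed as two concatenated row-major Kx4 panels); dgemm_ref_m4n8 is a hypothetical name, not part of the patch:

    // Reference sketch (not part of the patch): C[0:4][0:8] += alpha * A_panel * B_panel
    static void dgemm_ref_m4n8(const double *sa, const double *sb, double *C,
                               long K, long LDC, double alpha) {
        const double *b1 = sb;          // packed B, columns 0..3 (row-major K x 4)
        const double *b2 = sb + 4 * K;  // packed B, columns 4..7, stored right after b1
        for (long k = 0; k < K; k++)
            for (long m = 0; m < 4; m++)
                for (long n = 0; n < 4; n++) {
                    C[n * LDC + m]       += alpha * sa[k * 4 + m] * b1[k * 4 + n];
                    C[(n + 4) * LDC + m] += alpha * sa[k * 4 + m] * b2[k * 4 + n];
                }
    }

The NEON kernels compute the same sums with vfmaq_laneq_f64, accumulating in registers over K and applying alpha once at the store, which is numerically equivalent.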
From feeb8283a593e780234c025edb1134e73696400d Mon Sep 17 00:00:00 2001
From: Caroline Newcombe
Date: Fri, 19 Nov 2021 14:29:32 -0600
Subject: [PATCH 542/681] Fix unsafe read during final iteration of zsymv_L_sse2.S

---
 kernel/x86_64/zsymv_L_sse2.S | 16 ++++++-
 test_zhemv.c                 | 85 ++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+), 2 deletions(-)
 create mode 100644 test_zhemv.c

diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S
index cba167f4d..bfe0cf7ee 100644
--- a/kernel/x86_64/zsymv_L_sse2.S
+++ b/kernel/x86_64/zsymv_L_sse2.S
@@ -451,7 +451,6 @@
 #endif
 
 	MOVDDUP(4 * SIZE, A1, a1)
-	MOVDDUP(6 * SIZE, A2, a2)
 
 	movsd	 0 * SIZE(YY), yy1
 	movhpd	 1 * SIZE(YY), yy1
@@ -471,7 +470,9 @@
 	subq	IS, I
 	subq	$2, I
 	sarq	$2, I
-	jle	.L15
+	jle	.L14
+
+	MOVDDUP(6 * SIZE - (4 * SIZE), A2, a2)
 	ALIGN_3
 
.L12:
@@ -632,6 +633,16 @@
 	jg	.L12
 	ALIGN_3
 
+.L14:
+	movq	M, I
+	subq	IS, I
+	subq	$2, I
+	testq	$2, I
+	jle	.L16
+
+	MOVDDUP(6 * SIZE - (4 * SIZE), A2, a2)
+	jmp	.L15_pastcheck
+
.L15:
 	movq	M, I
 	subq	IS, I
@@ -639,6 +650,7 @@
 	testq	$2, I
 	jle	.L16
 
+.L15_pastcheck:
 	movapd	xtemp1, xt1
 	mulpd	a1,     xt1
 	mulpd	atemp1, a1
diff --git a/test_zhemv.c b/test_zhemv.c
new file mode 100644
index 000000000..6b3df5f7c
--- /dev/null
+++ b/test_zhemv.c
@@ -0,0 +1,85 @@
+// reproduce segfault in zhemv() from zsymv_L_sse2.S
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <complex.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#define CALL_ZHEMV zhemv_
+
+void zhemv_(char *UPLO, int *N, double *alpha, double *A, int *LDA,
+        double *X, int *INCX, double *beta, double *Y, int *INCY);
+
+int main () {
+
+    // zhemv parameters
+    char uplo = 'L';
+    int n = 14;
+    int lda = 16;
+    int incx = 1;
+    int incy = 1;
+    double *A, *X, *Y;
+    double alpha[] = {1, 0};
+    double beta[] = {0, 0};
+
+    // other parameters
+    int i, j;
+    double *data, *data_end, *no_access;
+    double real, imag;
+    int size;
+    size_t len;
+    int A_offset;
+
+    size = sizeof(complex double);
+    len = lda * lda * size;
+
+    // allocate memory for data
+    // use mmap address hints to set up inaccessible memory section following data
+    no_access = mmap(NULL, len, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+    data = mmap(no_access, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+    data_end = data + (lda * lda * 2);
+    printf("data start/end: %p/%p. Blocked region starts at %p.\n", data, data_end, no_access);
+
+    // set up pointer offsets into data
+    A_offset = (lda + 1) * 2;
+    A = data + A_offset * 2;                // A starts in the third column of data matrix
+    X = data + A_offset + 2;                // X is the second column of data matrix
+    Y = (double *)malloc(n * incy * size);  // Y is stored elsewhere
+    printf("Address of data: %p; A: %p; X: %p; Y: %p.\n", data, A, X, Y);
+
+
+    // hermitian matrix
+    srand(lda);
+    for (j=0; j<n; j++) {
+        for (i=j; i<n; i++) {
+            real = (double)rand() / RAND_MAX;
+            imag = (i == j) ? 0 : (double)rand() / RAND_MAX;
+            A[(j*lda + i)*2]     = real;    // lower-triangle entry
+            A[(j*lda + i)*2 + 1] = imag;
+            A[(i*lda + j)*2]     = real;    // conjugate mirror
+            A[(i*lda + j)*2 + 1] = -imag;
+        }
+    }
+
+    // call zhemv; its final iteration used to read past the end of A,
+    // straight into the blocked region
+    CALL_ZHEMV(&uplo, &n, alpha, A, &lda, X, &incx, beta, Y, &incy);
+
+    for (i=0; i<n; i++)
+        printf("Y[%d] = %f %+fi\n", i, Y[2*i], Y[2*i+1]);
+
+    free(Y);
+    printf("Done.\n");
+    return 0;
+}
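The test above depends on one trick: data is backed by a mapping that is directly followed by a PROT_NONE region, and A is laid out so that its last element ends exactly at data_end. Any read past the end of A then faults immediately instead of silently returning garbage. A standalone sketch of the same guard-page technique (illustrative only; assumes POSIX mmap/mprotect and 4 KiB pages, error checks omitted):

    #include <stdio.h>
    #include <sys/mman.h>

    int main(void) {
        size_t page = 4096;
        // one readable/writable page immediately followed by an inaccessible one
        char *base = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        mprotect(base + page, page, PROT_NONE);

        char *buf = base + page - 64;   // 64 usable bytes, then the wall
        buf[63] = 1;                    // last accessible byte: fine
        // buf[64] = 1;                 // would raise SIGSEGV, as the zhemv overread did
        printf("guarded buffer at %p\n", (void *)buf);
        return 0;
    }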
From: Bine Brank
Date: Sat, 20 Nov 2021 16:35:29 +0100
Subject: [PATCH 543/681] symm SVE copy routines

---
 kernel/arm64/symm_lcopy_sve.c | 96 +++++++++++++++++++++++++++++++++++
 kernel/arm64/symm_ucopy_sve.c | 96 +++++++++++++++++++++++++++++++++++
 2 files changed, 192 insertions(+)
 create mode 100644 kernel/arm64/symm_lcopy_sve.c
 create mode 100644 kernel/arm64/symm_ucopy_sve.c

diff --git a/kernel/arm64/symm_lcopy_sve.c b/kernel/arm64/symm_lcopy_sve.c
new file mode 100644
index 000000000..c3f7ea6b5
--- /dev/null
+++ b/kernel/arm64/symm_lcopy_sve.c
@@ -0,0 +1,96 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdint.h>
+#include "common.h"
+#include <arm_sve.h>
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js, offset;
+
+    FLOAT data01, data02, data03, data04;
+
+    uint64_t sve_size = svcntd();
+    svfloat64_t ao_vec;
+    svint64_t posY_vec = svdup_s64(posY);
+    svint64_t posX_vec = svdup_s64(posX);
+    svint64_t lda_vec = svdup_s64(lda);
+    svint64_t one_vec = svdup_s64(1LL);
+
+    int64_t j = 0;
+    svbool_t pg = svwhilelt_b64(j, n);
+    int64_t active = svcntp_b64(svptrue_b64(), pg);
+    svint64_t index_neg = svindex_s64(0LL, -1LL);
+    svint64_t index = svindex_s64(0LL, 1LL);
+    do {
+        offset = posX - posY;
+        svint64_t vec_off = svdup_s64(offset);
+        svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
+
+        svint64_t temp = svadd_z(pg, posX_vec, index);
+        svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
+        svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda);
+        svint64_t gat_ind = svsel(cmp, temp1, temp2);
+
+        i = m;
+        while (i>0) {
+            svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind);
+
+            gat_ind = svadd_m(cmp, gat_ind, lda_vec);
+            gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec);
+
+            svst1(pg, b, data_vec);
+
+            b += active;
+            offset --;
+            vec_off = svsub_z(pg, vec_off, one_vec);
+            cmp = svcmpgt(pg, vec_off, index_neg);
+
+            i--;
+        }
+
+        posX += sve_size;
+        posX_vec = svdup_s64(posX);
+        j += sve_size;
+        pg = svwhilelt_b64(j, n);
+        active = svcntp_b64(svptrue_b64(), pg);
+    } while (svptest_any(svptrue_b64(), pg));
+
+    return 0;
+}
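In scalar terms, the predicated gather in symm_lcopy_sve.c above computes the following (an illustrative reference, assuming all n columns fit in one SVE vector; symm_lcopy_ref is a made-up name):

    // Scalar view of the lower-triangle copy: while the element lies below
    // the diagonal (X > Y) it is read from the stored lower part; from the
    // diagonal on it is mirrored from the transposed position.
    static void symm_lcopy_ref(long m, long n, const double *a, long lda,
                               long posX, long posY, double *b) {
        for (long i = 0; i < m; i++)          // the "while (i>0)" loop above
            for (long j = 0; j < n; j++) {    // one SVE lane per column j
                long X = posX + j, Y = posY + i;
                *b++ = (X > Y) ? a[Y * lda + X]    // stored lower triangle
                               : a[X * lda + Y];   // mirrored element
            }
    }

symm_ucopy_sve.c below is the same walk with the two cases exchanged (svsel(cmp, temp2, temp1) and swapped index increments), so it reads the stored upper triangle instead.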
diff --git a/kernel/arm64/symm_ucopy_sve.c b/kernel/arm64/symm_ucopy_sve.c
new file mode 100644
index 000000000..3de416cf5
--- /dev/null
+++ b/kernel/arm64/symm_ucopy_sve.c
@@ -0,0 +1,96 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdint.h>
+#include "common.h"
+#include <arm_sve.h>
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js, offset;
+
+    FLOAT data01, data02, data03, data04;
+
+    uint64_t sve_size = svcntd();
+    svfloat64_t ao_vec;
+    svint64_t posY_vec = svdup_s64(posY);
+    svint64_t posX_vec = svdup_s64(posX);
+    svint64_t lda_vec = svdup_s64(lda);
+    svint64_t one_vec = svdup_s64(1LL);
+
+    int64_t j = 0;
+    svbool_t pg = svwhilelt_b64(j, n);
+    int64_t active = svcntp_b64(svptrue_b64(), pg);
+    svint64_t index_neg = svindex_s64(0LL, -1LL);
+    svint64_t index = svindex_s64(0LL, 1LL);
+    do {
+        offset = posX - posY;
+        svint64_t vec_off = svdup_s64(offset);
+        svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
+
+        svint64_t temp = svadd_z(pg, posX_vec, index);
+        svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
+        svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda);
+        svint64_t gat_ind = svsel(cmp, temp2, temp1);
+
+        i = m;
+        while (i>0) {
+            svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind);
+
+            gat_ind = svadd_m(cmp, gat_ind, one_vec);
+            gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
+
+            svst1(pg, b, data_vec);
+
+            b += active;
+            offset --;
+            vec_off = svsub_z(pg, vec_off, one_vec);
+            cmp = svcmpgt(pg, vec_off, index_neg);
+
+            i--;
+        }
+
+        posX += sve_size;
+        posX_vec = svdup_s64(posX);
+        j += sve_size;
+        pg = svwhilelt_b64(j, n);
+        active = svcntp_b64(svptrue_b64(), pg);
+    } while (svptest_any(svptrue_b64(), pg));
+
+    return 0;
+}

From 19ccef5fb15b848d9a3a1374e2571428e75bb8ae Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 20 Nov 2021 17:31:11 +0100
Subject: [PATCH 544/681] Add generic MIPS32 target

---
 param.h | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/param.h b/param.h
index ab85a5aac..6f978062b 100644
--- a/param.h
+++ b/param.h
@@ -2876,7 +2876,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16 #endif -#if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) +#if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) #define SNUMOPT 2 #define DNUMOPT 2 @@ -3154,7 +3154,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 8 @@ -3602,6 +3602,20 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define XGEMM_DEFAULT_UNROLL_M 1 #endif +#ifdef ARCH_MIPS +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 +#else #define SGEMM_DEFAULT_P sgemm_p #define DGEMM_DEFAULT_P dgemm_p #define QGEMM_DEFAULT_P qgemm_p @@ -3622,6 +3636,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 128 #define XGEMM_DEFAULT_Q 128 +#endif #define SYMV_P 16 From b7df5001063f865509f198944c1809cbe8ae70ff Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 20 Nov 2021 17:31:51 +0100 Subject: [PATCH 545/681] Add generic mips32 target --- kernel/mips/KERNEL.generic | 160 +++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 kernel/mips/KERNEL.generic diff --git a/kernel/mips/KERNEL.generic b/kernel/mips/KERNEL.generic new file mode 100644 index 000000000..17f2ef976 --- /dev/null +++ b/kernel/mips/KERNEL.generic @@ -0,0 +1,160 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = 
../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Pure C for other kernels +SAMAXKERNEL = ../mips/amax.c +DAMAXKERNEL = ../mips/amax.c +CAMAXKERNEL = ../mips/zamax.c +ZAMAXKERNEL = ../mips/zamax.c + +SAMINKERNEL = ../mips/amin.c +DAMINKERNEL = ../mips/amin.c +CAMINKERNEL = ../mips/zamin.c +ZAMINKERNEL = ../mips/zamin.c + +SMAXKERNEL = ../mips/max.c +DMAXKERNEL = ../mips/max.c + +SMINKERNEL = ../mips/min.c +DMINKERNEL = ../mips/min.c + +ISAMAXKERNEL = ../mips/iamax.c +IDAMAXKERNEL = ../mips/iamax.c +ICAMAXKERNEL = ../mips/izamax.c +IZAMAXKERNEL = ../mips/izamax.c + +ISAMINKERNEL = ../mips/iamin.c +IDAMINKERNEL = ../mips/iamin.c +ICAMINKERNEL = ../mips/izamin.c +IZAMINKERNEL = ../mips/izamin.c + +ISMAXKERNEL = ../mips/imax.c +IDMAXKERNEL = ../mips/imax.c + +ISMINKERNEL = ../mips/imin.c +IDMINKERNEL = ../mips/imin.c + +SASUMKERNEL = ../mips/asum.c +DASUMKERNEL = ../mips/asum.c +CASUMKERNEL = ../mips/zasum.c +ZASUMKERNEL = ../mips/zasum.c + +SSUMKERNEL = ../mips/sum.c +DSUMKERNEL = ../mips/sum.c +CSUMKERNEL = ../mips/zsum.c +ZSUMKERNEL = ../mips/zsum.c + +SAXPYKERNEL = ../mips/axpy.c +DAXPYKERNEL = ../mips/axpy.c +CAXPYKERNEL = ../mips/zaxpy.c +ZAXPYKERNEL = ../mips/zaxpy.c + +SCOPYKERNEL = ../mips/copy.c +DCOPYKERNEL = ../mips/copy.c +CCOPYKERNEL = ../mips/zcopy.c +ZCOPYKERNEL = ../mips/zcopy.c + +SDOTKERNEL = ../mips/dot.c +DDOTKERNEL = ../mips/dot.c +CDOTKERNEL = ../mips/zdot.c +ZDOTKERNEL = ../mips/zdot.c + +SNRM2KERNEL = ../mips/nrm2.c +DNRM2KERNEL = ../mips/nrm2.c +CNRM2KERNEL = ../mips/znrm2.c +ZNRM2KERNEL = ../mips/znrm2.c + +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c + +SSCALKERNEL = ../mips/scal.c +DSCALKERNEL = ../mips/scal.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c + +SSWAPKERNEL = ../mips/swap.c +DSWAPKERNEL = ../mips/swap.c +CSWAPKERNEL = ../mips/zswap.c +ZSWAPKERNEL = ../mips/zswap.c + +SGEMVNKERNEL = ../mips/gemv_n.c +DGEMVNKERNEL = ../mips/gemv_n.c +CGEMVNKERNEL = ../mips/zgemv_n.c +ZGEMVNKERNEL = ../mips/zgemv_n.c + +SGEMVTKERNEL = ../mips/gemv_t.c +DGEMVTKERNEL = ../mips/gemv_t.c +CGEMVTKERNEL = ../mips/zgemv_t.c +ZGEMVTKERNEL = ../mips/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c From a3cd36acff47be80ba1cbf60f556df3f84cdf20c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 20 Nov 2021 17:34:28 +0100 Subject: [PATCH 546/681] Add CMAKE support for cross-compiling to MIPS32 --- cmake/arch.cmake | 2 +- cmake/cc.cmake | 5 +++++ cmake/prebuild.cmake | 30 +++++++++++++++++++++++++++++- cmake/system_check.cmake | 6 ++++++ 4 files changed, 41 insertions(+), 2 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 57ee5a4fb..d468eb60b 100644 --- a/cmake/arch.cmake 
+++ b/cmake/arch.cmake @@ -109,7 +109,7 @@ if (${ARCH} STREQUAL "ia64") endif () endif () -if (MIPS64) +if (MIPS32 OR MIPS64) set(NO_BINARY_MODE 1) endif () diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 0ab1d4c1b..fdbb40ef6 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -15,6 +15,11 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS if (NO_BINARY_MODE) + if (MIPS32) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=32") + set(BINARY_DEFINED 1) + endif () + if (MIPS64) if (BINARY64) set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64") diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index d86e10035..259d9c738 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -416,7 +416,7 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) -elseif ("${TCORE}" STREQUAL "VORTEX") + elseif ("${TCORE}" STREQUAL "VORTEX") file(APPEND ${TARGET_CONF_TEMP} "#define ARMV8\n" "#define L1_CODE_SIZE\t32768\n" @@ -439,6 +439,34 @@ elseif ("${TCORE}" STREQUAL "VORTEX") set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "P5600") + file(APPEND ${TARGET_CONF_TEMP} + "#define L2_SIZE 1048576\n" + "#define DTB_SIZE 4096\n" + "#define DTB_DEFAULT_ENTRIES 64\n") + set(SGEMM_UNROLL_M 2) + set(SGEMM_UNROLL_N 2) + set(DGEMM_UNROLL_M 2) + set(DGEMM_UNROLL_N 2) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 2) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 2) + set(SYMV_P 16) + elseif ("${TCORE}" MATCHES "MIPS") + file(APPEND ${TARGET_CONF_TEMP} + "#define L2_SIZE 262144\n" + "#define DTB_SIZE 4096\n" + "#define DTB_DEFAULT_ENTRIES 64\n") + set(SGEMM_UNROLL_M 2) + set(SGEMM_UNROLL_N 2) + set(DGEMM_UNROLL_M 2) + set(DGEMM_UNROLL_N 2) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 2) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 2) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "POWER6") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE 32768\n" diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 72c48db37..f71ec4555 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -73,6 +73,8 @@ elseif (${CMAKE_CROSSCOMPILING}) else () set(X86 1) endif() + elseif (${TARGET} STREQUAL "P5600" OR ${TARGET} MATCHES "MIPS.*") + set(MIPS32 1) elseif (${TARGET} STREQUAL "ARMV7") set(ARM 1) else() @@ -88,6 +90,10 @@ elseif(X86) set(ARCH "x86") elseif(PPC) set(ARCH "power") +elseif(MIPS32) + set(ARCH "mips") +elseif(MIPS64) + set(ARCH "mips64") elseif(ARM) set(ARCH "arm") elseif(ARM64) From 52a3f004a002d3ce163cf8a27cd774c65475aede Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 20 Nov 2021 23:54:48 +0100 Subject: [PATCH 547/681] Fix unintended reversion of recent CortexA53 changes --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index 6f978062b..12e9b3634 100644 --- a/param.h +++ b/param.h @@ -2876,7 +2876,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif -#if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) +#if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) #define SNUMOPT 2 #define DNUMOPT 2 @@ -3154,7 +3154,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 8 From b58d4f31abf55446d4707036df0a0c5c7ef26047 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 21 Nov 2021 14:56:27 +0100 Subject: [PATCH 548/681] some clean-up & commentary --- kernel/arm64/KERNEL.ARMV8SVE | 2 +- kernel/arm64/dgemm_kernel_sve_v1x8.S | 15 +++++---- kernel/arm64/dgemm_kernel_sve_v2x8.S | 38 +++++++++++++++------ kernel/arm64/dgemm_ncopy_sve_v1.c | 50 ++++++++++++++-------------- kernel/arm64/dgemm_tcopy_sve_v1.c | 48 +++++++++++++------------- kernel/arm64/dtrmm_kernel_sve_v1x8.S | 17 +++++----- kernel/arm64/trmm_lncopy_sve_v1.c | 9 ++--- kernel/arm64/trmm_ltcopy_sve_v1.c | 9 ++--- kernel/arm64/trmm_uncopy_sve_v1.c | 12 ++----- kernel/arm64/trmm_utcopy_sve_v1.c | 11 ++---- 10 files changed, 104 insertions(+), 107 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 572c96fac..dbf11fdca 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -143,7 +143,7 @@ endif SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_sve_v1x$(DGEMM_UNROLL_N).S +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S DGEMMINCOPY = dgemm_ncopy_sve_v1.c diff --git a/kernel/arm64/dgemm_kernel_sve_v1x8.S b/kernel/arm64/dgemm_kernel_sve_v1x8.S index 94682aea9..bbbd0fd95 100644 --- a/kernel/arm64/dgemm_kernel_sve_v1x8.S +++ b/kernel/arm64/dgemm_kernel_sve_v1x8.S @@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha0 d10 #define alphaZ z2.d -#define A_PRE_SIZE 2560 +#define A_PRE_SIZE 1536 #define B_PRE_SIZE 512 #define C_PRE_SIZE 128 @@ -134,7 +134,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_I ld1d z0.d, p1/z, [pA] ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one - //incb pA, all, mul #2 add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 ld1rd z8.d, p0/z, [pB] @@ -476,13 +475,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ptrue p0.d // create true predicate mov pB, origPB - +// Loop over N mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 ble .Ldgemm_kernel_L4_BEGIN /******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ .align 5 .Ldgemm_kernel_L8_BEGIN: @@ -494,8 +494,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Ldgemm_kernel_L8_Mv1_BEGIN: +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ mov counterI, #0 - whilelt p1.d, counterI, origM //SVE instruction + whilelt p1.d, counterI, origM cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension .align 5 @@ -607,7 +608,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
bgt .Ldgemm_kernel_L8_BEGIN /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ .align 5 .Ldgemm_kernel_L4_BEGIN: @@ -692,7 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add origPB, origPB, temp // B = B + K * 4 * 8 /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ .align 5 .Ldgemm_kernel_L2_BEGIN: @@ -773,7 +774,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ .align 5 .Ldgemm_kernel_L1_BEGIN: diff --git a/kernel/arm64/dgemm_kernel_sve_v2x8.S b/kernel/arm64/dgemm_kernel_sve_v2x8.S index 59e41559f..023d5ba92 100644 --- a/kernel/arm64/dgemm_kernel_sve_v2x8.S +++ b/kernel/arm64/dgemm_kernel_sve_v2x8.S @@ -25,6 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ +/* This is an SVE dgemm kernel with size 2*SVE_LEN x 8. +However, the data layout is the same as for the kernel 1*SVE_LEN x 8. +This means that we sweep two panels of packed A when iterating in a loop over K. +With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ + #define ASSEMBLER #include "common.h" @@ -57,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha0 d10 #define alphaZ z7.d -#define A_PRE_SIZE 2560 +#define A_PRE_SIZE 1536 #define B_PRE_SIZE 512 #define C_PRE_SIZE 128 @@ -96,8 +101,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v00 ALPHA -> pA10_0 //v01 pA10_1 -//v02 -//v03 +//v02 pA20_0 +//v03 pA20_1 //v04 //v05 //v06 @@ -118,6 +123,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v21 must save C5 //v22 must save C6 //v23 must save C7 +//v24 must save C8 +//v25 must save C9 +//v26 must save C10 +//v27 must save C11 +//v28 must save C12 +//v29 must save C13 +//v30 must save C14 +//v31 must save C15 /******************************************************************************* * Macro definitions @@ -583,7 +596,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_I ld1d z0.d, p1/z, [pA1] ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one - //incb pA1, all, mul #2 add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8 ld1rd z8.d, p0/z, [pB] @@ -928,13 +940,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ptrue p0.d // create true predicate mov pB, origPB - +// Loop over N mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 ble .Ldgemm_kernel_L4_BEGIN /******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ .align 5 .Ldgemm_kernel_L8_BEGIN: @@ -947,11 +960,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.Ldgemm_kernel_L8_Mv2_BEGIN: mov counterI, #0 - cmp origM, vec_lenx2 + cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN blt .Ldgemm_kernel_L8_Mv1_BEGIN mov counterI, origM +/* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */ mul temp, vec_len, origK // generate address of pA2 add pA2, pA1, temp, lsl #3 // pA1 = start of A array prfm PLDL1KEEP, [pA2] @@ -1063,7 +1077,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp counterI, origM beq .Ldgemm_kernel_L8_END -////////////////////////////////// +////////////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x8 kernel. .Ldgemm_kernel_L8_Mv1_BEGIN: whilelt p1.d, counterI, origM //SVE instruction @@ -1178,7 +1193,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bgt .Ldgemm_kernel_L8_BEGIN /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ .align 5 .Ldgemm_kernel_L4_BEGIN: @@ -1270,6 +1285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. beq .Ldgemm_kernel_L4_END ////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x4 kernel. .Ldgemm_kernel_L4_Mv1_BEGIN: whilelt p1.d, counterI, origM //SVE instruction @@ -1338,7 +1354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add origPB, origPB, temp // B = B + K * 4 * 8 /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ .align 5 .Ldgemm_kernel_L2_BEGIN: @@ -1428,6 +1444,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x2 kernel. .Ldgemm_kernel_L2_Mv1_BEGIN: whilelt p1.d, counterI, origM //SVE instruction @@ -1493,7 +1510,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ .align 5 .Ldgemm_kernel_L1_BEGIN: @@ -1581,6 +1598,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x1 kernel. 
.Ldgemm_kernel_L1_Mv1_BEGIN: whilelt p1.d, counterI, origM //SVE instruction diff --git a/kernel/arm64/dgemm_ncopy_sve_v1.c b/kernel/arm64/dgemm_ncopy_sve_v1.c index 342812107..1f812c775 100644 --- a/kernel/arm64/dgemm_ncopy_sve_v1.c +++ b/kernel/arm64/dgemm_ncopy_sve_v1.c @@ -40,40 +40,40 @@ #include "common.h" #include -// TODO: write in assembly with proper unrolling +// TODO: write in assembly with proper unrolling of inner loop int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - BLASLONG j; - IFLOAT *aoffset, *aoffset1, *boffset; + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; - svint64_t lda_vec = svindex_s64(0LL, lda); - uint64_t sve_size = svcntd(); + svint64_t lda_vec = svindex_s64(0LL, lda); + uint64_t sve_size = svcntd(); - aoffset = a; - boffset = b; + aoffset = a; + boffset = b; - j = 0; - svbool_t pg = svwhilelt_b64(j, n); - uint64_t active = svcntp_b64(svptrue_b64(), pg); - do { + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { - aoffset1 = aoffset; + aoffset1 = aoffset; - uint64_t i_cnt = m; - while (i_cnt--) { - svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec); - svst1_f64(pg, (double *) boffset, a_vec); - aoffset1++; - boffset += active; - } - aoffset += sve_size * lda; + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec); + svst1_f64(pg, (double *) boffset, a_vec); + aoffset1++; + boffset += active; + } + aoffset += sve_size * lda; - j += svcntd(); - pg = svwhilelt_b64(j, n); - active = svcntp_b64(svptrue_b64(), pg); + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); - } while (svptest_any(svptrue_b64(), pg)); + } while (svptest_any(svptrue_b64(), pg)); - return 0; + return 0; } diff --git a/kernel/arm64/dgemm_tcopy_sve_v1.c b/kernel/arm64/dgemm_tcopy_sve_v1.c index 33e69bf0c..cb645a1b6 100644 --- a/kernel/arm64/dgemm_tcopy_sve_v1.c +++ b/kernel/arm64/dgemm_tcopy_sve_v1.c @@ -40,38 +40,38 @@ #include "common.h" #include -// TODO: write in assembly with proper unrolling +// TODO: write in assembly with proper unrolling of inner loop int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - BLASLONG j; - IFLOAT *aoffset, *aoffset1, *boffset; + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; - uint64_t sve_size = svcntd(); + uint64_t sve_size = svcntd(); - aoffset = a; - boffset = b; + aoffset = a; + boffset = b; - j = 0; - svbool_t pg = svwhilelt_b64(j, n); - uint64_t active = svcntp_b64(svptrue_b64(), pg); - do { + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { - aoffset1 = aoffset; + aoffset1 = aoffset; - uint64_t i_cnt = m; - while (i_cnt--) { - svfloat64_t a_vec = svld1(pg, (double *)aoffset1); - svst1_f64(pg, (double *) boffset, a_vec); - aoffset1 += lda; - boffset += active; - } - aoffset += sve_size; + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec = svld1(pg, (double *)aoffset1); + svst1_f64(pg, (double *) boffset, a_vec); + aoffset1 += lda; + boffset += active; + } + aoffset += sve_size; - j += svcntd(); - pg = svwhilelt_b64(j, n); - active = svcntp_b64(svptrue_b64(), pg); + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); - } while (svptest_any(svptrue_b64(), pg)); + } while (svptest_any(svptrue_b64(), pg)); - return 0; + return 0; } diff --git a/kernel/arm64/dtrmm_kernel_sve_v1x8.S b/kernel/arm64/dtrmm_kernel_sve_v1x8.S index 
1d4df08fb..1f8c9b20f 100644 --- a/kernel/arm64/dtrmm_kernel_sve_v1x8.S +++ b/kernel/arm64/dtrmm_kernel_sve_v1x8.S @@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha0 d10 #define alphaZ z2.d -#define A_PRE_SIZE 2560 +#define A_PRE_SIZE 1536 #define B_PRE_SIZE 512 #define C_PRE_SIZE 128 @@ -138,7 +138,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_I ld1d z0.d, p1/z, [pA] ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one - //incb pA, all, mul #2 add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 ld1rd z8.d, p0/z, [pB] @@ -469,13 +468,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif mov pB, origPB - +// Loop over N mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 ble .Ldtrmm_kernel_L4_BEGIN /******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ .align 5 .Ldtrmm_kernel_L8_BEGIN: @@ -491,9 +491,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Ldtrmm_kernel_L8_Mv1_BEGIN: +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ mov counterI, #0 - whilelt p1.d, counterI, origM //SVE instruction - cntp lanes, p0, p1.d + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension .align 5 .Ldtrmm_kernel_L8_Mv1_20: @@ -641,7 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bgt .Ldtrmm_kernel_L8_BEGIN /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ .align 5 .Ldtrmm_kernel_L4_BEGIN: @@ -757,7 +758,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ .align 5 .Ldtrmm_kernel_L2_BEGIN: @@ -873,7 +874,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ .align 5 .Ldtrmm_kernel_L1_BEGIN: diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c index e454e28d4..6c38cb3eb 100644 --- a/kernel/arm64/trmm_lncopy_sve_v1.c +++ b/kernel/arm64/trmm_lncopy_sve_v1.c @@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - //printf("Using trmm_ln.\n"); int sve_len = svcntd(); svint64_t index = svindex_s64(0LL, lda); @@ -67,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } i = 0; - /* svbool_t pm = svwhilelt_b64(i, m); */ - /* int m_active = svcntp_b64(svptrue_b64(), pm); */ do { - if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + if (X > posY) { svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); svst1(pn, b, aj_vec); ao ++; @@ -85,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON X ++; i ++; } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ #ifdef UNIT int temp = 0; for (int j = 0; j < n_active; j++) { @@ -114,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } while (i < m); - //printf("\n"); - - posY += n_active; js += n_active; pn = svwhilelt_b64(js, n); diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c index 86433f230..365be06c3 100644 --- a/kernel/arm64/trmm_ltcopy_sve_v1.c +++ b/kernel/arm64/trmm_ltcopy_sve_v1.c @@ -48,8 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - //printf("Using trmm_lt.\n"); - int sve_len = svcntd(); FLOAT *ao; @@ -67,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } i = 0; - /* svbool_t pm = svwhilelt_b64(i, m); */ - /* int m_active = svcntp_b64(svptrue_b64(), pm); */ do { - if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + if (X > posY) { ao ++; b += n_active; X ++; @@ -85,6 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON X ++; i ++; } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. 
*/ #ifdef UNIT int temp = 0; for (int j = 0; j < n_active; j++) { @@ -114,8 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } while (i < m); - //printf("\n"); - posY += n_active; js += n_active; diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c index 21f392b62..502b79928 100644 --- a/kernel/arm64/trmm_uncopy_sve_v1.c +++ b/kernel/arm64/trmm_uncopy_sve_v1.c @@ -47,10 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - //printf("Using trmm_un.\n"); - //printf("Using m %ld, n %ld.\n", m, n); - //printf("Using lda %ld.\n", lda); - //printf("Using posX %ld, posY %ld.\n", posX, posY); int sve_len = svcntd(); svint64_t index = svindex_s64(0LL, lda); @@ -70,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } i = 0; - /* svbool_t pm = svwhilelt_b64(i, m); */ - /* int m_active = svcntp_b64(svptrue_b64(), pm); */ do { - if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + if (X < posY) { svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); svst1(pn, b, aj_vec); ao ++; @@ -88,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON X ++; i ++; } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ #ifdef UNIT int temp = 0; for (int j = 0; j < n_active; j++) { @@ -117,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } while (i < m); - //printf("\n"); - - posY += n_active; js += n_active; pn = svwhilelt_b64(js, n); diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c index 38b88dc8c..b45cbd7da 100644 --- a/kernel/arm64/trmm_utcopy_sve_v1.c +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - //printf("Using trmm_ut.\n"); int sve_len = svcntd(); @@ -66,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } i = 0; - /* svbool_t pm = svwhilelt_b64(i, m); */ - /* int m_active = svcntp_b64(svptrue_b64(), pm); */ do { - if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + if (X < posY) { ao ++; b += n_active; X ++; @@ -83,7 +80,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += n_active; X ++; i ++; - } else { + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ #ifdef UNIT int temp = 0; for (int j = 0; j < n_active; j++) { @@ -113,9 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } while (i < m); - //printf("\n"); - - posY += n_active; js += n_active; pn = svwhilelt_b64(js, n); From 9388f05a3cab3b8850bb47c80ab8d10c1017692c Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 21 Nov 2021 18:33:43 +0100 Subject: [PATCH 549/681] configure SVE Makefile --- getarch.c | 4 ++-- kernel/Makefile.L3 | 42 ++++++++++++++++++++++++++++++++++++++++++ param.h | 6 ++++-- 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/getarch.c b/getarch.c index 7ae7591c5..fa1fb582e 100644 --- a/getarch.c +++ b/getarch.c @@ -1207,7 +1207,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ - "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" #define LIBNAME "armv8sve" #define CORENAME "ARMV8SVE" #endif @@ -1450,7 +1450,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=8388608 -DL2_LINESIZE=256 -DL2_ASSOCIATIVE=8 " \ "-DL3_SIZE=0 -DL3_LINESIZE=0 -DL3_ASSOCIATIVE=0 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ - "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" #define LIBNAME "a64fx" #define CORENAME "A64FX" #else diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 89691ef6f..05d91cded 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -77,6 +77,14 @@ ifeq ($(CORE), Z14) USE_TRMM = 1 endif +ifeq ($(CORE), A64FX) +HAVE_SVE = 1 +endif + +ifeq ($(CORE), ARMV8SVE) +HAVE_SVE = 1 +endif + ifdef USE_DIRECT_SGEMM ifndef SGEMMDIRECTKERNEL SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c @@ -1531,6 +1539,31 @@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N $(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef HAVE_SVE +$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_uncopy_sve_v1.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_uncopy_sve_v1.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_lncopy_sve_v1.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_lncopy_sve_v1.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_utcopy_sve_v1.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_utcopy_sve_v1.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_ltcopy_sve_v1.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_ltcopy_sve_v1.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ @@ -1554,6 +1587,7 @@ $(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M $(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -1789,11 +1823,19 @@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) 
: generic/symm_ucopy_$(DGEMM_UNROLL_N). $(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef HAVE_SVE +$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : arm64/symm_ucopy_sve.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : arm64/symm_lcopy_sve.c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ $(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ diff --git a/param.h b/param.h index ad0cecda7..bbc52fac4 100644 --- a/param.h +++ b/param.h @@ -3294,12 +3294,14 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(ARMV8SVE) +#elif defined(ARMV8SVE) || defined(A64FX) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 4 +/* When all BLAS3 routines are implemeted with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl". +Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */ +#define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 8 #define CGEMM_DEFAULT_UNROLL_M 8 From 9b9cb90bb138208502ba913c22b11a5fb3516156 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 22 Nov 2021 09:54:20 +0100 Subject: [PATCH 550/681] modify Makefile for SVE copy --- kernel/Makefile.L3 | 66 ++++++++++++++++++------------------ kernel/arm64/KERNEL.A64FX | 6 ++++ kernel/arm64/KERNEL.ARMV8SVE | 8 +++++ 3 files changed, 47 insertions(+), 33 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 05d91cded..695f8ae70 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -77,14 +77,6 @@ ifeq ($(CORE), Z14) USE_TRMM = 1 endif -ifeq ($(CORE), A64FX) -HAVE_SVE = 1 -endif - -ifeq ($(CORE), ARMV8SVE) -HAVE_SVE = 1 -endif - ifdef USE_DIRECT_SGEMM ifndef SGEMMDIRECTKERNEL SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c @@ -1539,49 +1531,55 @@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N $(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ -ifdef HAVE_SVE -$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_uncopy_sve_v1.c +ifdef DTRMMUNCOPY_M +$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ -$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_uncopy_sve_v1.c +$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_lncopy_sve_v1.c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ - 
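
For context on what these rules compile: the DTRMM*COPY_M variables introduced here name per-core packing sources, and the same source file is built several times with -DUNIT/-UUNIT and -DLOWER/-ULOWER selecting the variant. In scalar form, an upper, non-transposed trmm "uncopy" amounts to the following sketch (illustrative only — the names are hypothetical and this is not the SVE code in trmm_uncopy_sve_v1.c):

    /* Pack an m x n panel of the upper triangle of column-major A into b,
       zero-filling below the diagonal; sketch of the -DUNIT variant. */
    static void trmm_uncopy_sketch(long m, long n, const double *a, long lda,
                                   long posX, long posY, double *b) {
        for (long j = 0; j < n; j++) {
            for (long i = 0; i < m; i++) {
                long x = posX + i, y = posY + j;   /* absolute (row, column) */
                if (x > y)       *b++ = 0.0;       /* below the diagonal */
                else if (x == y) *b++ = 1.0;       /* -DUNIT: implicit unit diagonal;
                                                      -UUNIT stores a[x + y * lda] */
                else             *b++ = a[x + y * lda];
            }
        }
    }
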
-$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_lncopy_sve_v1.c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ - -$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_utcopy_sve_v1.c +else +$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ -$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_utcopy_sve_v1.c +$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif -$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_ltcopy_sve_v1.c +ifdef DTRMMLNCOPY_M +$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ -$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : arm64/trmm_ltcopy_sve_v1.c +$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ else -$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ - -$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ - $(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +ifdef DTRMMUTCOPY_M +$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ +$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef DTRMMLTCOPY_M +$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ @@ -1823,16 +1821,18 @@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N). 
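
The dsymm copies below are switched to the same scheme via DSYMMUCOPY_M and DSYMMLCOPY_M. Unlike the trmm packs, a symm copy materializes the full block from one stored triangle by mirroring across the diagonal — roughly the following scalar logic (a hedged sketch with hypothetical names, not the code in symm_ucopy_sve.c):

    /* symm "ucopy": upper-triangular storage, so accesses that fall below
       the diagonal are read from the transposed position instead. */
    static void symm_ucopy_sketch(long m, long n, const double *a, long lda,
                                  long posX, long posY, double *b) {
        for (long j = 0; j < n; j++) {
            for (long i = 0; i < m; i++) {
                long x = posX + i, y = posY + j;
                *b++ = (x <= y) ? a[x + y * lda]   /* stored upper entry */
                                : a[y + x * lda];  /* mirrored entry */
            }
        }
    }
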
$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ -ifdef HAVE_SVE -$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : arm64/symm_ucopy_sve.c +ifdef DSYMMUCOPY_M +$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ - -$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : arm64/symm_lcopy_sve.c - $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ else $(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef DSYMMLCOPY_M +$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ endif diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 4c2921e03..83536f12d 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -157,7 +157,13 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index dbf11fdca..1f605d10b 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -156,6 +156,14 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) From 531a28b6a0fdb908a82d5e1e6404146282b5af5a Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 22 Nov 2021 10:12:34 +0100 Subject: [PATCH 551/681] removed unused code (compiler warnings) --- kernel/arm64/symm_lcopy_sve.c | 5 +---- kernel/arm64/symm_ucopy_sve.c | 5 +---- kernel/arm64/trmm_lncopy_sve_v1.c | 1 - kernel/arm64/trmm_ltcopy_sve_v1.c | 2 -- kernel/arm64/trmm_uncopy_sve_v1.c | 1 - kernel/arm64/trmm_utcopy_sve_v1.c | 2 -- 6 files changed, 2 insertions(+), 14 deletions(-) diff --git a/kernel/arm64/symm_lcopy_sve.c b/kernel/arm64/symm_lcopy_sve.c index c3f7ea6b5..94a68ad7c 100644 --- a/kernel/arm64/symm_lcopy_sve.c +++ b/kernel/arm64/symm_lcopy_sve.c @@ -42,12 +42,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js, offset; - - FLOAT data01, data02, data03, data04; + BLASLONG i, offset; uint64_t sve_size = svcntd(); - svfloat64_t ao_vec; svint64_t posY_vec = svdup_s64(posY); svint64_t posX_vec 
= svdup_s64(posX); svint64_t lda_vec = svdup_s64(lda); diff --git a/kernel/arm64/symm_ucopy_sve.c b/kernel/arm64/symm_ucopy_sve.c index 3de416cf5..3cf18e0fd 100644 --- a/kernel/arm64/symm_ucopy_sve.c +++ b/kernel/arm64/symm_ucopy_sve.c @@ -42,12 +42,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ - BLASLONG i, js, offset; - - FLOAT data01, data02, data03, data04; + BLASLONG i, offset; uint64_t sve_size = svcntd(); - svfloat64_t ao_vec; svint64_t posY_vec = svdup_s64(posY); svint64_t posX_vec = svdup_s64(posX); svint64_t lda_vec = svdup_s64(lda); diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c index 6c38cb3eb..fc1b61325 100644 --- a/kernel/arm64/trmm_lncopy_sve_v1.c +++ b/kernel/arm64/trmm_lncopy_sve_v1.c @@ -48,7 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - int sve_len = svcntd(); svint64_t index = svindex_s64(0LL, lda); FLOAT *ao; diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c index 365be06c3..14c6762d2 100644 --- a/kernel/arm64/trmm_ltcopy_sve_v1.c +++ b/kernel/arm64/trmm_ltcopy_sve_v1.c @@ -48,8 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - int sve_len = svcntd(); - FLOAT *ao; js = 0; svbool_t pn = svwhilelt_b64(js, n); diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c index 502b79928..b8344d474 100644 --- a/kernel/arm64/trmm_uncopy_sve_v1.c +++ b/kernel/arm64/trmm_uncopy_sve_v1.c @@ -48,7 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - int sve_len = svcntd(); svint64_t index = svindex_s64(0LL, lda); FLOAT *ao; diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c index b45cbd7da..9be1c0abb 100644 --- a/kernel/arm64/trmm_utcopy_sve_v1.c +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -48,8 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - int sve_len = svcntd(); - FLOAT *ao; js = 0; svbool_t pn = svwhilelt_b64(js, n); From f4da23dcb6ac0de6a4c5fc07c704fb0b61ff5b25 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Tue, 23 Nov 2021 21:18:08 +0100 Subject: [PATCH 552/681] reduced dgemm_unroll_m to work with 128-bit sve --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index bbc52fac4..0ccc4a4d6 100644 --- a/param.h +++ b/param.h @@ -3301,7 +3301,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d /* When all BLAS3 routines are implemeted with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl". Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. 
*/ -#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 8 #define CGEMM_DEFAULT_UNROLL_M 8 From 9f59b19fcd3df536197fe4f9e4d4572101d7d00b Mon Sep 17 00:00:00 2001 From: Jia-Chen Date: Wed, 24 Nov 2021 21:51:45 +0800 Subject: [PATCH 553/681] MOD: optimize zgemm on cortex-A53/cortex-A55 --- kernel/arm64/KERNEL.CORTEXA53 | 2 +- kernel/arm64/KERNEL.CORTEXA55 | 2 +- kernel/arm64/zgemm_kernel_4x4_cortexa53.c | 625 ++++++++++++++++++++++ 3 files changed, 627 insertions(+), 2 deletions(-) create mode 100644 kernel/arm64/zgemm_kernel_4x4_cortexa53.c diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index aebd2e94c..22c7fd20a 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -182,7 +182,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55 index aebd2e94c..22c7fd20a 100644 --- a/kernel/arm64/KERNEL.CORTEXA55 +++ b/kernel/arm64/KERNEL.CORTEXA55 @@ -182,7 +182,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c diff --git a/kernel/arm64/zgemm_kernel_4x4_cortexa53.c b/kernel/arm64/zgemm_kernel_4x4_cortexa53.c new file mode 100644 index 000000000..4cdf85aa6 --- /dev/null +++ b/kernel/arm64/zgemm_kernel_4x4_cortexa53.c @@ -0,0 +1,625 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <common.h>
+#include <arm_neon.h>
+
+static inline float64x2_t set_f64x2(double lo, double hi) {
+    float64x2_t ret = vdupq_n_f64(0);
+    ret = vsetq_lane_f64(lo, ret, 0);
+    ret = vsetq_lane_f64(hi, ret, 1);
+    return ret;
+}
+
+static inline float64x2x2_t expand_alpha(double alpha_r, double alpha_i) {
+    float64x2x2_t ret = {{ set_f64x2(alpha_r, alpha_i), set_f64x2(-alpha_i, alpha_r) }};
+    return ret;
+}
+
+/*****************************************************************
+ * operation: *c += alpha * c_value //complex multiplication
+ * expanded_alpha: { { alpha_r, alpha_i }, { -alpha_i, alpha_r } }
+ * expanded_c: {{ arbr, aibr }, { arbi, aibi }}
+ ****************************************************************/
+static inline void store_1c(double *c, float64x2x2_t expanded_c,
+    float64x2x2_t expanded_alpha) {
+    float64x2_t ld = vld1q_f64(c);
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+    double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1);
+    double imag = vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0);
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+    double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1);
+    double imag = vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0);
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+    double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1);
+    double imag = -vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0);
+#else
+    double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1);
+    double imag = -vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0);
+#endif
+    ld = vfmaq_n_f64(ld, expanded_alpha.val[0], real);
+    vst1q_f64(c, vfmaq_n_f64(ld, expanded_alpha.val[1], imag));
+}
+
+static inline void pref_c_4(const double *c) {
+    __asm__ __volatile__("prfm pstl1keep,[%0]; prfm pstl1keep,[%0,#56]\n\t"::"r"(c):);
+}
+
+static inline float64x2x2_t add_ec(float64x2x2_t ec1, float64x2x2_t ec2) {
+    float64x2x2_t ret = {{ vaddq_f64(ec1.val[0], ec2.val[0]),
+        vaddq_f64(ec1.val[1], ec2.val[1]) }};
+    return ret;
+}
+
+static inline float64x2x2_t update_ec(float64x2x2_t ec, float64x2_t a, float64x2_t b) {
+    float64x2x2_t ret = {{ vfmaq_laneq_f64(ec.val[0], a, b, 0), vfmaq_laneq_f64(ec.val[1], a, b, 1) }};
+    return ret;
+}
+
+static inline float64x2x2_t init() {
+    float64x2x2_t ret = {{ vdupq_n_f64(0), vdupq_n_f64(0) }};
+    return ret;
+}
+
+static inline void kernel_1x1(const double *sa, const double *sb, double *C,
+    BLASLONG K, double alphar, double alphai) {
+
+    const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai);
+    float64x2x2_t c1, c2, c3, c4;
+    c1 = c2 = c3 = c4 = init();
+
+    for (; K > 3; K -= 4) {
+        float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2),
+            a3 = vld1q_f64(sa + 4), a4 =
vld1q_f64(sa + 6); sa += 8; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), + b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b2); + c3 = update_ec(c3, a3, b3); + c4 = update_ec(c4, a4, b4); + } + c1 = add_ec(c1, c2); + c3 = add_ec(c3, c4); + c1 = add_ec(c1, c3); + for (; K; K--) { + c1 = update_ec(c1, vld1q_f64(sa), vld1q_f64(sb)); sa += 2; sb += 2; + } + store_1c(C, c1, expanded_alpha); +} + +static inline void kernel_2x1(const double *sa, const double *sb, double *C, + BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), + a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b1); + c3 = update_ec(c3, a3, b2); + c4 = update_ec(c4, a4, b2); + } + c1 = add_ec(c1, c3); + c2 = add_ec(c2, c4); + if (K) { + float64x2_t b1 = vld1q_f64(sb); + c1 = update_ec(c1, vld1q_f64(sa), b1); + c2 = update_ec(c2, vld1q_f64(sa + 2), b1); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); +} + +static inline void kernel_1x2(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), + b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a1, b2); + c3 = update_ec(c3, a2, b3); + c4 = update_ec(c4, a2, b4); + } + c1 = add_ec(c1, c3); + c2 = add_ec(c2, c4); + if (K) { + float64x2_t a1 = vld1q_f64(sa); + c1 = update_ec(c1, a1, vld1q_f64(sb)); + c2 = update_ec(c2, a1, vld1q_f64(sb + 2)); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + LDC * 2, c2, expanded_alpha); +} + +static inline void kernel_2x2(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b1); + c3 = update_ec(c3, a1, b2); + c4 = update_ec(c4, a2, b2); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); C += LDC * 2; + store_1c(C, c3, expanded_alpha); + store_1c(C + 2, c4, expanded_alpha); +} + +static inline void kernel_4x1(const double *sa, const double *sb, double *C, + BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + pref_c_4(C); + + for (; K; K--) { + float64x2_t b1 = vld1q_f64(sb); sb += 2; + c1 = update_ec(c1, vld1q_f64(sa), b1); + c2 = update_ec(c2, vld1q_f64(sa + 2), b1); + c3 = update_ec(c3, vld1q_f64(sa + 4), b1); + c4 = update_ec(c4, vld1q_f64(sa + 6), b1); + sa += 8; + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); + store_1c(C + 4, c3, expanded_alpha); 
+ store_1c(C + 6, c4, expanded_alpha); +} + +static inline void kernel_4x2(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8; + c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init(); + pref_c_4(C); + pref_c_4(C + LDC * 2); + + for (; K; K--) { + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), + a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b1); + c3 = update_ec(c3, a3, b1); + c4 = update_ec(c4, a4, b1); + c5 = update_ec(c5, a1, b2); + c6 = update_ec(c6, a2, b2); + c7 = update_ec(c7, a3, b2); + c8 = update_ec(c8, a4, b2); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); + store_1c(C + 4, c3, expanded_alpha); + store_1c(C + 6, c4, expanded_alpha); C += LDC * 2; + store_1c(C, c5, expanded_alpha); + store_1c(C + 2, c6, expanded_alpha); + store_1c(C + 4, c7, expanded_alpha); + store_1c(C + 6, c8, expanded_alpha); +} + +static inline void kernel_1x4(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa); sa += 2; + c1 = update_ec(c1, a1, vld1q_f64(sb)); + c2 = update_ec(c2, a1, vld1q_f64(sb + 2)); + c3 = update_ec(c3, a1, vld1q_f64(sb + 4)); + c4 = update_ec(c4, a1, vld1q_f64(sb + 6)); + sb += 8; + } + store_1c(C, c1, expanded_alpha); C += LDC * 2; + store_1c(C, c2, expanded_alpha); C += LDC * 2; + store_1c(C, c3, expanded_alpha); C += LDC * 2; + store_1c(C, c4, expanded_alpha); +} + +static inline void kernel_2x4(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8; + c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init(); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), + b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b1); + c3 = update_ec(c3, a1, b2); + c4 = update_ec(c4, a2, b2); + c5 = update_ec(c5, a1, b3); + c6 = update_ec(c6, a2, b3); + c7 = update_ec(c7, a1, b4); + c8 = update_ec(c8, a2, b4); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); C += LDC * 2; + store_1c(C, c3, expanded_alpha); + store_1c(C + 2, c4, expanded_alpha); C += LDC * 2; + store_1c(C, c5, expanded_alpha); + store_1c(C + 2, c6, expanded_alpha); C += LDC * 2; + store_1c(C, c7, expanded_alpha); + store_1c(C + 2, c8, expanded_alpha); +} + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define FMLA_RI "fmla " +#define FMLA_IR "fmla " +#define FMLA_II "fmls " +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define FMLA_RI "fmls " +#define FMLA_IR "fmla " +#define FMLA_II "fmla " +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define FMLA_RI "fmla " +#define FMLA_IR "fmls " +#define FMLA_II "fmla " +#else +#define FMLA_RI "fmls " +#define FMLA_IR "fmls " +#define FMLA_II "fmls " +#endif +#define FMLA_RR "fmla " + +static inline void store_4c(double *C, 
float64x2_t up_r, float64x2_t up_i, + float64x2_t lo_r, float64x2_t lo_i, double alphar, double alphai) { + float64x2x2_t up = vld2q_f64(C), lo = vld2q_f64(C + 4); + up.val[0] = vfmaq_n_f64(up.val[0], up_r, alphar); + up.val[1] = vfmaq_n_f64(up.val[1], up_r, alphai); + lo.val[0] = vfmaq_n_f64(lo.val[0], lo_r, alphar); + lo.val[1] = vfmaq_n_f64(lo.val[1], lo_r, alphai); + up.val[0] = vfmsq_n_f64(up.val[0], up_i, alphai); + up.val[1] = vfmaq_n_f64(up.val[1], up_i, alphar); + lo.val[0] = vfmsq_n_f64(lo.val[0], lo_i, alphai); + lo.val[1] = vfmaq_n_f64(lo.val[1], lo_i, alphar); + vst2q_f64(C, up); + vst2q_f64(C + 4, lo); +} + +static inline void kernel_4x4(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + float64x2_t c1r, c1i, c2r, c2i; + float64x2_t c3r, c3i, c4r, c4i; + float64x2_t c5r, c5i, c6r, c6i; + float64x2_t c7r, c7i, c8r, c8i; + + const double *pref_ = C; + pref_c_4(pref_); pref_ += LDC * 2; + pref_c_4(pref_); pref_ += LDC * 2; + pref_c_4(pref_); pref_ += LDC * 2; + pref_c_4(pref_); + + __asm__ __volatile__( + "cmp %[K],#0\n\t" + "movi %[c1r].16b,#0; movi %[c1i].16b,#0; movi %[c2r].16b,#0; movi %[c2i].16b,#0\n\t" + "movi %[c3r].16b,#0; movi %[c3i].16b,#0; movi %[c4r].16b,#0; movi %[c4i].16b,#0\n\t" + "movi %[c5r].16b,#0; movi %[c5i].16b,#0; movi %[c6r].16b,#0; movi %[c6i].16b,#0\n\t" + "movi %[c7r].16b,#0; movi %[c7i].16b,#0; movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t" + "beq 4f; cmp %[K],#2\n\t" + "ld2 {v0.2d,v1.2d},[%[sa]],#32; ldp q4,q5,[%[sb]],#32\n\t" + "ld2 {v2.2d,v3.2d},[%[sa]],#32; ldr q6,[%[sb]]; ldr d7,[%[sb],#16]\n\t" + "ldr x0,[%[sb],#24]; add %[sb],%[sb],#32\n\t" + "beq 2f; blt 3f\n\t" + "1:\n\t" + "fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t" + FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t" + FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" + FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" + "fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t" + FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t" + FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" + FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" + "fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t" + FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t" + FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" + FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" + "fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t" + FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t" + FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" + FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" + "fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t" + FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t" + FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" + FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" + "fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t" + FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t" + FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" + FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" + "fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t" + FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t" + FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" + FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" + "fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t" + FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t" + FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" + FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" + "fmov v15.d[1],x0; ldr d4,[%[sb],#64]\n\t" + FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]; ldr x0,[%[sb],#72]\n\t" + FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" + FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" + "fmov v4.d[1],x0; ldr d5,[%[sb],#80]\n\t" + FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]; ldr x0,[%[sb],#88]\n\t" + FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" + FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" + "fmov 
v5.d[1],x0; ldr d0,[%[sa],#64]\n\t" + FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]; ldr x0,[%[sa],#80]\n\t" + FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t" + FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t" + "fmov v0.d[1],x0; ldr d1,[%[sa],#72]\n\t" + FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]; ldr x0,[%[sa],#88]\n\t" + FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t" + FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t" + "fmov v1.d[1],x0; ldr d2,[%[sa],#96]\n\t" + FMLA_II "%[c1r].2d,v9.2d,v12.d[1]; ldr x0,[%[sa],#112]\n\t" + FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t" + FMLA_IR "%[c1i].2d,v9.2d,v12.d[0]\n\t" + "fmov v2.d[1],x0; ldr d3,[%[sa],#104]\n\t" + FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]; ldr x0,[%[sa],#120]\n\t" + FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t" + FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t" + "fmov v3.d[1],x0; ldr d6,[%[sb],#96]\n\t" + FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]; ldr x0,[%[sb],#104]\n\t" + FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t" + FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t" + "fmov v6.d[1],x0; ldr d7,[%[sb],#112]\n\t" + FMLA_II "%[c4r].2d,v11.2d,v13.d[1]; ldr x0,[%[sb],#120]\n\t" + FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t" + FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]; prfm pldl1keep,[%[sa],#256]\n\t" + FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t" + FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]; prfm pldl1keep,[%[sa],#320]\n\t" + FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t" + FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]; prfm pldl1keep,[%[sb],#256]\n\t" + FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t" + FMLA_II "%[c6r].2d,v11.2d,v14.d[1]; prfm pldl1keep,[%[sb],#320]\n\t" + FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t" + FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#128\n\t" + FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t" + FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#128\n\t" + FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t" + FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t" + FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t" + FMLA_II "%[c8r].2d,v11.2d,v15.d[1]; cmp %[K],#2\n\t" + FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t" + FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; bgt 1b; blt 3f\n\t" + "2:\n\t" + "fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t" + FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t" + FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" + FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" + "fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t" + FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t" + FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" + FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" + "fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t" + FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t" + FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" + FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" + "fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t" + FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t" + FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" + FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" + "fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t" + FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t" + FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" + FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" + "fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t" + FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t" + FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" + FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" + "fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t" + FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t" + FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" + FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" + "fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t" + FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t" + FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" + FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" + "fmov v15.d[1],x0\n\t" + FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t" + 
FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" + FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" + FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t" + FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" + FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" + FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t" + FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t" + FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t" + FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]\n\t" + FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t" + FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t" + FMLA_II "%[c1r].2d,v9.2d,v12.d[1]\n\t" + FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t" + FMLA_IR "%[c1i].2d,v9.2d,v12.d[0]\n\t" + FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]\n\t" + FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t" + FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t" + FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]\n\t" + FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t" + FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t" + FMLA_II "%[c4r].2d,v11.2d,v13.d[1]\n\t" + FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t" + FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]\n\t" + FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t" + FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]\n\t" + FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t" + FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]\n\t" + FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t" + FMLA_II "%[c6r].2d,v11.2d,v14.d[1]\n\t" + FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t" + FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#64\n\t" + FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t" + FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#64\n\t" + FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t" + FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t" + FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t" + FMLA_II "%[c8r].2d,v11.2d,v15.d[1]\n\t" + FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t" + FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; b 4f\n\t" + "3:\n\t" + "fmov v7.d[1],x0\n\t" + FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]\n\t" + FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" + FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" + FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]\n\t" + FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" + FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" + FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]\n\t" + FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" + FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" + FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]\n\t" + FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" + FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" + FMLA_II "%[c3r].2d,v1.2d,v5.d[1]\n\t" + FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" + FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" + FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]\n\t" + FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" + FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" + FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]\n\t" + FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" + FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" + FMLA_II "%[c6r].2d,v3.2d,v6.d[1]\n\t" + FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" + FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" + FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t" + FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" + FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" + FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t" + FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" + FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" + FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t" + FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]; sub %[K],%[K],#1\n\t" + "4:\n\t" + :[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i), + [c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i), + [c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i), + [c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i), + [K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb) + ::"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"); + + store_4c(C, c1r, c1i, c2r, c2i, alphar, alphai); C += LDC * 2; + store_4c(C, c3r, c3i, c4r, c4i, 
alphar, alphai); C += LDC * 2; + store_4c(C, c5r, c5i, c6r, c6i, alphar, alphai); C += LDC * 2; + store_4c(C, c7r, c7i, c8r, c8i, alphar, alphai); +} + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, + FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { + + BLASLONG n_left = N; + for (; n_left >= 4; n_left -= 4) { + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 4; m_left -= 4) { + kernel_4x4(a_, sb, c_, LDC, K, alphar, alphai); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x4(a_, sb, c_, LDC, K, alphar, alphai); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x4(a_, sb, c_, LDC, K, alphar, alphai); + } + sb += 8 * K; + C += 8 * LDC; + } + if (n_left >= 2) { + n_left -= 2; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 4; m_left -= 4) { + kernel_4x2(a_, sb, c_, LDC, K, alphar, alphai); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x2(a_, sb, c_, LDC, K, alphar, alphai); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x2(a_, sb, c_, LDC, K, alphar, alphai); + } + sb += 4 * K; + C += 4 * LDC; + } + if (n_left) { + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 4; m_left -= 4) { + kernel_4x1(a_, sb, c_, K, alphar, alphai); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x1(a_, sb, c_, K, alphar, alphai); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x1(a_, sb, c_, K, alphar, alphai); + } + } + return 0; +} + From fb891f33da3428602d765905b1395a9a7e798e4a Mon Sep 17 00:00:00 2001 From: Rafael Cardoso Fernandes Sousa Date: Wed, 24 Nov 2021 14:07:28 -0600 Subject: [PATCH 554/681] Fix the cmake parser to identify more patterns --- cmake/utils.cmake | 192 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 147 insertions(+), 45 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 01b489f2a..c5ee65384 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -15,35 +15,83 @@ endfunction () # Reads a Makefile into CMake vars. macro(ParseMakefileVars MAKEFILE_IN) message(STATUS "Reading vars from ${MAKEFILE_IN}...") - set (IfElse 0) - set (ElseSeen 0) + set (C_COMPILER ${CMAKE_C_COMPILER_ID}) + set (IfElse 0) + set (ElseSeen 0) + set (SkipIfs 0) + set (SkipElse 0) file(STRINGS ${MAKEFILE_IN} makefile_contents) foreach (makefile_line ${makefile_contents}) -#message(STATUS "parsing ${makefile_line}") + #message(STATUS "parsing ${makefile_line}") + # Skip the entire scope of the else statement given that the if statement that precedes it has the valid condition. + # The variable SkipIfs is used to identify which endif statement closes the scope of the else statement. + if (${SkipElse} EQUAL 1) + #message(STATUS "skipping ${makefile_line}") + string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + MATH(EXPR SkipIfs "${SkipIfs}+1") + endif () + string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + if (${SkipIfs} EQUAL 0) + set (SkipElse 0) + else () + MATH(EXPR SkipIfs "${SkipIfs}-1") + endif () + endif () + continue () + endif () + # The variable IfElse is greater than 0 if and only if the previously parsed line is an if statement. 
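+    # (IfElse encoding: 1 means the preceding condition evaluated true, so the if
+    # branch is kept and the else branch is skipped; 2 means it evaluated false,
+    # so lines are skipped until the matching else or endif. ElseSeen records
+    # whether that else has been reached yet.)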
if (${IfElse} GREATER 0) + # If the current scope is the one that has to be skipped, the if/endif/else statements + # along with it till the endif that closes the current scope have to be ignored as well. + string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) + #message(STATUS "skipping ${makefile_line}") + MATH(EXPR SkipIfs "${SkipIfs}+1") + continue () + endif () + endif () string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") -# message(STATUS "ENDIF ${makefile_line}") - set (IfElse 0) - set (ElseSeen 0) + if (${SkipIfs} EQUAL 0) + #message(STATUS "ENDIF ${makefile_line}") + set (IfElse 0) + set (ElseSeen 0) + else () + #message(STATUS "skipping ${makefile_line}") + MATH(EXPR SkipIfs "${SkipIfs}-1") + endif () continue () endif () string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") - if (NOT "${line_match}" STREQUAL "") -# message(STATUS "ELSE ${makefile_line}") - set (ElseSeen 1) - continue () - endif() - if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) -# message(STATUS "skipping ${makefile_line}") - continue () + if (NOT "${line_match}" STREQUAL "") + if (${SkipIfs} EQUAL 0) + #message(STATUS "ELSE ${makefile_line}") + set (ElseSeen 1) + else () + #message(STATUS "skipping ${makefile_line}") + endif () + continue () + endif() + # Skip the lines that are not part of the path that has to be taken. + if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1) OR (${SkipIfs} GREATER 0)) + #message(STATUS "skipping ${makefile_line}") + continue () endif () - endif () + endif () + # Skip commented lines (the ones that start with '#') + string(REGEX MATCH "[ \t]*\\#.*$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "skipping ${makefile_line}") + continue () + endif () string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") -#message(STATUS "match on ${line_match}") + #message(STATUS "match on ${line_match}") set(var_name ${CMAKE_MATCH_1}) -# set(var_value ${CMAKE_MATCH_2}) + #set(var_value ${CMAKE_MATCH_2}) string(STRIP ${CMAKE_MATCH_2} var_value) # check for Makefile variables in the string, e.g. $(TSUFFIX) string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) @@ -54,39 +102,93 @@ macro(ParseMakefileVars MAKEFILE_IN) string(REPLACE "$(${make_var})" "${${make_var}}" var_value ${var_value}) endforeach () set(${var_name} ${var_value}) - else () - string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") - if (NOT "${line_match}" STREQUAL "") -#message(STATUS "match on include ${line_match}") - ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) + continue () + endif () + # Include a new file to be parsed + string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "match on include ${line_match}") + ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) + continue () + endif () + # The if statement that precedes this else has the path taken + # Thus, this else statement has to be skipped. 
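+    # (SkipElse stays set until the matching endif; if statements nested inside
+    # the skipped else increment SkipIfs above, so their endifs do not end the
+    # skip prematurely.)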
+ string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "skipping ${makefile_line}") + set (SkipElse 1) + continue() + endif() + # Example 1: ifdef HAVE_MSA + # Example 2: ifndef ZNRM2KERNEL + string(REGEX MATCH "(ifdef|ifndef) ([0-9_A-Z]+)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}") + set (ElseSeen 0) + if (DEFINED ${CMAKE_MATCH_2}) + if (${CMAKE_MATCH_1} STREQUAL "ifdef") + #message (STATUS "condition is true") + set (IfElse 1) + else () + set (IfElse 2) + endif () else () -# message(STATUS "unmatched line ${line_match}") - string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") - if (NOT "${line_match}" STREQUAL "") -# message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") - if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) -# message (STATUS "condition is true") - set (IfElse 1) - else () - set (IfElse 2) - endif () + if (${CMAKE_MATCH_1} STREQUAL "ifdef") + set (IfElse 2) else () - string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") - if (NOT "${line_match}" STREQUAL "") -# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") - if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER) - set (CMAKE_MATCH_1 CMAKE_C_COMPILER) - endif () - if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) -# message (STATUS "condition is true") - set (IfElse 1) - else () - set (IfElse 2) - endif () - endif () + #message (STATUS "condition is true") + set (IfElse 1) + endif () + endif () + continue () + endif () + # Example 1: ifeq ($(SGEMM_UNROLL_M), 16) + # Example 2: ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) + # Example 3: ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2) + # Ignore the second group since (?:...) 
does not work on cmake + string(REGEX MATCH "ifeq \\(\\$\\(([0-9_A-Z]+)\\)(([0-9_A-Za-z]*)\\$\\(([0-9_A-Z]+)\\))?,[ \t]*([0-9_A-Za-z]+)\\)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4} fourth: ${CMAKE_MATCH_5}") + if (DEFINED ${CMAKE_MATCH_1}) + if (DEFINED ${CMAKE_MATCH_4}) + set (STR ${${CMAKE_MATCH_1}}${CMAKE_MATCH_3}${${CMAKE_MATCH_4}}) + else () + set (STR ${${CMAKE_MATCH_1}}) + endif () + if (${STR} STREQUAL ${CMAKE_MATCH_5}) + #message (STATUS "condition is true") + set (IfElse 1) + continue () + endif () + endif () + set (IfElse 2) + continue () + endif () + # Example 1 (Group 3): ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) + # Example 2 (Group 4): ifneq ($(C_COMPILER), PGI) + string(REGEX MATCH "ifneq \\(\\$\\(([0-9_A-Z]+)\\),[ \t]*(\\$\\(([0-9_A-Z]+)\\)|([0-9_A-Z]+))\\)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4}") + set (ElseSeen 0) + set (HasValidGroup 0) + if (DEFINED ${CMAKE_MATCH_3}) + set (HasValidGroup 1) + set (STR ${${CMAKE_MATCH_3}}) + elseif (NOT ${CMAKE_MATCH_4} STREQUAL "") + set (HasValidGroup 1) + set (STR ${CMAKE_MATCH_4}) + endif () + if (DEFINED ${CMAKE_MATCH_1} AND ${HasValidGroup} EQUAL 1) + if (NOT (${${CMAKE_MATCH_1}} STREQUAL ${STR})) + #message (STATUS "condition is true") + set (IfElse 1) + continue () endif () endif () + set (IfElse 2) + continue () endif () + #message(STATUS "unmatched line ${line_match}") endforeach () endmacro () From d5c9353f1bcf733fb666d1788d061c3f5107d5d5 Mon Sep 17 00:00:00 2001 From: Rafael Cardoso Fernandes Sousa Date: Wed, 24 Nov 2021 20:07:20 -0600 Subject: [PATCH 555/681] Modify the order that cmake set the KERNEL variables (generic now is fallback) --- cmake/kernel.cmake | 396 +++++++++++++++++++++--------------------- kernel/CMakeLists.txt | 4 +- 2 files changed, 202 insertions(+), 198 deletions(-) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 09ca5eb57..efededcf3 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -1,214 +1,218 @@ # helper functions for the kernel CMakeLists.txt +function(SetFallback KERNEL SOURCE_PATH) + if (NOT (DEFINED ${KERNEL})) + set(${KERNEL} ${SOURCE_PATH} PARENT_SCOPE) + endif () +endfunction() -# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. 
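+# Set the fallback filenames for L1 objects. SetFallback applies a default only
+# when the per-core KERNEL file, parsed beforehand, has not already defined the
+# variable, so kernel-specific overrides always win.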
macro(SetDefaultL1) - set(SAMAXKERNEL amax.S) - set(DAMAXKERNEL amax.S) - set(QAMAXKERNEL amax.S) - set(CAMAXKERNEL zamax.S) - set(ZAMAXKERNEL zamax.S) - set(XAMAXKERNEL zamax.S) - set(SAMINKERNEL amin.S) - set(DAMINKERNEL amin.S) - set(QAMINKERNEL amin.S) - set(CAMINKERNEL zamin.S) - set(ZAMINKERNEL zamin.S) - set(XAMINKERNEL zamin.S) - set(SMAXKERNEL max.S) - set(DMAXKERNEL max.S) - set(QMAXKERNEL max.S) - set(SMINKERNEL min.S) - set(DMINKERNEL min.S) - set(QMINKERNEL min.S) - set(ISAMAXKERNEL iamax.S) - set(IDAMAXKERNEL iamax.S) - set(IQAMAXKERNEL iamax.S) - set(ICAMAXKERNEL izamax.S) - set(IZAMAXKERNEL izamax.S) - set(IXAMAXKERNEL izamax.S) - set(ISAMINKERNEL iamin.S) - set(IDAMINKERNEL iamin.S) - set(IQAMINKERNEL iamin.S) - set(ICAMINKERNEL izamin.S) - set(IZAMINKERNEL izamin.S) - set(IXAMINKERNEL izamin.S) - set(ISMAXKERNEL iamax.S) - set(IDMAXKERNEL iamax.S) - set(IQMAXKERNEL iamax.S) - set(ISMINKERNEL iamin.S) - set(IDMINKERNEL iamin.S) - set(IQMINKERNEL iamin.S) - set(SASUMKERNEL asum.S) - set(DASUMKERNEL asum.S) - set(CASUMKERNEL zasum.S) - set(ZASUMKERNEL zasum.S) - set(QASUMKERNEL asum.S) - set(XASUMKERNEL zasum.S) - set(SAXPYKERNEL axpy.S) - set(DAXPYKERNEL axpy.S) - set(CAXPYKERNEL zaxpy.S) - set(ZAXPYKERNEL zaxpy.S) - set(QAXPYKERNEL axpy.S) - set(XAXPYKERNEL zaxpy.S) - set(SCOPYKERNEL copy.S) - set(DCOPYKERNEL copy.S) - set(CCOPYKERNEL zcopy.S) - set(ZCOPYKERNEL zcopy.S) - set(QCOPYKERNEL copy.S) - set(XCOPYKERNEL zcopy.S) - set(SDOTKERNEL dot.S) - set(DDOTKERNEL dot.S) - set(CDOTKERNEL zdot.S) - set(ZDOTKERNEL zdot.S) - set(QDOTKERNEL dot.S) - set(XDOTKERNEL zdot.S) - set(SNRM2KERNEL nrm2.S) - set(DNRM2KERNEL nrm2.S) - set(QNRM2KERNEL nrm2.S) - set(CNRM2KERNEL znrm2.S) - set(ZNRM2KERNEL znrm2.S) - set(XNRM2KERNEL znrm2.S) - set(SROTKERNEL rot.S) - set(DROTKERNEL rot.S) - set(QROTKERNEL rot.S) - set(CROTKERNEL zrot.S) - set(ZROTKERNEL zrot.S) - set(XROTKERNEL zrot.S) - set(SSCALKERNEL scal.S) - set(DSCALKERNEL scal.S) - set(CSCALKERNEL zscal.S) - set(ZSCALKERNEL zscal.S) - set(QSCALKERNEL scal.S) - set(XSCALKERNEL zscal.S) - set(SSWAPKERNEL swap.S) - set(DSWAPKERNEL swap.S) - set(CSWAPKERNEL zswap.S) - set(ZSWAPKERNEL zswap.S) - set(QSWAPKERNEL swap.S) - set(XSWAPKERNEL zswap.S) - set(SGEMVNKERNEL gemv_n.S) - set(SGEMVTKERNEL gemv_t.S) - set(DGEMVNKERNEL gemv_n.S) - set(DGEMVTKERNEL gemv_t.S) - set(CGEMVNKERNEL zgemv_n.S) - set(CGEMVTKERNEL zgemv_t.S) - set(ZGEMVNKERNEL zgemv_n.S) - set(ZGEMVTKERNEL zgemv_t.S) - set(QGEMVNKERNEL gemv_n.S) - set(QGEMVTKERNEL gemv_t.S) - set(XGEMVNKERNEL zgemv_n.S) - set(XGEMVTKERNEL zgemv_t.S) - set(SCABS_KERNEL ../generic/cabs.c) - set(DCABS_KERNEL ../generic/cabs.c) - set(QCABS_KERNEL ../generic/cabs.c) - set(LSAME_KERNEL ../generic/lsame.c) - set(SAXPBYKERNEL ../arm/axpby.c) - set(DAXPBYKERNEL ../arm/axpby.c) - set(CAXPBYKERNEL ../arm/zaxpby.c) - set(ZAXPBYKERNEL ../arm/zaxpby.c) - set(SSUMKERNEL sum.S) - set(DSUMKERNEL sum.S) - set(CSUMKERNEL zsum.S) - set(ZSUMKERNEL zsum.S) - set(QSUMKERNEL sum.S) - set(XSUMKERNEL zsum.S) + SetFallback(SAMAXKERNEL amax.S) + SetFallback(DAMAXKERNEL amax.S) + SetFallback(QAMAXKERNEL amax.S) + SetFallback(CAMAXKERNEL zamax.S) + SetFallback(ZAMAXKERNEL zamax.S) + SetFallback(XAMAXKERNEL zamax.S) + SetFallback(SAMINKERNEL amin.S) + SetFallback(DAMINKERNEL amin.S) + SetFallback(QAMINKERNEL amin.S) + SetFallback(CAMINKERNEL zamin.S) + SetFallback(ZAMINKERNEL zamin.S) + SetFallback(XAMINKERNEL zamin.S) + SetFallback(SMAXKERNEL max.S) + SetFallback(DMAXKERNEL max.S) + SetFallback(QMAXKERNEL max.S) + 
SetFallback(SMINKERNEL min.S) + SetFallback(DMINKERNEL min.S) + SetFallback(QMINKERNEL min.S) + SetFallback(ISAMAXKERNEL iamax.S) + SetFallback(IDAMAXKERNEL iamax.S) + SetFallback(IQAMAXKERNEL iamax.S) + SetFallback(ICAMAXKERNEL izamax.S) + SetFallback(IZAMAXKERNEL izamax.S) + SetFallback(IXAMAXKERNEL izamax.S) + SetFallback(ISAMINKERNEL iamin.S) + SetFallback(IDAMINKERNEL iamin.S) + SetFallback(IQAMINKERNEL iamin.S) + SetFallback(ICAMINKERNEL izamin.S) + SetFallback(IZAMINKERNEL izamin.S) + SetFallback(IXAMINKERNEL izamin.S) + SetFallback(ISMAXKERNEL iamax.S) + SetFallback(IDMAXKERNEL iamax.S) + SetFallback(IQMAXKERNEL iamax.S) + SetFallback(ISMINKERNEL iamin.S) + SetFallback(IDMINKERNEL iamin.S) + SetFallback(IQMINKERNEL iamin.S) + SetFallback(SASUMKERNEL asum.S) + SetFallback(DASUMKERNEL asum.S) + SetFallback(CASUMKERNEL zasum.S) + SetFallback(ZASUMKERNEL zasum.S) + SetFallback(QASUMKERNEL asum.S) + SetFallback(XASUMKERNEL zasum.S) + SetFallback(SAXPYKERNEL axpy.S) + SetFallback(DAXPYKERNEL axpy.S) + SetFallback(CAXPYKERNEL zaxpy.S) + SetFallback(ZAXPYKERNEL zaxpy.S) + SetFallback(QAXPYKERNEL axpy.S) + SetFallback(XAXPYKERNEL zaxpy.S) + SetFallback(SCOPYKERNEL copy.S) + SetFallback(DCOPYKERNEL copy.S) + SetFallback(CCOPYKERNEL zcopy.S) + SetFallback(ZCOPYKERNEL zcopy.S) + SetFallback(QCOPYKERNEL copy.S) + SetFallback(XCOPYKERNEL zcopy.S) + SetFallback(SDOTKERNEL dot.S) + SetFallback(DDOTKERNEL dot.S) + SetFallback(CDOTKERNEL zdot.S) + SetFallback(ZDOTKERNEL zdot.S) + SetFallback(QDOTKERNEL dot.S) + SetFallback(XDOTKERNEL zdot.S) + SetFallback(SNRM2KERNEL nrm2.S) + SetFallback(DNRM2KERNEL nrm2.S) + SetFallback(QNRM2KERNEL nrm2.S) + SetFallback(CNRM2KERNEL znrm2.S) + SetFallback(ZNRM2KERNEL znrm2.S) + SetFallback(XNRM2KERNEL znrm2.S) + SetFallback(SROTKERNEL rot.S) + SetFallback(DROTKERNEL rot.S) + SetFallback(QROTKERNEL rot.S) + SetFallback(CROTKERNEL zrot.S) + SetFallback(ZROTKERNEL zrot.S) + SetFallback(XROTKERNEL zrot.S) + SetFallback(SSCALKERNEL scal.S) + SetFallback(DSCALKERNEL scal.S) + SetFallback(CSCALKERNEL zscal.S) + SetFallback(ZSCALKERNEL zscal.S) + SetFallback(QSCALKERNEL scal.S) + SetFallback(XSCALKERNEL zscal.S) + SetFallback(SSWAPKERNEL swap.S) + SetFallback(DSWAPKERNEL swap.S) + SetFallback(CSWAPKERNEL zswap.S) + SetFallback(ZSWAPKERNEL zswap.S) + SetFallback(QSWAPKERNEL swap.S) + SetFallback(XSWAPKERNEL zswap.S) + SetFallback(SGEMVNKERNEL gemv_n.S) + SetFallback(SGEMVTKERNEL gemv_t.S) + SetFallback(DGEMVNKERNEL gemv_n.S) + SetFallback(DGEMVTKERNEL gemv_t.S) + SetFallback(CGEMVNKERNEL zgemv_n.S) + SetFallback(CGEMVTKERNEL zgemv_t.S) + SetFallback(ZGEMVNKERNEL zgemv_n.S) + SetFallback(ZGEMVTKERNEL zgemv_t.S) + SetFallback(QGEMVNKERNEL gemv_n.S) + SetFallback(QGEMVTKERNEL gemv_t.S) + SetFallback(XGEMVNKERNEL zgemv_n.S) + SetFallback(XGEMVTKERNEL zgemv_t.S) + SetFallback(SCABS_KERNEL ../generic/cabs.c) + SetFallback(DCABS_KERNEL ../generic/cabs.c) + SetFallback(QCABS_KERNEL ../generic/cabs.c) + SetFallback(LSAME_KERNEL ../generic/lsame.c) + SetFallback(SAXPBYKERNEL ../arm/axpby.c) + SetFallback(DAXPBYKERNEL ../arm/axpby.c) + SetFallback(CAXPBYKERNEL ../arm/zaxpby.c) + SetFallback(ZAXPBYKERNEL ../arm/zaxpby.c) + SetFallback(SSUMKERNEL sum.S) + SetFallback(DSUMKERNEL sum.S) + SetFallback(CSUMKERNEL zsum.S) + SetFallback(ZSUMKERNEL zsum.S) + SetFallback(QSUMKERNEL sum.S) + SetFallback(XSUMKERNEL zsum.S) if (BUILD_BFLOAT16) - set(SHAMINKERNEL ../arm/amin.c) - set(SHAMAXKERNEL ../arm/amax.c) - set(SHMAXKERNEL ../arm/max.c) - set(SHMINKERNEL ../arm/min.c) - set(ISHAMAXKERNEL 
../arm/iamax.c) - set(ISHAMINKERNEL ../arm/iamin.c) - set(ISHMAXKERNEL ../arm/imax.c) - set(ISHMINKERNEL ../arm/imin.c) - set(SHASUMKERNEL ../arm/asum.c) - set(SHAXPYKERNEL ../arm/axpy.c) - set(SHAXPBYKERNEL ../arm/axpby.c) - set(SHCOPYKERNEL ../arm/copy.c) - set(SBDOTKERNEL ../x86_64/sbdot.c) - set(SHROTKERNEL ../arm/rot.c) - set(SHSCALKERNEL ../arm/scal.c) - set(SHNRM2KERNEL ../arm/nrm2.c) - set(SHSUMKERNEL ../arm/sum.c) - set(SHSWAPKERNEL ../arm/swap.c) - set(TOBF16KERNEL ../x86_64/tobf16.c) - set(BF16TOKERNEL ../x86_64/bf16to.c) - set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) - set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) + SetFallback(SHAMINKERNEL ../arm/amin.c) + SetFallback(SHAMAXKERNEL ../arm/amax.c) + SetFallback(SHMAXKERNEL ../arm/max.c) + SetFallback(SHMINKERNEL ../arm/min.c) + SetFallback(ISHAMAXKERNEL ../arm/iamax.c) + SetFallback(ISHAMINKERNEL ../arm/iamin.c) + SetFallback(ISHMAXKERNEL ../arm/imax.c) + SetFallback(ISHMINKERNEL ../arm/imin.c) + SetFallback(SHASUMKERNEL ../arm/asum.c) + SetFallback(SHAXPYKERNEL ../arm/axpy.c) + SetFallback(SHAXPBYKERNEL ../arm/axpby.c) + SetFallback(SHCOPYKERNEL ../arm/copy.c) + SetFallback(SBDOTKERNEL ../x86_64/sbdot.c) + SetFallback(SHROTKERNEL ../arm/rot.c) + SetFallback(SHSCALKERNEL ../arm/scal.c) + SetFallback(SHNRM2KERNEL ../arm/nrm2.c) + SetFallback(SHSUMKERNEL ../arm/sum.c) + SetFallback(SHSWAPKERNEL ../arm/swap.c) + SetFallback(TOBF16KERNEL ../x86_64/tobf16.c) + SetFallback(BF16TOKERNEL ../x86_64/bf16to.c) + SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) endif () endmacro () macro(SetDefaultL2) - set(SGEMVNKERNEL ../arm/gemv_n.c) - set(SGEMVTKERNEL ../arm/gemv_t.c) - set(DGEMVNKERNEL gemv_n.S) - set(DGEMVTKERNEL gemv_t.S) - set(CGEMVNKERNEL zgemv_n.S) - set(CGEMVTKERNEL zgemv_t.S) - set(ZGEMVNKERNEL zgemv_n.S) - set(ZGEMVTKERNEL zgemv_t.S) - set(QGEMVNKERNEL gemv_n.S) - set(QGEMVTKERNEL gemv_t.S) - set(XGEMVNKERNEL zgemv_n.S) - set(XGEMVTKERNEL zgemv_t.S) - set(SGERKERNEL ../generic/ger.c) - set(DGERKERNEL ../generic/ger.c) - set(QGERKERNEL ../generic/ger.c) - set(CGERUKERNEL ../generic/zger.c) - set(CGERCKERNEL ../generic/zger.c) - set(ZGERUKERNEL ../generic/zger.c) - set(ZGERCKERNEL ../generic/zger.c) - set(XGERUKERNEL ../generic/zger.c) - set(XGERCKERNEL ../generic/zger.c) - set(SSYMV_U_KERNEL ../generic/symv_k.c) - set(SSYMV_L_KERNEL ../generic/symv_k.c) - set(DSYMV_U_KERNEL ../generic/symv_k.c) - set(DSYMV_L_KERNEL ../generic/symv_k.c) - set(QSYMV_U_KERNEL ../generic/symv_k.c) - set(QSYMV_L_KERNEL ../generic/symv_k.c) - set(CSYMV_U_KERNEL ../generic/zsymv_k.c) - set(CSYMV_L_KERNEL ../generic/zsymv_k.c) - set(ZSYMV_U_KERNEL ../generic/zsymv_k.c) - set(ZSYMV_L_KERNEL ../generic/zsymv_k.c) - set(XSYMV_U_KERNEL ../generic/zsymv_k.c) - set(XSYMV_L_KERNEL ../generic/zsymv_k.c) - set(CHEMV_U_KERNEL ../generic/zhemv_k.c) - set(CHEMV_L_KERNEL ../generic/zhemv_k.c) - set(CHEMV_V_KERNEL ../generic/zhemv_k.c) - set(CHEMV_M_KERNEL ../generic/zhemv_k.c) - set(ZHEMV_U_KERNEL ../generic/zhemv_k.c) - set(ZHEMV_L_KERNEL ../generic/zhemv_k.c) - set(ZHEMV_V_KERNEL ../generic/zhemv_k.c) - set(ZHEMV_M_KERNEL ../generic/zhemv_k.c) - set(XHEMV_U_KERNEL ../generic/zhemv_k.c) - set(XHEMV_L_KERNEL ../generic/zhemv_k.c) - set(XHEMV_V_KERNEL ../generic/zhemv_k.c) - set(XHEMV_M_KERNEL ../generic/zhemv_k.c) + SetFallback(SGEMVNKERNEL ../arm/gemv_n.c) + SetFallback(SGEMVTKERNEL ../arm/gemv_t.c) + SetFallback(DGEMVNKERNEL gemv_n.S) + SetFallback(DGEMVTKERNEL gemv_t.S) + SetFallback(CGEMVNKERNEL zgemv_n.S) + 
SetFallback(CGEMVTKERNEL zgemv_t.S) + SetFallback(ZGEMVNKERNEL zgemv_n.S) + SetFallback(ZGEMVTKERNEL zgemv_t.S) + SetFallback(QGEMVNKERNEL gemv_n.S) + SetFallback(QGEMVTKERNEL gemv_t.S) + SetFallback(XGEMVNKERNEL zgemv_n.S) + SetFallback(XGEMVTKERNEL zgemv_t.S) + SetFallback(SGERKERNEL ../generic/ger.c) + SetFallback(DGERKERNEL ../generic/ger.c) + SetFallback(QGERKERNEL ../generic/ger.c) + SetFallback(CGERUKERNEL ../generic/zger.c) + SetFallback(CGERCKERNEL ../generic/zger.c) + SetFallback(ZGERUKERNEL ../generic/zger.c) + SetFallback(ZGERCKERNEL ../generic/zger.c) + SetFallback(XGERUKERNEL ../generic/zger.c) + SetFallback(XGERCKERNEL ../generic/zger.c) + SetFallback(SSYMV_U_KERNEL ../generic/symv_k.c) + SetFallback(SSYMV_L_KERNEL ../generic/symv_k.c) + SetFallback(DSYMV_U_KERNEL ../generic/symv_k.c) + SetFallback(DSYMV_L_KERNEL ../generic/symv_k.c) + SetFallback(QSYMV_U_KERNEL ../generic/symv_k.c) + SetFallback(QSYMV_L_KERNEL ../generic/symv_k.c) + SetFallback(CSYMV_U_KERNEL ../generic/zsymv_k.c) + SetFallback(CSYMV_L_KERNEL ../generic/zsymv_k.c) + SetFallback(ZSYMV_U_KERNEL ../generic/zsymv_k.c) + SetFallback(ZSYMV_L_KERNEL ../generic/zsymv_k.c) + SetFallback(XSYMV_U_KERNEL ../generic/zsymv_k.c) + SetFallback(XSYMV_L_KERNEL ../generic/zsymv_k.c) + SetFallback(CHEMV_U_KERNEL ../generic/zhemv_k.c) + SetFallback(CHEMV_L_KERNEL ../generic/zhemv_k.c) + SetFallback(CHEMV_V_KERNEL ../generic/zhemv_k.c) + SetFallback(CHEMV_M_KERNEL ../generic/zhemv_k.c) + SetFallback(ZHEMV_U_KERNEL ../generic/zhemv_k.c) + SetFallback(ZHEMV_L_KERNEL ../generic/zhemv_k.c) + SetFallback(ZHEMV_V_KERNEL ../generic/zhemv_k.c) + SetFallback(ZHEMV_M_KERNEL ../generic/zhemv_k.c) + SetFallback(XHEMV_U_KERNEL ../generic/zhemv_k.c) + SetFallback(XHEMV_L_KERNEL ../generic/zhemv_k.c) + SetFallback(XHEMV_V_KERNEL ../generic/zhemv_k.c) + SetFallback(XHEMV_M_KERNEL ../generic/zhemv_k.c) if (BUILD_BFLOAT16) - set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) - set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) - set(SHGERKERNEL ../generic/ger.c) + SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) + SetFallback(SHGERKERNEL ../generic/ger.c) endif () endmacro () macro(SetDefaultL3) - set(SGEADD_KERNEL ../generic/geadd.c) - set(DGEADD_KERNEL ../generic/geadd.c) - set(CGEADD_KERNEL ../generic/zgeadd.c) - set(ZGEADD_KERNEL ../generic/zgeadd.c) + SetFallback(SGEADD_KERNEL ../generic/geadd.c) + SetFallback(DGEADD_KERNEL ../generic/geadd.c) + SetFallback(CGEADD_KERNEL ../generic/zgeadd.c) + SetFallback(ZGEADD_KERNEL ../generic/zgeadd.c) if (BUILD_BFLOAT16) - set(SHGEADD_KERNEL ../generic/geadd.c) - set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) - set(SBGEMM_BETA ../generic/gemm_beta.c) - set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) - set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) - set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) - set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) - set(SBGEMMINCOPYOBJ sbgemm_incopy.o) - set(SBGEMMITCOPYOBJ sbgemm_itcopy.o) - set(SBGEMMONCOPYOBJ sbgemm_oncopy.o) - set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) + SetFallback(SHGEADD_KERNEL ../generic/geadd.c) + SetFallback(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) + SetFallback(SBGEMM_BETA ../generic/gemm_beta.c) + SetFallback(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) + SetFallback(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) + SetFallback(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) + SetFallback(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) + SetFallback(SBGEMMINCOPYOBJ sbgemm_incopy.o) + SetFallback(SBGEMMITCOPYOBJ sbgemm_itcopy.o) + SetFallback(SBGEMMONCOPYOBJ 
sbgemm_oncopy.o)
+    SetFallback(SBGEMMOTCOPYOBJ  sbgemm_otcopy.o)
   endif ()
 endmacro ()
diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt
index 9c8460723..fb2c94fc7 100644
--- a/kernel/CMakeLists.txt
+++ b/kernel/CMakeLists.txt
@@ -9,11 +9,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
   if (${DYNAMIC_ARCH})
     include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
   endif ()
+  ParseMakefileVars("${KERNELDIR}/KERNEL")
+  ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}")
   SetDefaultL1()
   SetDefaultL2()
   SetDefaultL3()
-  ParseMakefileVars("${KERNELDIR}/KERNEL")
-  ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}")
   set(KERNEL_INTERFACE common_level1.h common_level2.h common_level3.h)
   if(NOT NO_LAPACK)

From 5c1cd5e0c2347fef3114c2b431e4bc990193031e Mon Sep 17 00:00:00 2001
From: Jia-Chen
Date: Thu, 25 Nov 2021 22:48:48 +0800
Subject: [PATCH 556/681] MOD: add comments to a53 zgemm kernel

---
 kernel/arm64/zgemm_kernel_4x4_cortexa53.c | 113 +++++++++++++++++++++-
 1 file changed, 112 insertions(+), 1 deletion(-)

diff --git a/kernel/arm64/zgemm_kernel_4x4_cortexa53.c b/kernel/arm64/zgemm_kernel_4x4_cortexa53.c
index 4cdf85aa6..aa0f7d72d 100644
--- a/kernel/arm64/zgemm_kernel_4x4_cortexa53.c
+++ b/kernel/arm64/zgemm_kernel_4x4_cortexa53.c
@@ -25,9 +25,120 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#include <common.h>
+#include "common.h"
 #include <arm_neon.h>
+
+/*******************************************************************************
+  The complex GEMM kernels in OpenBLAS use static configuration of conjugation
+modes via specific macros:
+
+   MACRO_NAME  | conjugation on matrix A | conjugation on matrix B |
+  ------------ | ----------------------- | ----------------------- |
+  NN/NT/TN/TT  |            No           |            No           |
+  NR/NC/TR/TC  |            No           |            Yes          |
+  RN/RT/CN/CT  |            Yes          |            No           |
+  RR/RC/CR/CC  |            Yes          |            Yes          |
+
+  "conjugation on matrix A" means the complex conjugates of elements from
+matrix A are used for matmul (rather than the original elements); "conjugation
+on matrix B" means the complex conjugate of each element from matrix B is
+taken for the matrix multiplication.
+
+  Complex numbers in arrays or matrices are usually packed together as an
+array of struct (without padding):
+    struct complex_number {
+      FLOAT real_part;
+      FLOAT imag_part;
+    };
+
+  For a double complex array ARR[] which is usually DEFINED AS AN ARRAY OF
+DOUBLE, the real part of its Kth complex number can be accessed as
+ARR[K * 2], and the imaginary part as ARR[2 * K + 1].
+
+  This file uses 2 ways to vectorize matrix multiplication of complex numbers:
+
+(1) Expanded-form
+
+    During accumulation along direction K:
+
+                                            Σk(a[0][k].real b[k][n].real)
+                        accumulate          Σk(a[0][k].imag b[k][n].real)
+                    ------------------->                  .
+                    |  * b[k][n].real                     .
+                    |   (broadcasted)                     .
+      a[0][k].real                          Σk(a[v-1][k].real b[k][n].real)
+      a[0][k].imag                          Σk(a[v-1][k].imag b[k][n].real)
+           .                                           VECTOR I
+ (vec_a)   .
+           .
+     a[v-1][k].real                         Σk(a[0][k].real b[k][n].imag)
+     a[v-1][k].imag                         Σk(a[0][k].imag b[k][n].imag)
+                    |                                     .
+                    |     accumulate                      .
+                    ------------------->                  .
+                       * b[k][n].imag       Σk(a[v-1][k].real b[k][n].imag)
+                        (broadcasted)       Σk(a[v-1][k].imag b[k][n].imag)
+                                                       VECTOR II
+
+    After accumulation, prior to storage:
+
+              -1                           -Σk(a[0][k].imag b[k][n].imag)
+               1                            Σk(a[0][k].real b[k][n].imag)
+               .                                          .
+   VECTOR II  permute and multiply  .  to get             .
+               .                                          .
+              -1                           -Σk(a[v-1][k].imag b[k][n].imag)
+               1                            Σk(a[v-1][k].real b[k][n].imag)
+
+  then add with VECTOR I to get the result vector of elements of C.
+
+  2 vector registers are needed for every v elements of C, with
+v == sizeof(vector) / sizeof(complex)
+
+(2) Contracted-form
+
+    During accumulation along direction K:
+
+    (the K coordinate is not shown, since the operation is identical for each k)
+
+        (load vector in mem)                    (load vector in mem)
+  a[0].r a[0].i ... a[v-1].r a[v-1].i    a[v].r a[v].i ... a[2v-1].r a[2v-1].i
+                  |                                       |
+                  |   unzip operation (or VLD2 in arm neon)   |
+                  -----------------------------------------------------
+                                      |
+                                      |
+                   --------------------------------------------------
+                   |                                                |
+                   |                                                |
+                   v                                                v
+    a[0].real ... a[2v-1].real                 a[0].imag ... a[2v-1].imag
+         |         |                                |          |
+         |         |     * b[i].imag(broadcast)     |          |
+ * b[i].real       |   -----------------------------|----      |   * b[i].real
+  (broadcast)      |   |                            |   |      |    (broadcast)
+         |         ------------------------------       |      |
+         +  |                                       -  |    * b[i].imag(broadcast)
+         |
+         |
+         v         v                                v          v
+    (accumulate)                               (accumulate)
+    c[0].real ... c[2v-1].real                 c[0].imag ... c[2v-1].imag
+       VECTOR_REAL                                VECTOR_IMAG
+
+  After accumulation, VECTOR_REAL and VECTOR_IMAG are zipped (interleaved)
+then stored to matrix C directly.
+
+  For 2v elements of C, only 2 vector registers are needed, while
+4 registers are required for expanded-form.
+(v == sizeof(vector) / sizeof(complex))
+
+  For AArch64 zgemm, the 4x4 kernel would need 32 128-bit NEON registers
+to store elements of C with expanded-form calculation, so register
+spilling would occur. Contracted-form operation is therefore selected
+for the 4x4 kernel. For all other combinations of unroll parameters
+(2x4, 4x2, 2x2, and so on), expanded-form mode is used to bring more
+NEON registers into use to hide the latency of multiply-add instructions.
+******************************************************************************/
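(As an editorial illustration, not part of the patch: the packed layout and the
contracted-form update can be sketched in plain C with NEON intrinsics. The
function and variable names below are invented; NN mode with no conjugation is
assumed, with v = 1 double complex per 128-bit vector, so one
VECTOR_REAL/VECTOR_IMAG pair covers 2v = 2 elements of C.)

    #include <arm_neon.h>

    /* One k-step of the contracted-form update: `a` points at two packed
       complex elements of A (real/imag interleaved), `b` at one packed
       complex element of B.  c_re/c_im are the deinterleaved accumulators
       (VECTOR_REAL / VECTOR_IMAG above). */
    static void contracted_step(const double *a, const double *b,
                                float64x2_t *c_re, float64x2_t *c_im)
    {
        /* unzip (VLD2): av.val[0] = {a0.real, a1.real},
                         av.val[1] = {a0.imag, a1.imag} */
        float64x2x2_t av = vld2q_f64(a);
        double b_re = b[0], b_im = b[1];   /* the ARR[2K] / ARR[2K+1] layout */

        /* c.real += a.real*b.real - a.imag*b.imag */
        *c_re = vfmaq_n_f64(*c_re, av.val[0], b_re);
        *c_re = vfmsq_n_f64(*c_re, av.val[1], b_im);

        /* c.imag += a.real*b.imag + a.imag*b.real */
        *c_im = vfmaq_n_f64(*c_im, av.val[0], b_im);
        *c_im = vfmaq_n_f64(*c_im, av.val[1], b_re);
    }

After the K loop, a VST2 (zip) of the two accumulators writes the packed
real/imag layout back to C, which is why only two registers per 2v elements of
C are needed here, against four in expanded form.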
+
 static inline float64x2_t set_f64x2(double lo, double hi) {
   float64x2_t ret = vdupq_n_f64(0);
   ret = vsetq_lane_f64(lo, ret, 0);
   ret = vsetq_lane_f64(hi, ret, 1);
   return ret;
 }

From c3b1e55bdcd1602142d34f838bb35fa5c231b749 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Fri, 26 Nov 2021 09:38:41 +0100
Subject: [PATCH 557/681] AzureCI: Fetch alpine-chroot-install from master to
 get key updates (#3460)

* Fetch alpine-chroot-install from master to get key updates
---
 azure-pipelines.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 286a620ba..29e3ca586 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -249,8 +249,8 @@ jobs:
       vmImage: 'ubuntu-latest'
     steps:
     - script: |
-        wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \
-        && echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74  alpine-chroot-install' | sha1sum -c \
         || exit 1
+        wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/master/alpine-chroot-install \
+        && echo '484fbf313f311da93e913bfdd81ef4df934aa907  alpine-chroot-install' | sha1sum -c \
         || exit 1
         alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
         sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'

From 1af73ce38e75863c06d434b8f3bd2105df9143b1 Mon Sep 17 00:00:00 2001
From: Bine Brank
Date: Fri, 26 Nov 2021 10:35:01 +0100
Subject: [PATCH 558/681] Adapt CMake for SVE

---
 cmake/cc.cmake        | 18 ++++++++++++++++++
 kernel/CMakeLists.txt | 38 ++++++++++++++++++++++++++++----------
 2 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/cmake/cc.cmake b/cmake/cc.cmake
index 0ab1d4c1b..153cdce61 100644
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@@ -139,6 +139,24 @@ if (${CORE} STREQUAL SAPPHIRERAPIDS)
   endif ()
 endif ()
 
+if (${CORE} STREQUAL A64FX)
+  if (NOT DYNAMIC_ARCH)
+    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+    if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
+      set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx")
+    else ()
+      set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
+    endif()
+  endif ()
+endif ()
+
+if (${CORE} STREQUAL ARMV8SVE)
+  if (NOT DYNAMIC_ARCH)
+    set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
+  endif ()
+endif ()
+
+
 if (NOT DYNAMIC_ARCH)
   if (HAVE_AVX2)
     set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2")
diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt
index 9c8460723..80c7dcd8b 100644
--- a/kernel/CMakeLists.txt
+++ b/kernel/CMakeLists.txt
@@ -418,32 +418,50 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
     GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type})
 
     # symm for s and d
GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type}) # These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. Also TRANS is not passed in as a define. # Could simplify it a bit by pairing up by -UUNIT/-DUNIT. - GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) +if (NOT DEFINED ${float_char}TRMMUNCOPY_M) + set(TRMMUNCOPY_M "generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLNCOPY_M "generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMUTCOPY_M "generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLTCOPY_M "generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}") + set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}") + set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}") + set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}") +endif () + GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) - 
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) From ca65a4e91d945f6df8fdbe3cca55af943725653e Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Fri, 26 Nov 2021 13:11:19 +0100 Subject: [PATCH 559/681] update CONTRIBUTORS.md --- CONTRIBUTORS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 6be41960c..39ec96246 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -197,3 +197,7 @@ In chronological order: * River Dillon * [2021-07-10] fix compilation with musl libc + +* Bine Brank + * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE + * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM From fbfc8b1b83010e380d09dfa292c9b03b4c50439d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 26 Nov 2021 13:39:49 +0100 Subject: [PATCH 560/681] Update alpine-chroot-install again --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 29e3ca586..c3a07ffe5 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -249,9 +249,9 @@ jobs: vmImage: 'ubuntu-latest' steps: - script: | - wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/master/alpine-chroot-install \ - && echo '484fbf313f311da93e913bfdd81ef4df934aa907 alpine-chroot-install' | sha1sum -c \ - || exit 1 + wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \ + && echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \ + || exit 1 alpine() { /alpine/enter-chroot -u "$USER" "$@"; } sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' alpine make DYNAMIC_ARCH=1 BINARY=64 From 86ae89bf33f28780ccaa1044376c94401545b806 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 28 Nov 2021 18:12:47 +0100 Subject: [PATCH 561/681] add sgemm kernel and copy functions for sgemm and ssymm --- kernel/Makefile.L3 | 10 + kernel/arm64/KERNEL.A64FX | 34 +- kernel/arm64/sgemm_kernel_sve_v1x8.S | 874 +++++++++++++++++++++++++++ kernel/arm64/sgemm_ncopy_sve_v1.c | 78 +++ kernel/arm64/sgemm_tcopy_sve_v1.c | 77 +++ kernel/arm64/symm_lcopy_sve.c | 50 ++ kernel/arm64/symm_ucopy_sve.c | 50 ++ param.h | 4 +- 8 files changed, 1151 insertions(+), 26 deletions(-) create mode 100644 kernel/arm64/sgemm_kernel_sve_v1x8.S create mode 100644 kernel/arm64/sgemm_ncopy_sve_v1.c create mode 100644 kernel/arm64/sgemm_tcopy_sve_v1.c diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 695f8ae70..593e33dde 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1809,11 +1809,21 @@ $(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N). 
$(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef SSYMMUCOPY_M +$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef SSYMMLCOPY_M +$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 83536f12d..ee66fea8e 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -114,35 +114,21 @@ DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S SGEMM_BETA = sgemm_beta.S -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -ifeq ($(SGEMM_UNROLL_M), 16) -SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S -else -SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -endif -ifeq ($(SGEMM_UNROLL_M), 4) -SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S -else -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c -endif +SGEMMKERNEL = sgemm_kernel_sve_v1x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_8x$(SGEMM_UNROLL_N).S + +SGEMMINCOPY = sgemm_ncopy_sve_v1.c +SGEMMITCOPY = sgemm_tcopy_sve_v1.c +SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S + SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ifeq ($(SGEMM_UNROLL_N), 16) -SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S -else -SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -endif -ifeq ($(SGEMM_UNROLL_N), 4) -SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S -else -SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c -endif SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +SSYMMUCOPY_M = symm_ucopy_sve.c +SSYMMLCOPY_M = symm_lcopy_sve.c DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S diff --git a/kernel/arm64/sgemm_kernel_sve_v1x8.S b/kernel/arm64/sgemm_kernel_sve_v1x8.S new file mode 100644 index 000000000..88c74bc0f --- /dev/null +++ b/kernel/arm64/sgemm_kernel_sve_v1x8.S @@ -0,0 +1,874 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA x16 +#define alpha w17 + +#define alpha0 s10 +#define alphaZ z2.s + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x8_I + ld1w z0.s, p1/z, [pA] + ld1w z1.s, p1/z, [pA, lanes, lsl #2] // next one + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, 
z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M1 + ld1w z1.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M2 + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + fmla z16.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_E + fmla z16.s, p1/m, z1.s, z8.s + fmla z17.s, p1/m, z1.s, z9.s + fmla z18.s, p1/m, z1.s, z10.s + fmla z19.s, p1/m, z1.s, z11.s + fmla z20.s, p1/m, z1.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + fmla z22.s, p1/m, z1.s, z14.s + fmla z23.s, p1/m, z1.s, z15.s +.endm + +.macro KERNELv1x8_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + fmla z18.s, p1/m, z0.s, z10.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z19.s, p1/m, z0.s, z11.s + fmla z20.s, p1/m, z0.s, z12.s + fmla z21.s, p1/m, z0.s, z13.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.s, p1/m, z0.s, z14.s + fmla z23.s, p1/m, z0.s, z15.s + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, 
p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z28.s, p1/z, [pCRow2] + fmla z28.s, p1/m, z20.s, alphaZ + st1w z28.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z29.s, p1/z, [pCRow1] + fmla z29.s, p1/m, z21.s, alphaZ + st1w z29.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z30.s, p1/z, [pCRow2] + fmla z30.s, p1/m, z22.s, alphaZ + st1w z30.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z31.s, p1/z, [pCRow1] + fmla z31.s, p1/m, z23.s, alphaZ + st1w z31.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x4_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z18.s, p1/m, z0.s, z10.s + fmla z19.s, p1/m, z0.s, z11.s + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z17.s, p1/m, z0.s, z9.s + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.s, #0 +.endm + +.macro KERNELv1x1_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 8 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE 
+ + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 + dup alphaZ, alpha + + lsl LDC, LDC, #2 // ldc = ldc * 4 + ptrue p0.s // create true predicate + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Ldgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L8_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Ldgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv1_22a + + .align 5 +.Ldgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv1_22 + + .align 5 +.Ldgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldgemm_kernel_L8_Mv1_44 + + .align 5 +.Ldgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldgemm_kernel_L8_Mv1_44 + +.Ldgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv1_100 + + .align 5 +.Ldgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv1_46 + +.Ldgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Ldgemm_kernel_L8_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Ldgemm_kernel_L8_Mv1_20 + +.Ldgemm_kernel_L8_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 8 * 4 + + subs counterJ, counterJ , #1 // j-- + bgt .Ldgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Ldgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt 
p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Ldgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L4_Mv1_44 + + .align 5 +.Ldgemm_kernel_L4_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv1_22 + +.Ldgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv1_100 + + .align 5 +.Ldgemm_kernel_L4_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv1_46 + +.Ldgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Ldgemm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Ldgemm_kernel_L4_Mv1_20 + + +.Ldgemm_kernel_L4_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 4 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Ldgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Ldgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L2_Mv1_44 + + .align 5 +.Ldgemm_kernel_L2_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv1_22 + +.Ldgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv1_100 + + .align 5 +.Ldgemm_kernel_L2_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv1_46 + +.Ldgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Ldgemm_kernel_L2_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Ldgemm_kernel_L2_Mv1_20 + + +.Ldgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Ldgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Ldgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
+ ble .Ldgemm_kernel_L1_Mv1_44 + + .align 5 +.Ldgemm_kernel_L1_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_22 + +.Ldgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv1_100 + + .align 5 +.Ldgemm_kernel_L1_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_46 + +.Ldgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Ldgemm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Ldgemm_kernel_L1_Mv1_20 + + +.Ldgemm_kernel_L1_END: + +/******************************************************************************/ + +.Ldgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_ncopy_sve_v1.c b/kernel/arm64/sgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..1bc186335 --- /dev/null +++ b/kernel/arm64/sgemm_ncopy_sve_v1.c @@ -0,0 +1,78 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <arm_sve.h>
+
+// TODO: write in assembly with proper unrolling of inner loop
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+
+  BLASLONG j;
+  IFLOAT *aoffset, *aoffset1, *boffset;
+
+  svint32_t lda_vec = svindex_s32(0LL, lda);
+  uint32_t sve_size = svcntw();
+
+  aoffset = a;
+  boffset = b;
+
+  j = 0;
+  svbool_t pg = svwhilelt_b32(j, n);
+  uint32_t active = svcntp_b32(svptrue_b32(), pg);
+  do {
+
+    aoffset1 = aoffset;
+
+    uint32_t i_cnt = m;
+    while (i_cnt--) {
+      svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec);
+      svst1_f32(pg, (float *) boffset, a_vec);
+      aoffset1++;
+      boffset += active;
+    }
+    aoffset += sve_size * lda;
+
+    j += svcntw();
+    pg = svwhilelt_b32(j, n);
+    active = svcntp_b32(svptrue_b32(), pg);
+
+  } while (svptest_any(svptrue_b32(), pg));
+
+  return 0;
+}
diff --git a/kernel/arm64/sgemm_tcopy_sve_v1.c b/kernel/arm64/sgemm_tcopy_sve_v1.c
new file mode 100644
index 000000000..9f8cf502a
--- /dev/null
+++ b/kernel/arm64/sgemm_tcopy_sve_v1.c
@@ -0,0 +1,77 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
+/* POSSIBILITY OF SUCH DAMAGE.                                       */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <arm_sve.h>
+
+// TODO: write in assembly with proper unrolling of inner loop
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+
+  BLASLONG j;
+  IFLOAT *aoffset, *aoffset1, *boffset;
+
+  uint32_t sve_size = svcntw();
+
+  aoffset = a;
+  boffset = b;
+
+  j = 0;
+  svbool_t pg = svwhilelt_b32(j, n);
+  uint32_t active = svcntp_b32(svptrue_b32(), pg);
+  do {
+
+    aoffset1 = aoffset;
+
+    uint32_t i_cnt = m;
+    while (i_cnt--) {
+      svfloat32_t a_vec = svld1(pg, (float *) aoffset1);
+      svst1_f32(pg, (float *) boffset, a_vec);
+      aoffset1 += lda;
+      boffset += active;
+    }
+    aoffset += sve_size;
+
+    j += svcntw();
+    pg = svwhilelt_b32(j, n);
+    active = svcntp_b32(svptrue_b32(), pg);
+
+  } while (svptest_any(svptrue_b32(), pg));
+
+  return 0;
+}
diff --git a/kernel/arm64/symm_lcopy_sve.c b/kernel/arm64/symm_lcopy_sve.c
index 94a68ad7c..6ba4afc8b 100644
--- a/kernel/arm64/symm_lcopy_sve.c
+++ b/kernel/arm64/symm_lcopy_sve.c
@@ -44,6 +44,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
   BLASLONG i, offset;
 
+#if defined(DOUBLE)
   uint64_t sve_size = svcntd();
   svint64_t posY_vec = svdup_s64(posY);
   svint64_t posX_vec = svdup_s64(posX);
@@ -89,5 +90,54 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));
 
+#else
+  uint32_t sve_size = svcntw();
+  svint32_t posY_vec = svdup_s32(posY);
+  svint32_t posX_vec = svdup_s32(posX);
+  svint32_t lda_vec = svdup_s32(lda);
+  svint32_t one_vec = svdup_s32(1);
+
+  int32_t N = n;
+  int32_t j = 0;
+  svbool_t pg = svwhilelt_b32(j, N);
+  int32_t active = svcntp_b32(svptrue_b32(), pg);
+  svint32_t index_neg = svindex_s32(0, -1);
+  svint32_t index = svindex_s32(0, 1);
+  do {
+    offset = posX - posY;
+    svint32_t vec_off = svdup_s32(offset);
+    svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
+
+    svint32_t temp = svadd_z(pg, posX_vec, index);
+    svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
+    svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda);
+    svint32_t gat_ind = svsel(cmp, temp1, temp2);
+
+    i = m;
+    while (i>0) {
+      svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind);
+
+      gat_ind = svadd_m(cmp, gat_ind, lda_vec);
+      gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec);
+
+      svst1(pg, b, data_vec);
+
+      b += active;
+      offset --;
+      vec_off = svsub_z(pg, vec_off, one_vec);
+      cmp = svcmpgt(pg, vec_off, index_neg);
+
+      i--;
+    }
+
+    posX += sve_size;
+    posX_vec = svdup_s32(posX);
+    j += sve_size;
+    pg = svwhilelt_b32(j, N);
+    active = svcntp_b32(svptrue_b32(), pg);
+  } while (svptest_any(svptrue_b32(), pg));
+
+#endif
+
   return 0;
 }
diff --git a/kernel/arm64/symm_ucopy_sve.c b/kernel/arm64/symm_ucopy_sve.c
index 3cf18e0fd..32da5bd16 100644
--- a/kernel/arm64/symm_ucopy_sve.c
+++ b/kernel/arm64/symm_ucopy_sve.c
@@ -44,6 +44,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
   BLASLONG i, offset;
 
+#if defined(DOUBLE)
   uint64_t sve_size = svcntd();
   svint64_t posY_vec = svdup_s64(posY);
   svint64_t posX_vec = svdup_s64(posX);
@@ -89,5 +90,54 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));
 
+#else
+  uint32_t sve_size = svcntw();
+  svint32_t posY_vec = svdup_s32(posY);
+  svint32_t posX_vec = svdup_s32(posX);
+  svint32_t lda_vec = svdup_s32(lda);
+  svint32_t one_vec = svdup_s32(1);
+
+  int32_t N = n;
+  int32_t j = 0;
+  svbool_t pg = svwhilelt_b32(j, N);
+  int32_t active = svcntp_b32(svptrue_b32(), pg);
+  svint32_t index_neg = svindex_s32(0, -1);
+  svint32_t index = svindex_s32(0, 1);
+  do {
+    offset = posX - posY;
+    svint32_t vec_off = svdup_s32(offset);
+    svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
+
+    svint32_t temp = svadd_z(pg, posX_vec, index);
+    svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec);
+    svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda);
+    svint32_t gat_ind = svsel(cmp, temp2, temp1);
+
+    i = m;
+    while (i>0) {
+      svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind);
+
+      gat_ind = svadd_m(cmp, gat_ind, one_vec);
+      gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
+
+      svst1(pg, b, data_vec);
+
+      b += active;
+      offset --;
+      vec_off = svsub_z(pg, vec_off, one_vec);
+      cmp = svcmpgt(pg, vec_off, index_neg);
+
+      i--;
+    }
+
+    posX += sve_size;
+    posX_vec = svdup_s32(posX);
+    j += sve_size;
+    pg = svwhilelt_b32(j, N);
+    active = svcntp_b32(svptrue_b32(), pg);
+  } while (svptest_any(svptrue_b32(), pg));
+
+#endif
+
   return 0;
 }
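(The gather-index arithmetic in these two symm copy routines is dense. As a
rough scalar sketch, an editorial illustration rather than code from the
patch, the lower-triangle version packs a panel of the full symmetric matrix
while reading only stored elements; FLOAT and BLASLONG are OpenBLAS typedefs
from common.h, and `width` stands in for the number of active SVE lanes:)

    /* Value of symmetric A at (row, col) when only the lower triangle
       (row >= col) is stored, column-major with leading dimension lda. */
    static FLOAT sym_lower(const FLOAT *a, BLASLONG lda,
                           BLASLONG row, BLASLONG col)
    {
        return (col > row) ? a[col + row * lda]  /* above diagonal: mirror  */
                           : a[row + col * lda]; /* on/below diagonal: direct */
    }

    /* Scalar equivalent of one SVE pass: columns posX..posX+width-1 are the
       vector lanes, rows posY..posY+m-1 are the inner `while (i>0)` loop.
       The svsel/svadd_m index updates above walk the same two index
       formulas, switching at the diagonal as `offset` counts down. */
    static void symm_lcopy_scalar(BLASLONG m, BLASLONG width, BLASLONG lda,
                                  BLASLONG posX, BLASLONG posY,
                                  const FLOAT *a, FLOAT *b)
    {
        for (BLASLONG i = 0; i < m; i++)
            for (BLASLONG j = 0; j < width; j++)
                *b++ = sym_lower(a, lda, posY + i, posX + j);
    }

The upper-triangle routine is the same with the triangle test reversed, which
is why its svsel arguments and index increments are swapped.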
diff --git a/param.h b/param.h
index c1dff1367..e9419bd9d 100644
--- a/param.h
+++ b/param.h
@@ -3296,8 +3296,8 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 
 #elif defined(ARMV8SVE) || defined(A64FX)
 
-#define SGEMM_DEFAULT_UNROLL_M  16
-#define SGEMM_DEFAULT_UNROLL_N  4
+#define SGEMM_DEFAULT_UNROLL_M  4
+#define SGEMM_DEFAULT_UNROLL_N  8
 
 /* When all BLAS3 routines are implemented with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl".
 Until then, just keep it different from DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */

From c78fdcc80d86659ccf703fd0f74e962d22efde9b Mon Sep 17 00:00:00 2001
From: Rafael Cardoso Fernandes Sousa
Date: Tue, 16 Nov 2021 14:47:41 -0600
Subject: [PATCH 562/681] [POWER] Add support for SMALL_MATRIX_OPT

---
 Makefile.system                                 |    2 +
 kernel/power/KERNEL.POWER10                     |   20 +
 kernel/power/dgemm_small_kernel_nn_power10.c    |  923 +++++++++
 kernel/power/dgemm_small_kernel_nt_power10.c    |  581 ++++++
 kernel/power/dgemm_small_kernel_tn_power10.c    |  882 +++++++++
 kernel/power/dgemm_small_kernel_tt_power10.c    |  829 ++++++++
 kernel/power/gemm_small_kernel_permit_power10.c |   84 +
 kernel/power/sgemm_small_kernel_nn_power10.c    | 1563 +++++++++++++++
 kernel/power/sgemm_small_kernel_nt_power10.c    |  887 +++++++++
 kernel/power/sgemm_small_kernel_tn_power10.c    | 1678 +++++++++++++++++
 kernel/power/sgemm_small_kernel_tt_power10.c    | 1559 +++++++++++++++
 11 files changed, 9008 insertions(+)
 create mode 100644 kernel/power/dgemm_small_kernel_nn_power10.c
 create mode 100644 kernel/power/dgemm_small_kernel_nt_power10.c
 create mode 100644 kernel/power/dgemm_small_kernel_tn_power10.c
 create mode 100644 kernel/power/dgemm_small_kernel_tt_power10.c
 create mode 100644 kernel/power/gemm_small_kernel_permit_power10.c
 create mode 100644 kernel/power/sgemm_small_kernel_nn_power10.c
 create mode 100644 kernel/power/sgemm_small_kernel_nt_power10.c
 create mode 100644 kernel/power/sgemm_small_kernel_tn_power10.c
 create mode 100644 kernel/power/sgemm_small_kernel_tt_power10.c

diff --git a/Makefile.system b/Makefile.system
index 3b55fb104..96302ff0e 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -258,6 +258,8 @@ endif
 #For small matrix optimization
 ifeq ($(ARCH), x86_64)
 SMALL_MATRIX_OPT = 1
+else ifeq ($(CORE), POWER10)
+SMALL_MATRIX_OPT = 1
 endif
 ifeq ($(SMALL_MATRIX_OPT), 1)
 CCOMMON_OPT += -DSMALL_MATRIX_OPT
diff --git a/kernel/power/KERNEL.POWER10
b/kernel/power/KERNEL.POWER10 index 63816cb5f..79d889fe0 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -32,6 +32,16 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +SGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c +SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_power10.c +SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_nn_power10.c +SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_power10.c +SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_nt_power10.c +SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_power10.c +SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_tn_power10.c +SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_power10.c +SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_tt_power10.c + DGEMMKERNEL = dgemm_kernel_power10.c DGEMMINCOPY = DGEMMITCOPY = @@ -42,6 +52,16 @@ DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c +DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_power10.c +DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_nt_power10.c +DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_power10.c +DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_power10.c +DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_power10.c +DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c +DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c +DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c + CGEMMKERNEL = cgemm_kernel_power10.S #CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c diff --git a/kernel/power/dgemm_small_kernel_nn_power10.c b/kernel/power/dgemm_small_kernel_nn_power10.c new file mode 100644 index 000000000..ecdc3e5c6 --- /dev/null +++ b/kernel/power/dgemm_small_kernel_nn_power10.c @@ -0,0 +1,923 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !defined(B0) +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_2x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#else + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], 
valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_2x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#if (defined(__GNUC__) && (__GNUC__ == 10)) +#if defined(_AIX) +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); +#else +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0); +#endif +#else +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1); +#endif + +#define LOAD_A_1x8(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); \ + ra1 = vec_xl(0, A+((K)*lda)+M+2); \ + ra2 = vec_xl(0, A+((K)*lda)+M+4); \ + ra3 = vec_xl(0, A+((K)*lda)+M+6); + +#define LOAD_A_1x4(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); \ + ra1 = vec_xl(0, A+((K)*lda)+M+2); \ + +#define LOAD_A_1x2(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); + +#define LOAD_A_1x1(K, M) \ + ra0 = vec_splats(A[((K)*lda)+M+0]); + +#define LOAD_BTP_8x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + LOAD_PAIR(pb0, t0, t1); \ + t0 = vec_mergel(rb0, rb1); \ + t1 = vec_mergel(rb2, rb3); \ + LOAD_PAIR(pb2, t0, t1); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergeh(rb6, rb7); \ + LOAD_PAIR(pb1, t0, t1); \ + t0 = vec_mergel(rb4, rb5); \ + t1 = vec_mergel(rb6, rb7); \ + LOAD_PAIR(pb3, t0, t1); + +#define LOAD_BTP_8x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb0, rb0, rb1); \ + rb2 = vec_xor(rb2, rb2); \ + rb2 = vec_insert(B[(N+4)*ldb+K], rb2, 0); \ + rb2 = vec_insert(B[(N+5)*ldb+K], rb2, 1); \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(B[(N+6)*ldb+K], rb3, 0); \ + rb3 = vec_insert(B[(N+7)*ldb+K], rb3, 1); \ + LOAD_PAIR(pb1, rb2, rb3); + +#define LOAD_BTP_4x2(N, K) \ + rb0 = vec_xl(0, 
B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + LOAD_PAIR(pb0, t0, t1); \ + t0 = vec_mergel(rb0, rb1); \ + t1 = vec_mergel(rb2, rb3); \ + LOAD_PAIR(pb1, t0, t1); + +#define LOAD_BTP_4x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb0, rb0, rb1); + +#define LOAD_BTP_2x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \ + t1 = vec_mergel(rb0, rb1); \ + __builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1); + +#define LOAD_BTP_2x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0); + +#define LOAD_B_1x1(N, K) \ + rb0 = vec_splats(B[((N)*ldb)+K]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ + __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ + __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ + __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ + __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_B(pb0, pb1, offset) \ + *((__vector_pair *)(void *)(packB+(k*8)+0+offset)) = pb0; \ + *((__vector_pair *)(void *)(packB+(k*8)+4+offset)) = pb1; + +#define LOAD_PACKED_B(pb0, pb1, offset) \ + pb0 = *((__vector_pair *)((void *)(packB+(k*8)+0+offset))); \ + pb1 = *((__vector_pair *)((void *)(packB+(k*8)+4+offset))); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k2 = K & ~1; + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K 
>= 32) ? 1 : 0; +#else + int has_packing = 0; +#endif + + double *packB = NULL; + if (has_packing) { + packB = (double *)malloc(K*8*sizeof(double)); + if (packB == NULL) has_packing = 0; /* fall back to the unpacked path on allocation failure */ + } + + vector double valpha = vec_splats(alpha); +#if !defined(B0) + vector double vbeta = vec_splats(beta); +#endif + + for (n = 0; n < n8; n += 8) { + for (m = 0; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + if (has_packing) { + if (m == 0) { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_B(pb0, pb1, 0); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_B(pb2, pb3, 8); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_B(pb0, pb1, 0); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x8(k+1, m); + LOAD_PACKED_B(pb2, pb3, 8); + KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc4, n+0, m+4); + SAVE_4x2_ACC(&acc6, n+0, m+6); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + SAVE_4x2_ACC(&acc5, n+4, m+4); + SAVE_4x2_ACC(&acc7, n+4, m+6); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + LOAD_A_1x4(k+1, m); + LOAD_PACKED_B(pb2, pb3, 8); + KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0,
ra1, ra1); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + } + + for (; m < m2; m += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x2(k+1, m); + LOAD_PACKED_B(pb2, pb3, 8); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; m < M; m++) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x1(k+1, m); + LOAD_PACKED_B(pb2, pb3, 8); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x1_ACC(&acc0, n+0, m+0); + SAVE_4x1_ACC(&acc1, n+4, m+0); + } + } + + for (; n < n4; n += 4) { + for (m = 0; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc3, n+0, m+6); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + LOAD_A_1x4(k+1, 
m); + KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + } + + for (; m < m2; m += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; m < M; m++) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x1_ACC(&acc0, n, m); + } + } + + for (; n < n2; n += 2) { + for (m = 0; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + SAVE_2x2_ACC(&acc2, n+0, m+4); + SAVE_2x2_ACC(&acc3, n+0, m+6); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + } + + for (; m < m2; m += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + } + + for (; m < M; m++) 
{ + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x1_ACC(&acc0, n+0, m+0); + } + } + + for (; n < N; n++) { + for (m = 0; m < m8; m += 8) { + vector double result = ((vector double){0.,0.}); + vector double result1 = ((vector double){0.,0.}); + vector double result2 = ((vector double){0.,0.}); + vector double result3 = ((vector double){0.,0.}); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + SAVE_1x4_VSR(result2, n, m+4); + SAVE_1x4_VSR(result3, n, m+6); + } + + for (; m < m4; m += 4) { + vector double result = ((vector double){0.,0.}); + vector double result1 = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + } + + for (; m < m2; m += 2) { + vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + } + + for (; m < M; m++) { + FLOAT result = 0.0; + + for (k = 0; k < K; k++) { + result += A[m+k*lda] * B[n*ldb+k]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free(packB); + + return 0; +} diff --git a/kernel/power/dgemm_small_kernel_nt_power10.c b/kernel/power/dgemm_small_kernel_nt_power10.c new file mode 100644 index 000000000..7cc8c9f6c --- /dev/null +++ b/kernel/power/dgemm_small_kernel_nt_power10.c @@ -0,0 +1,581 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !defined(B0) +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + 
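The save macros above, like their counterparts in the other three new files, are the tail end of one fixed POWER10 MMA pattern: zero a __vector_quad accumulator, stream rank-1 xvf64gerpp updates through it (the __vector_pair operand carries four doubles taken from B, the VSR operand two doubles taken from A), then disassemble the accumulator into four VSRs and apply the alpha scaling (plus the beta*C term unless B0 is defined). A minimal, self-contained sketch of that lifecycle for a single 4x2 tile of column-major C follows; the function name dgemm_4x2_tile, the packed operand layout (four B values and two A values per k step), and building with -mcpu=power10 are illustrative assumptions, not part of this patch.

#include <altivec.h>

typedef __vector unsigned char vec_t;

/* One 4x2 tile of column-major C: after disassembly, res[j] holds rows
 * m..m+1 of column j. Bp is assumed packed K x 4, Ap packed K x 2
 * (hypothetical layout for the sketch). */
static void dgemm_4x2_tile(const double *Ap, const double *Bp,
                           double *C, long ldc, long K)
{
  __vector_quad acc;
  __builtin_mma_xxsetaccz(&acc);                     /* acc = 0 */
  for (long k = 0; k < K; k++) {
    __vector_pair pb = *(__vector_pair *)(void *)&Bp[k * 4];
    vector double ra = vec_xl(0, Ap + k * 2);
    __builtin_mma_xvf64gerpp(&acc, pb, (vec_t)ra);   /* rank-1 update: acc += b outer a */
  }
  vector double res[4];
  __builtin_mma_disassemble_acc((void *)res, &acc);
  for (int j = 0; j < 4; j++)                        /* store with alpha = 1, beta = 0 */
    vec_xst(res[j], 0, C + j * ldc);
}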
+#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_A_1x8(K, M) \ + ra0 = vec_xl(0, A+(K*lda)+M+0); \ + ra1 = vec_xl(0, A+(K*lda)+M+2); \ + ra2 = vec_xl(0, A+(K*lda)+M+4); \ + ra3 = vec_xl(0, A+(K*lda)+M+6); + +#define LOAD_A_1x4(K, M) \ + ra0 = vec_xl(0, A+(K*lda)+M+0); \ + ra1 = vec_xl(0, A+(K*lda)+M+2); + +#define LOAD_A_1x2(K, M) ra0 = vec_xl(0, A+(K*lda)+M); + +#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M]); + +#define LOAD_BP_1x8(K, N) \ + pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \ + pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4])); + +#define LOAD_BP_1x4(K, N) \ + pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); + +#define LOAD_BP_1x2(K, N) \ + t0 = vec_xl(0, B+(K*ldb)+N); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); + +#define LOAD_B_1x8(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+2); \ + rb2 = vec_xl(0, B+(K*ldb)+N+4); \ + rb3 = vec_xl(0, B+(K*ldb)+N+6); \ + +#define LOAD_B_1x4(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+2); + +#define LOAD_B_1x2(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); + +#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ + __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ + __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ + __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ + __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG 
m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + vector double valpha = vec_splats(alpha); +#if !defined(B0) + vector double vbeta = vec_splats(beta); +#endif + + for (m = 0; m < m8; m += 8) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector double ra0, ra1, ra2, ra3; + __vector_pair pb0, pb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc4, n+0, m+4); + SAVE_4x2_ACC(&acc6, n+0, m+6); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + SAVE_4x2_ACC(&acc5, n+4, m+4); + SAVE_4x2_ACC(&acc7, n+4, m+6); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc3, n+0, m+6); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double t0; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + SAVE_2x2_ACC(&acc2, n+0, m+4); + SAVE_2x2_ACC(&acc3, n+0, m+6); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + SAVE_1x4_VSR(result2, n, m+4); + SAVE_1x4_VSR(result3, n, m+6); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1; + __vector_pair pb0, pb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BP_1x8(k, n); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BP_1x4(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double 
rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double t0; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BP_1x2(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0; + __vector_pair pb0, pb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BP_1x8(k, n); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BP_1x4(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double t0; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BP_1x2(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + } + } + + for (; m < M; m++) { + for (n = 0; n < n8; n += 8) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x8(k, n); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+2, m); + SAVE_4x1_VSR(result2, n+4, m); + SAVE_4x1_VSR(result3, n+6, m); + } + + for (; n < n4; n += 4) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x4(k, n); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+2, m); + } + + for 
(; n < n2; n += 2) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0; + + for (k = 0; k < K; k++) { + result += A[k*lda+m] * B[k*ldb+n]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + return 0; +} diff --git a/kernel/power/dgemm_small_kernel_tn_power10.c b/kernel/power/dgemm_small_kernel_tn_power10.c new file mode 100644 index 000000000..93a942b02 --- /dev/null +++ b/kernel/power/dgemm_small_kernel_tn_power10.c @@ -0,0 +1,882 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !defined(B0) +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_2x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#else + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], 
valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_2x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#if (defined(__GNUC__) && (__GNUC__ == 10)) +#if defined(_AIX) +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); +#else +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0); +#endif +#else +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1); +#endif + +#define LOAD_AT_8x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; \ + ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ + ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ + t0 = vec_mergeh(ra2, ra3); \ + t1 = vec_mergel(ra2, ra3); \ + ra2 = t0; \ + ra3 = t1; \ + ra4 = vec_xl(0, A+(M+4)*lda+K+0); \ + ra5 = vec_xl(0, A+(M+5)*lda+K+0); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra4 = t0; \ + ra5 = t1; \ + ra6 = vec_xl(0, A+(M+6)*lda+K+0); \ + ra7 = vec_xl(0, A+(M+7)*lda+K+0); \ + t0 = vec_mergeh(ra6, ra7); \ + t1 = vec_mergel(ra6, ra7); \ + ra6 = t0; \ + ra7 = t1; + +#define LOAD_AT_8x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ + ra2 = vec_xor(ra2, ra2); \ + ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \ + ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \ + ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1); \ + +#define LOAD_AT_4x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ + ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeh(ra2, ra3); \ + t2 = vec_mergel(ra0, ra1); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = t0; \ + ra1 = t2; \ + ra2 = t1; \ + ra3 = t3; + +#define LOAD_AT_4x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra1 = vec_xor(ra1, 
ra1); \ + ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ + +#define LOAD_AT_2x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; + +#define LOAD_AT_2x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_1x1(K, M) \ + ra0 = vec_splats(A[((M+0)*lda)+K+0]); + +#define LOAD_BTP_8x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + LOAD_PAIR(pb0, t0, t1); \ + t0 = vec_mergel(rb0, rb1); \ + t1 = vec_mergel(rb2, rb3); \ + LOAD_PAIR(pb2, t0, t1); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergeh(rb6, rb7); \ + LOAD_PAIR(pb1, t0, t1); \ + t0 = vec_mergel(rb4, rb5); \ + t1 = vec_mergel(rb6, rb7); \ + LOAD_PAIR(pb3, t0, t1); + +#define LOAD_BTP_8x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb0, rb0, rb1); \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+4)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+5)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb1, rb0, rb1); + +#define LOAD_BTP_4x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + LOAD_PAIR(pb0, t0, t1); \ + t0 = vec_mergel(rb0, rb1); \ + t1 = vec_mergel(rb2, rb3); \ + LOAD_PAIR(pb1, t0, t1); + +#define LOAD_BTP_4x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb0, rb0, rb1); + +#define LOAD_BTP_2x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \ + t1 = vec_mergel(rb0, rb1); \ + __builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1); + +#define LOAD_BTP_2x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0); + +#define LOAD_B_1x1(N, K) rb0 = vec_splats(B[((N)*ldb)+K]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ + __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ + __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ + __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ + __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, 
b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); + +#define KERNEL_MMA_1ACC_(acc, b0, a0) \ + __builtin_mma_xvf64gerpp(&acc, b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k2 = K & ~1; + + vector double valpha = vec_splats(alpha); +#if !defined(B0) + vector double vbeta = vec_splats(beta); +#endif + + for (m = 0; m < m8; m += 8) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_8ACC(pb0, pb0, pb0, pb0, pb1, pb1, pb1, pb1, + ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); + KERNEL_MMA_8ACC(pb2, pb2, pb2, pb2, pb3, pb3, pb3, pb3, + ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); + } + // workaround to avoid register spilling + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC_(acc0, pb0, ra0); + KERNEL_MMA_1ACC_(acc1, pb0, ra1); + LOAD_AT_4x1(m+4, k); + KERNEL_MMA_1ACC_(acc2, pb0, ra0); + KERNEL_MMA_1ACC_(acc3, pb0, ra1); + LOAD_AT_4x1(m, k); + LOAD_BTP_4x1(n+4, k); + KERNEL_MMA_1ACC_(acc4, pb0, ra0); + KERNEL_MMA_1ACC_(acc5, pb0, ra1); + LOAD_AT_4x1(m+4, k); + KERNEL_MMA_1ACC_(acc6, pb0, ra0); + KERNEL_MMA_1ACC_(acc7, pb0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc4, n+4, m+0); + SAVE_4x2_ACC(&acc6, n+4, m+4); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc3, n+0, m+6); + SAVE_4x2_ACC(&acc5, n+4, m+2); + SAVE_4x2_ACC(&acc7, n+4, m+6); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7); + } + for 
(; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc3, n+0, m+6); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc2, n+0, m+4); + SAVE_2x2_ACC(&acc1, n+0, m+2); + SAVE_2x2_ACC(&acc3, n+0, m+6); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + SAVE_1x4_VSR(result2, n, m+4); + SAVE_1x4_VSR(result3, n, m+6); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1, t2, t3; + + __vector_pair pb0, pb1, pb2, pb3; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra2, ra0, ra2); + KERNEL_MMA_4ACC(pb2, pb2, pb3, pb3, ra1, ra3, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra1, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc2, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1, t2, t3; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); + KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1; + register vector double t0, t1, t2, t3; + + __vector_pair 
pb0, pb1;
+
+      for (k = 0; k < k2; k += 2) {
+        LOAD_AT_4x2(m, k);
+        LOAD_BTP_2x2(n, k);
+        KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2);
+        KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3);
+      }
+      for (; k < K; k++) {
+        LOAD_AT_4x1(m, k);
+        LOAD_BTP_2x1(n, k);
+        KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1);
+      }
+
+#if !defined(B0)
+      register vector double rc0;
+#endif
+      vector double result[4];
+      SAVE_2x2_ACC(&acc0, n+0, m+0);
+      SAVE_2x2_ACC(&acc1, n+0, m+2);
+    }
+
+    for (; n < N; n++) {
+      register vector double result = ((vector double){0.,0.});
+      register vector double result1 = ((vector double){0.,0.});
+
+      register vector double ra0, ra1;
+      register vector double rb0;
+
+      for (k = 0; k < K; k++) {
+        LOAD_AT_4x1(m, k);
+        LOAD_B_1x1(n, k);
+        KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0);
+      }
+
+#if !defined(B0)
+      register vector double rc0;
+#endif
+      SAVE_1x4_VSR(result, n, m+0);
+      SAVE_1x4_VSR(result1, n, m+2);
+    }
+  }
+
+  for (; m < m2; m += 2) {
+    for (n = 0; n < n8; n += 8) {
+      __vector_quad acc0, acc1;
+
+      INIT_2ACCS();
+
+      register vector double ra0, ra1;
+      register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
+      register vector double t0, t1;
+
+      __vector_pair pb0, pb1, pb2, pb3;
+
+      for (k = 0; k < k2; k += 2) {
+        LOAD_AT_2x2(m, k);
+        LOAD_BTP_8x2(n, k);
+        KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
+        KERNEL_MMA_2ACC(pb2, pb3, ra1, ra1);
+      }
+      for (; k < K; k++) {
+        LOAD_AT_2x1(m, k);
+        LOAD_BTP_8x1(n, k);
+        KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
+      }
+
+#if !defined(B0)
+      register vector double rc0;
+#endif
+      vector double result[4];
+      SAVE_4x2_ACC(&acc0, n+0, m+0);
+      SAVE_4x2_ACC(&acc1, n+4, m+0);
+    }
+
+    for (; n < n4; n += 4) {
+      __vector_quad acc0;
+
+      INIT_1ACC();
+
+      register vector double ra0, ra1;
+      register vector double rb0, rb1, rb2, rb3;
+      register vector double t0, t1;
+
+      __vector_pair pb0, pb1;
+
+      for (k = 0; k < k2; k += 2) {
+        LOAD_AT_2x2(m, k);
+        LOAD_BTP_4x2(n, k);
+        KERNEL_MMA_1ACC(pb0, ra0);
+        KERNEL_MMA_1ACC(pb1, ra1);
+      }
+      for (; k < K; k++) {
+        LOAD_AT_2x1(m, k);
+        LOAD_BTP_4x1(n, k);
+        KERNEL_MMA_1ACC(pb0, ra0);
+      }
+
+#if !defined(B0)
+      register vector double rc0;
+#endif
+      vector double result[4];
+      SAVE_4x2_ACC(&acc0, n, m);
+    }
+
+    for (; n < n2; n += 2) {
+      __vector_quad acc0;
+
+      INIT_1ACC();
+
+      register vector double ra0, ra1;
+      register vector double rb0, rb1;
+      register vector double t0, t1;
+
+      __vector_pair pb0, pb1;
+
+      for (k = 0; k < k2; k += 2) {
+        LOAD_AT_2x2(m, k);
+        LOAD_BTP_2x2(n, k);
+        KERNEL_MMA_1ACC(pb0, ra0);
+        KERNEL_MMA_1ACC(pb1, ra1);
+      }
+      for (; k < K; k++) {
+        LOAD_AT_2x1(m, k);
+        LOAD_BTP_2x1(n, k);
+        KERNEL_MMA_1ACC(pb0, ra0);
+      }
+
+#if !defined(B0)
+      register vector double rc0;
+#endif
+      vector double result[4];
+      SAVE_2x2_ACC(&acc0, n, m);
+    }
+
+    for (; n < N; n++) {
+      register vector double result = ((vector double){0.,0.});
+
+      register vector double ra0;
+      register vector double rb0;
+
+      for (k = 0; k < K; k++) {
+        LOAD_AT_2x1(m, k);
+        LOAD_B_1x1(n, k);
+        KERNEL_VMADD_1VSR(ra0, rb0);
+      }
+
+#if !defined(B0)
+      register vector double rc0;
+#endif
+      SAVE_1x4_VSR(result, n, m+0);
+    }
+  }
+
+  for (; m < M; m++) {
+    for (n = 0; n < n8; n += 8) {
+      __vector_quad acc0, acc1;
+
+      INIT_2ACCS();
+
+      register vector double ra0;
+      register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
+      register vector double t0, t1;
+
+      __vector_pair pb0, pb1, pb2, pb3;
+
+      for (k = 0; k < k2; k += 2) {
+        LOAD_A_1x1(k, m);
+        LOAD_BTP_8x2(n, k);
+        KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0);
+        LOAD_A_1x1(k+1, m);
+        KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0);
+      }
+      for (; k < K; k++) {
+
LOAD_A_1x1(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x1_ACC(&acc0, n+0, m+0); + SAVE_4x1_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x1_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x1_ACC(&acc0, n+0, m+0); + } + + for (; n < N; n++) { + FLOAT result = 0.0; + + for (k = 0; k < K; k++) { + result += A[m*lda+k] * B[n*ldb+k]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + return 0; +} diff --git a/kernel/power/dgemm_small_kernel_tt_power10.c b/kernel/power/dgemm_small_kernel_tt_power10.c new file mode 100644 index 000000000..b47b6201f --- /dev/null +++ b/kernel/power/dgemm_small_kernel_tt_power10.c @@ -0,0 +1,829 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <altivec.h>
+
+typedef __vector unsigned char vec_t;
+
+#if !__has_builtin(__builtin_vsx_assemble_pair)
+#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
+#endif
+
+#if !defined(B0)
+#define SAVE_4x2_ACC(ACC, N, M) \
+  __builtin_mma_disassemble_acc((void *)result, ACC); \
+  rc0 = vec_xl(0, C+(N+0)*ldc+M); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result[0] = vec_madd(result[0], valpha, rc0); \
+  vec_xst(result[0], 0, C+(N+0)*ldc+M); \
+  rc0 = vec_xl(0, C+(N+1)*ldc+M); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result[1] = vec_madd(result[1], valpha, rc0); \
+  vec_xst(result[1], 0, C+(N+1)*ldc+M); \
+  rc0 = vec_xl(0, C+(N+2)*ldc+M); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result[2] = vec_madd(result[2], valpha, rc0); \
+  vec_xst(result[2], 0, C+(N+2)*ldc+M); \
+  rc0 = vec_xl(0, C+(N+3)*ldc+M); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result[3] = vec_madd(result[3], valpha, rc0); \
+  vec_xst(result[3], 0, C+(N+3)*ldc+M);
+
+#define SAVE_2x2_ACC(ACC, N, M) \
+  __builtin_mma_disassemble_acc((void *)result, ACC); \
+  rc0 = vec_xl(0, C+(N+0)*ldc+M); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result[0] = vec_madd(result[0], valpha, rc0); \
+  vec_xst(result[0], 0, C+(N+0)*ldc+M); \
+  rc0 = vec_xl(0, C+(N+1)*ldc+M); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result[1] = vec_madd(result[1], valpha, rc0); \
+  vec_xst(result[1], 0, C+(N+1)*ldc+M);
+
+#define SAVE_1x4_VSR(result, N, M) \
+  rc0 = vec_xl(0, C+((N)*ldc)+M); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result = vec_madd(result, valpha, rc0); \
+  vec_xst(result, 0, C+((N)*ldc)+M);
+
+#define SAVE_4x1_VSR(result, N, M) \
+  result = vec_mul(result, valpha); \
+  C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
+  C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1];
+
+#else
+
+#define SAVE_4x2_ACC(ACC, N, M) \
+  __builtin_mma_disassemble_acc((void *)result, ACC); \
+  result[0] = vec_mul(result[0], valpha); \
+  vec_xst(result[0], 0, C+(N+0)*ldc+M); \
+  result[1] = vec_mul(result[1], valpha); \
+  vec_xst(result[1], 0, C+(N+1)*ldc+M); \
+  result[2] = vec_mul(result[2], valpha); \
+  vec_xst(result[2], 0, C+(N+2)*ldc+M); \
+  result[3] = vec_mul(result[3], valpha); \
+  vec_xst(result[3], 0, C+(N+3)*ldc+M);
+
+#define SAVE_2x2_ACC(ACC, N, M) \
+  __builtin_mma_disassemble_acc((void *)result, ACC); \
+  result[0] = vec_mul(result[0], valpha); \
+  vec_xst(result[0], 0, C+(N+0)*ldc+M); \
+  result[1] = vec_mul(result[1], valpha); \
+  vec_xst(result[1], 0, C+(N+1)*ldc+M);
+
+#define SAVE_1x4_VSR(result, N, M) \
+  result = vec_mul(result, valpha); \
+  vec_xst(result, 0, C+((N)*ldc)+M);
+
+#define SAVE_4x1_VSR(result, N, M) \
+  result = vec_mul(result, valpha); \
+  C[(N+0)*ldc+M] = result[0]; \
+  C[(N+1)*ldc+M] = result[1];
+
+#endif
+
+#define INIT_8ACCS() \
+  __builtin_mma_xxsetaccz(&acc0); \
+  __builtin_mma_xxsetaccz(&acc1); \
+  __builtin_mma_xxsetaccz(&acc2); \
+  __builtin_mma_xxsetaccz(&acc3); \
+  __builtin_mma_xxsetaccz(&acc4); \
+  __builtin_mma_xxsetaccz(&acc5); \
+  __builtin_mma_xxsetaccz(&acc6); \
+  __builtin_mma_xxsetaccz(&acc7);
+
+#define INIT_4ACCS() \
+  __builtin_mma_xxsetaccz(&acc0); \
+  __builtin_mma_xxsetaccz(&acc1); \
+  __builtin_mma_xxsetaccz(&acc2); \
+  __builtin_mma_xxsetaccz(&acc3);
+
+#define INIT_2ACCS() \
+  __builtin_mma_xxsetaccz(&acc0); \
+  __builtin_mma_xxsetaccz(&acc1);
+
+#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
+
+#define LOAD_AT_8x2(M, K) \
+  ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
+  ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
+  ra2 = vec_xl(0,
A+(M+2)*lda+K+0); \ + ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeh(ra2, ra3); \ + t2 = vec_mergel(ra0, ra1); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = t0; \ + ra1 = t2; \ + ra2 = t1; \ + ra3 = t3; \ + ra4 = vec_xl(0, A+(M+4)*lda+K+0); \ + ra5 = vec_xl(0, A+(M+5)*lda+K+0); \ + ra6 = vec_xl(0, A+(M+6)*lda+K+0); \ + ra7 = vec_xl(0, A+(M+7)*lda+K+0); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergeh(ra6, ra7); \ + t2 = vec_mergel(ra4, ra5); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = t0; \ + ra5 = t2; \ + ra6 = t1; \ + ra7 = t3; + +#define LOAD_AT_8x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ + ra2 = vec_xor(ra2, ra2); \ + ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \ + ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \ + ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1); \ + +#define LOAD_AT_4x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ + ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeh(ra2, ra3); \ + t2 = vec_mergel(ra0, ra1); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = t0; \ + ra1 = t2; \ + ra2 = t1; \ + ra3 = t3; + +#define LOAD_AT_4x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ + +#define LOAD_AT_2x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; + +#define LOAD_AT_2x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]); + +#define LOAD_BP_1x8(K, N) \ + pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \ + pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4])); + +#define LOAD_BP_1x4(K, N) \ + pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); + +#define LOAD_BP_1x2(K, N) \ + t0 = vec_xl(0, B+((K)*ldb)+N); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); + +#define LOAD_B_1x8(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+2); \ + rb2 = vec_xl(0, B+(K*ldb)+N+4); \ + rb3 = vec_xl(0, B+(K*ldb)+N+6); \ + +#define LOAD_B_1x4(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+2); + +#define LOAD_B_1x2(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); + +#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[(K)*ldb+N]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ + __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ + __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ + __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ + __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); 
\ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_A(ra0, ra1, ra2, ra3, offset) \ + vec_xst(ra0, 0, packA+(k*8)+0+offset); \ + vec_xst(ra1, 0, packA+(k*8)+2+offset); \ + vec_xst(ra2, 0, packA+(k*8)+4+offset); \ + vec_xst(ra3, 0, packA+(k*8)+6+offset); + +#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ + ra0 = vec_xl(0, packA+(k*8)+0+offset); \ + ra1 = vec_xl(0, packA+(k*8)+2+offset); \ + ra2 = vec_xl(0, packA+(k*8)+4+offset); \ + ra3 = vec_xl(0, packA+(k*8)+6+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k2 = K & ~1; + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 
1 : 0; +#else + int has_packing = 0; +#endif + + double *packA; + if (has_packing) packA = (double *)malloc(K*8*sizeof(double)); + + vector double valpha = vec_splats(alpha); +#if !defined(B0) + vector double vbeta = vec_splats(beta); +#endif + + for (m = 0; m < m8; m += 8) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double t0, t1, t2, t3; + + __vector_pair pb0, pb1; + + if (has_packing) { + if (n == 0) { + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); + PACK_A(ra0, ra2, ra4, ra6, 0); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); + PACK_A(ra1, ra3, ra5, ra7, 8); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_A(ra0, ra1, ra2, ra3, 0); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc4, n+0, m+4); + SAVE_4x2_ACC(&acc6, n+0, m+6); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + SAVE_4x2_ACC(&acc5, n+4, m+4); + SAVE_4x2_ACC(&acc7, n+4, m+6); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double t0, t1, t2, t3; + + __vector_pair pb0; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + LOAD_BP_1x4(k+1, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); + LOAD_BP_1x4(k+1, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, 
ra2, ra3); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc3, n+0, m+6); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double t0, t1, t2, t3; + + __vector_pair pb0; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + LOAD_BP_1x2(k+1, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); + LOAD_BP_1x2(k+1, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + SAVE_2x2_ACC(&acc2, n+0, m+4); + SAVE_2x2_ACC(&acc3, n+0, m+6); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0; + + if (!has_packing) { + for (k = 0; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + SAVE_1x4_VSR(result2, n, m+4); + SAVE_1x4_VSR(result3, n, m+6); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double t0, t1, t2, t3; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra2, ra2); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra1, ra1, ra3, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double t0, t1, t2, t3; + + __vector_pair pb0; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); + LOAD_BP_1x4(k+1, n); + KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3); + } + for (; k < K; k++) { + 
LOAD_AT_4x1(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double t0, t1, t2, t3; + + __vector_pair pb0; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); + LOAD_BP_1x2(k+1, n); + KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_2ACC(pb0, pb1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0, ra1; + register vector double t0, t1; + + __vector_pair pb0; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_BP_1x4(k+1, n); + KERNEL_MMA_1ACC(pb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0, ra1; + register vector double t0, t1; + + __vector_pair pb0; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_BP_1x2(k+1, n); + KERNEL_MMA_1ACC(pb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + } + } + + for (; m < M; m++) { + for (n = 0; n < n8; n += 8) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 
= ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+2, m); + SAVE_4x1_VSR(result2, n+4, m); + SAVE_4x1_VSR(result3, n+6, m); + } + + for (; n < n4; n += 4) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+2, m); + } + + for (; n < n2; n += 2) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0; + + for (k = 0; k < K; k++) { + result += A[m*lda+k] * B[k*ldb+n]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if(has_packing) free(packA); + + return 0; +} diff --git a/kernel/power/gemm_small_kernel_permit_power10.c b/kernel/power/gemm_small_kernel_permit_power10.c new file mode 100644 index 000000000..9b38e457b --- /dev/null +++ b/kernel/power/gemm_small_kernel_permit_power10.c @@ -0,0 +1,84 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+
+int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta)
+{
+  double MNK = (double) M * (double) N * (double) K;
+
+#if defined(DOUBLE) // dgemm
+
+  // gcc 11 (minor version <= 2) has an issue when multiple assemble_pair
+  // builtins are used; this affects both dgemm_nn and dgemm_tn.
+#if (defined(__GNUC__) && (__GNUC__ == 11 && __GNUC_MINOR__ <= 2))
+  if (!transb)
+    return 0;
+#endif
+
+  if (MNK <= 54.0*54.0*54.0)
+    return 1;
+
+#else // sgemm
+
+#if defined(__GNUC__) && defined(__clang__)
+  // clang generates code with register spilling in the region of code that
+  // does packing, so on-demand packing is disabled for clang. Since
+  // on-demand packing is one of the reasons the small kernels outperform the
+  // normal flow as MNK grows, the MNK cutoff for clang-generated code is
+  // reduced accordingly.
+  if (MNK > 84.0*84.0*84.0)
+    return 0;
+
+  if (transa && !transb) {
+    // sgemm_tn performs better when on-demand packing is used
+    if (MNK <= 64.0*64.0*64.0 && K >= 4)
+      return 1;
+    else
+      return 0;
+  }
+
+#else // gcc
+
+  if (MNK > 100.0*100.0*100.0)
+    return 0;
+
+#endif
+
+  // Multi-threaded execution outperforms (or at least approaches) the small
+  // kernel, so when more than one CPU is available only permit the small
+  // kernel for sufficiently small MNK.
+  if (num_cpu_avail(3) > 1) {
+    if (MNK <= 64.0*64.0*64.0)
+      return 1;
+  } else {
+    return 1;
+  }
+
+#endif
+
+  return 0;
+}
diff --git a/kernel/power/sgemm_small_kernel_nn_power10.c b/kernel/power/sgemm_small_kernel_nn_power10.c
new file mode 100644
index 000000000..59222a436
--- /dev/null
+++ b/kernel/power/sgemm_small_kernel_nn_power10.c
@@ -0,0 +1,1563 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
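+
+/*
+ * A rough sketch of what the MMA kernels in this file compute; the index
+ * roles here are read off the LOAD_ and SAVE_ macros below rather than any
+ * formal spec.  Each __vector_quad accumulator holds one 4x4 tile of C,
+ * and __builtin_mma_xvf32gerpp(&acc, b, a) adds a rank-1 update, roughly
+ * acc[i][j] += b[i] * a[j], so the k-loops accumulate
+ *
+ *   acc[i][j] = sum_k B[(n+i)*ldb + k] * A[k*lda + (m+j)]
+ *
+ * and a SAVE macro then applies alpha and beta exactly once per tile; for
+ * SAVE_4x4_ACC the scalar equivalent is
+ *
+ *   for (i = 0; i < 4; i++)
+ *     for (j = 0; j < 4; j++)
+ *       C[(n+i)*ldc + m+j] = beta * C[(n+i)*ldc + m+j] + alpha * acc[i][j];
+ *
+ * with the beta term dropped when B0 is defined.  The m and n loops peel
+ * progressively narrower tiles (16, 8, 4, 2, then 1) so that arbitrary M
+ * and N are handled without a scalar inner loop except in the final
+ * remainder cases.
+ */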
+
+#include "common.h"
+#include <altivec.h>
+
+typedef __vector unsigned char vec_t;
+
+#if !defined(B0)
+#define SAVE_4x4_ACC(ACC, N, M) \
+  __builtin_mma_disassemble_acc ((void *)result, ACC); \
+  rc0 = vec_xl(0, C+(N+0)*ldc+M); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result[0] = vec_madd(result[0], valpha, rc0); \
+  vec_xst(result[0], 0, C+(N+0)*ldc+M); \
+  rc0 = vec_xl(0, C+(N+1)*ldc+M); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result[1] = vec_madd(result[1], valpha, rc0); \
+  vec_xst(result[1], 0, C+(N+1)*ldc+M); \
+  rc0 = vec_xl(0, C+(N+2)*ldc+M); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result[2] = vec_madd(result[2], valpha, rc0); \
+  vec_xst(result[2], 0, C+(N+2)*ldc+M); \
+  rc0 = vec_xl(0, C+(N+3)*ldc+M); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result[3] = vec_madd(result[3], valpha, rc0); \
+  vec_xst(result[3], 0, C+(N+3)*ldc+M);
+
+#define SAVE_4x2_ACC(ACC, N, M) \
+  __builtin_mma_disassemble_acc ((void *)result, ACC); \
+  rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result[0] = vec_madd(result[0], valpha, rc0); \
+  vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \
+  rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result[1] = vec_madd(result[1], valpha, rc0); \
+  vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \
+  rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result[2] = vec_madd(result[2], valpha, rc0); \
+  vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \
+  rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result[3] = vec_madd(result[3], valpha, rc0); \
+  vec_xst_len(result[3], C+(N+3)*ldc+M, 8);
+
+#define SAVE_2x4_ACC(ACC, N, M) \
+  __builtin_mma_disassemble_acc ((void *)result, ACC); \
+  rc0 = vec_xl(0, C+(N+0)*ldc+M); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result[0] = vec_madd(result[0], valpha, rc0); \
+  vec_xst(result[0], 0, C+(N+0)*ldc+M); \
+  rc0 = vec_xl(0, C+(N+1)*ldc+M); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result[1] = vec_madd(result[1], valpha, rc0); \
+  vec_xst(result[1], 0, C+(N+1)*ldc+M);
+
+#define SAVE_1x4_VSR(result, N, M) \
+  rc0 = vec_xl(0, C+((N)*ldc)+M); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result = vec_madd(result, valpha, rc0); \
+  vec_xst(result, 0, C+((N)*ldc)+M);
+
+#define SAVE_2x2_VSR(result, N, M) \
+  rc0 = vec_xl_len(C+(N*ldc)+M, 8); \
+  rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \
+  rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result = vec_madd(result, valpha, rc0); \
+  vec_xst_len(result, C+(N*ldc)+M, 8); \
+  C[(N+1)*ldc+M+0] = result[2]; \
+  C[(N+1)*ldc+M+1] = result[3];
+
+#define SAVE_1x2_VSR(result, N, M) \
+  rc0 = vec_xl_len(C+(N*ldc)+M, 8); \
+  rc0 = vec_mul(rc0, vbeta); \
+  result = vec_madd(result, valpha, rc0); \
+  vec_xst_len(result, C+(N*ldc)+M, 8);
+
+#define SAVE_4x1_VSR(result, N, M) \
+  result = vec_mul(result, valpha); \
+  C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
+  C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \
+  C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \
+  C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3];
+
+#define SAVE_2x1_VSR(result, N, M) \
+  result = vec_mul(result, valpha); \
+  C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
+  C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1];
+
+#else
+
+#define SAVE_4x4_ACC(ACC, N, M) \
+  __builtin_mma_disassemble_acc ((void *)result, ACC); \
+  result[0] = vec_mul(result[0], valpha); \
+  vec_xst(result[0], 0, C+(N+0)*ldc+M); \
+  result[1] = vec_mul(result[1], valpha); \
+
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; \ + C[(N+2)*ldc+M] = result[2]; \ + C[(N+3)*ldc+M] = result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() \ + __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_A_1x16(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); \ + ra1 = vec_xl(0, A+((K)*lda)+M+4); \ + ra2 = vec_xl(0, A+((K)*lda)+M+8); \ + ra3 = vec_xl(0, A+((K)*lda)+M+12); + +#define LOAD_A_1x8(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); \ + ra1 = vec_xl(0, A+((K)*lda)+M+4); + +#define LOAD_A_1x4(K, M) ra0 = vec_xl(0, A+((K)*lda)+M); + +#define LOAD_A_2x2(K, M) \ + ra0 = vec_splats(A[K*lda+M]); \ + ra0 = vec_insert(A[K*lda+M+1], ra0, 1); \ + ra0 = vec_insert(A[K*lda+M+1], ra0, 3); + +#define LOAD_A_1x2(K, M) ra0 = vec_xl_len(A+((K)*lda)+M, 8); + +#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[(K)*lda+M]); + +#define LOAD_BT_16x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergel(rb4, rb5); \ + rb6 = 
vec_xl(0, B+(N+6)*ldb+K); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K); \ + t2 = vec_mergeh(rb6, rb7); \ + t3 = vec_mergel(rb6, rb7); \ + rb4 = vec_xxpermdi(t0, t2, 0b00); \ + rb5 = vec_xxpermdi(t0, t2, 0b11); \ + rb6 = vec_xxpermdi(t1, t3, 0b00); \ + rb7 = vec_xxpermdi(t1, t3, 0b11); \ + rb8 = vec_xl(0, B+(N+8)*ldb+K); \ + rb9 = vec_xl(0, B+(N+9)*ldb+K); \ + t0 = vec_mergeh(rb8, rb9); \ + t1 = vec_mergel(rb8, rb9); \ + rb10 = vec_xl(0, B+(N+10)*ldb+K); \ + rb11 = vec_xl(0, B+(N+11)*ldb+K); \ + t2 = vec_mergeh(rb10, rb11); \ + t3 = vec_mergel(rb10, rb11); \ + rb8 = vec_xxpermdi(t0, t2, 0b00); \ + rb9 = vec_xxpermdi(t0, t2, 0b11); \ + rb10 = vec_xxpermdi(t1, t3, 0b00); \ + rb11 = vec_xxpermdi(t1, t3, 0b11); \ + rb12 = vec_xl(0, B+(N+12)*ldb+K); \ + rb13 = vec_xl(0, B+(N+13)*ldb+K); \ + t0 = vec_mergeh(rb12, rb13); \ + t1 = vec_mergel(rb12, rb13); \ + rb14 = vec_xl(0, B+(N+14)*ldb+K); \ + rb15 = vec_xl(0, B+(N+15)*ldb+K); \ + t2 = vec_mergeh(rb14, rb15); \ + t3 = vec_mergel(rb14, rb15); \ + rb12 = vec_xxpermdi(t0, t2, 0b00); \ + rb13 = vec_xxpermdi(t0, t2, 0b11); \ + rb14 = vec_xxpermdi(t1, t3, 0b00); \ + rb15 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_16x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); \ + rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \ + rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \ + t0 = vec_mergeh(rb4, rb5); \ + rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \ + rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \ + t1 = vec_mergeh(rb6, rb7); \ + rb2 = vec_xxpermdi(t0, t1, 0b00); \ + rb3 = vec_xxpermdi(t0, t1, 0b11); \ + rb8 = vec_xl_len(B+(N+8)*ldb+K, 8); \ + rb9 = vec_xl_len(B+(N+9)*ldb+K, 8); \ + t0 = vec_mergeh(rb8, rb9); \ + rb10 = vec_xl_len(B+(N+10)*ldb+K, 8); \ + rb11 = vec_xl_len(B+(N+11)*ldb+K, 8); \ + t1 = vec_mergeh(rb10, rb11); \ + rb4 = vec_xxpermdi(t0, t1, 0b00); \ + rb5 = vec_xxpermdi(t0, t1, 0b11); \ + rb12 = vec_xl_len(B+(N+12)*ldb+K, 8); \ + rb13 = vec_xl_len(B+(N+13)*ldb+K, 8); \ + t0 = vec_mergeh(rb12, rb13); \ + rb14 = vec_xl_len(B+(N+14)*ldb+K, 8); \ + rb15 = vec_xl_len(B+(N+15)*ldb+K, 8); \ + t1 = vec_mergeh(rb14, rb15); \ + rb6 = vec_xxpermdi(t0, t1, 0b00); \ + rb7 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_16x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); \ + rb2 = vec_xor(rb2, rb2); \ + rb2 = vec_insert(B[(N+8)*ldb+K], rb2, 0); \ + rb2 = vec_insert(B[(N+9)*ldb+K], rb2, 1); \ + rb2 = vec_insert(B[(N+10)*ldb+K], rb2, 2); \ + rb2 = vec_insert(B[(N+11)*ldb+K], rb2, 3); \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(B[(N+12)*ldb+K], rb3, 0); \ + rb3 = vec_insert(B[(N+13)*ldb+K], rb3, 1); \ + rb3 = vec_insert(B[(N+14)*ldb+K], rb3, 2); \ + rb3 = vec_insert(B[(N+15)*ldb+K], rb3, 3); + +#define LOAD_BT_8x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = 
vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergel(rb4, rb5); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K); \ + t2 = vec_mergeh(rb6, rb7); \ + t3 = vec_mergel(rb6, rb7); \ + rb4 = vec_xxpermdi(t0, t2, 0b00); \ + rb5 = vec_xxpermdi(t0, t2, 0b11); \ + rb6 = vec_xxpermdi(t1, t3, 0b00); \ + rb7 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_8x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); \ + rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \ + rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \ + t0 = vec_mergeh(rb4, rb5); \ + rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \ + rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \ + t1 = vec_mergeh(rb6, rb7); \ + rb2 = vec_xxpermdi(t0, t1, 0b00); \ + rb3 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_8x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); + +#define LOAD_BT_4x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_4x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_4x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); + +#define LOAD_BT_2x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeo(rb0, rb1); \ + t2 = vec_mergel(rb0, rb1); \ + rb0 = t0; \ + rb1 = t1; \ + rb2 = t2; \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(vec_extract(t2,2), rb3, 0); \ + rb3 = vec_insert(vec_extract(t2,3), rb3, 1); + +#define LOAD_BT_2x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergee(rb0, rb1); \ + t1 = vec_mergeo(rb0, rb1); \ + rb0 = t0; \ + rb1 = t1; + +#define LOAD_BT_2x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); + +#define LOAD_B_2x2(N, K) \ + rb0 = vec_splats(B[(N+0)*ldb+K]); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+1)*ldb+K], 
rb0, 3); + +#define LOAD_B_2x1(N, K) \ + rb0 = vec_insert(B[(n+0)*ldb+k], rb0, 0); \ + rb0 = vec_insert(B[(n+1)*ldb+k], rb0, 1); + +#define LOAD_B_1x1(N, K) rb0 = vec_splats(B[(N)*ldb+K]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ + __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ + __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ + __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ + __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_B(rb0, rb1, rb2, rb3, offset) \ + vec_xst(rb0, 0, packB+(k*16)+0+offset); \ + vec_xst(rb1, 0, packB+(k*16)+4+offset); \ + vec_xst(rb2, 0, packB+(k*16)+8+offset); \ + vec_xst(rb3, 0, packB+(k*16)+12+offset); + +#define LOAD_PACKED_B(rb0, rb1, rb2, rb3, offset) \ + rb0 = vec_xl(0, packB+(k*16)+0+offset); \ + rb1 = vec_xl(0, packB+(k*16)+4+offset); \ + rb2 = vec_xl(0, packB+(k*16)+8+offset); \ + rb3 = vec_xl(0, packB+(k*16)+12+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k4 = K & ~3; + BLASLONG k2 = K & ~1; + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 
1 : 0; +#else + int has_packing = 0; +#endif + + float *packB; + if (has_packing) packB = (float *)malloc(K*16*sizeof(float)); + + vector float valpha = vec_splats(alpha); +#if !defined(B0) + vector float vbeta = vec_splats(beta); +#endif + + for (n = 0; n < n16; n += 16) { + for (m = 0; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + if (has_packing) { + if (m == 0) { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_16x4(n, k); + KERNEL_MMA_8ACC(rb0, rb4, rb8, rb12, rb0, rb4, rb8, rb12, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb0, rb4, rb8, rb12, 0); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(rb1, rb5, rb9, rb13, rb1, rb5, rb9, rb13, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb1, rb5, rb9, rb13, 16); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_8ACC(rb2, rb6, rb10, rb14, rb2, rb6, rb10, rb14, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb2, rb6, rb10, rb14, 32); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_8ACC(rb3, rb7, rb11, rb15, rb3, rb7, rb11, rb15, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb3, rb7, rb11, rb15, 48); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_16x2(n, k); + KERNEL_MMA_8ACC(rb0, rb2, rb4, rb6, rb0, rb2, rb4, rb6, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb0, rb2, rb4, rb6, 0); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(rb1, rb3, rb5, rb7, rb1, rb3, rb5, rb7, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb1, rb3, rb5, rb7, 16); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_16x1(n, k); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb0, rb1, rb2, rb3, 0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(rb0, rb4, rb8, rb12, 0); + KERNEL_MMA_8ACC(rb0, rb4, rb8, rb12, rb0, rb4, rb8, rb12, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+1, m); + LOAD_PACKED_B(rb1, rb5, rb9, rb13, 16); + KERNEL_MMA_8ACC(rb1, rb5, rb9, rb13, rb1, rb5, rb9, rb13, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+2, m); + LOAD_PACKED_B(rb2, rb6, rb10, rb14, 32); + KERNEL_MMA_8ACC(rb2, rb6, rb10, rb14, rb2, rb6, rb10, rb14, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+3, m); + LOAD_PACKED_B(rb3, rb7, rb11, rb15, 48); + KERNEL_MMA_8ACC(rb3, rb7, rb11, rb15, rb3, rb7, rb11, rb15, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(rb0, rb2, rb4, rb6, 0); + KERNEL_MMA_8ACC(rb0, rb2, rb4, rb6, rb0, rb2, rb4, rb6, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+1, m); + LOAD_PACKED_B(rb1, rb3, rb5, rb7, 16); + KERNEL_MMA_8ACC(rb1, rb3, rb5, rb7, rb1, rb3, rb5, rb7, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(rb0, rb1, rb2, rb3, 0); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_16x4(n, k); + KERNEL_MMA_8ACC(rb0, rb4, rb8, rb12, rb0, rb4, rb8, rb12, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(rb1, rb5, rb9, rb13, rb1, rb5, rb9, rb13, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_8ACC(rb2, rb6, rb10, rb14, rb2, rb6, rb10, rb14, + ra0, 
ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_8ACC(rb3, rb7, rb11, rb15, rb3, rb7, rb11, rb15, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_16x2(n, k); + KERNEL_MMA_8ACC(rb0, rb2, rb4, rb6, rb0, rb2, rb4, rb6, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(rb1, rb3, rb5, rb7, rb1, rb3, rb5, rb7, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_16x1(n, k); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + SAVE_4x4_ACC(&acc4, n+0, m+4); + SAVE_4x4_ACC(&acc5, n+4, m+4); + SAVE_4x4_ACC(&acc6, n+8, m+4); + SAVE_4x4_ACC(&acc7, n+12, m+4); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_BT_16x4(n, k); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+2, m); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+3, m); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra0, ra0, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BT_16x2(n, k); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra0, ra0, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BT_16x1(n, k); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(rb0, rb4, rb8, rb12, 0); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+1, m); + LOAD_PACKED_B(rb1, rb5, rb9, rb13, 16); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+2, m); + LOAD_PACKED_B(rb2, rb6, rb10, rb14, 32); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+3, m); + LOAD_PACKED_B(rb3, rb7, rb11, rb15, 48); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra0, ra0, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(rb0, rb2, rb4, rb6, 0); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+1, m); + LOAD_PACKED_B(rb1, rb3, rb5, rb7, 16); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra0, ra0, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(rb0, rb1, rb2, rb3, 0); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + } + + for (; m < m2; m += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x2(k, m); + 
LOAD_BT_16x4(n, k); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+2, m); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+3, m); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra0, ra0, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BT_16x2(n, k); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra0, ra0, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BT_16x1(n, k); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(rb0, rb4, rb8, rb12, 0); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+1, m); + LOAD_PACKED_B(rb1, rb5, rb9, rb13, 16); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+2, m); + LOAD_PACKED_B(rb2, rb6, rb10, rb14, 32); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+3, m); + LOAD_PACKED_B(rb3, rb7, rb11, rb15, 48); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra0, ra0, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(rb0, rb2, rb4, rb6, 0); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+1, m); + LOAD_PACKED_B(rb1, rb3, rb5, rb7, 16); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra0, ra0, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(rb0, rb1, rb2, rb3, 0); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc2, n+8, m+0); + SAVE_4x2_ACC(&acc3, n+12, m+0); + } + + for (; m < M; m++) { + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(k, m); + LOAD_BT_16x4(n, k); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb4, rb8, rb12); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb1, rb5, rb9, rb13); + LOAD_A_1x1(k+2, m); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb2, rb6, rb10, rb14); + LOAD_A_1x1(k+3, m); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb3, rb7, rb11, rb15); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BT_16x2(n, k); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb2, rb4, rb6); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb1, rb3, rb5, rb7); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BT_16x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(rb0, rb4, rb8, rb12, 0); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb4, rb8, rb12); + LOAD_A_1x1(k+1, m); + LOAD_PACKED_B(rb1, rb5, rb9, rb13, 16); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb1, rb5, rb9, rb13); + LOAD_A_1x1(k+2, m); + LOAD_PACKED_B(rb2, rb6, rb10, rb14, 32); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb2, rb6, rb10, rb14); + LOAD_A_1x1(k+3, m); + LOAD_PACKED_B(rb3, rb7, rb11, rb15, 48); + 
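+          /* leftover-row tail: a broadcast A element times four packed
+             B quads, accumulated with plain vec_madd instead of MMA. */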
KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb3, rb7, rb11, rb15); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(rb0, rb2, rb4, rb6, 0); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb2, rb4, rb6); + LOAD_A_1x1(k+1, m); + LOAD_PACKED_B(rb1, rb3, rb5, rb7, 16); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb1, rb3, rb5, rb7); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(rb0, rb1, rb2, rb3, 0); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + SAVE_4x1_VSR(result2, n+8, m); + SAVE_4x1_VSR(result3, n+12, m); + } + } + + for (; n < n8; n += 8) { + for (m = 0; m < m16; m += 16) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x16(k, m); + LOAD_BT_8x4(n, k); + KERNEL_MMA_8ACC(rb0, rb4, rb0, rb4, rb0, rb4, rb0, rb4, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_8ACC(rb1, rb5, rb1, rb5, rb1, rb5, rb1, rb5, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x16(k+2, m); + KERNEL_MMA_8ACC(rb2, rb6, rb2, rb6, rb2, rb6, rb2, rb6, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x16(k+3, m); + KERNEL_MMA_8ACC(rb3, rb7, rb3, rb7, rb3, rb7, rb3, rb7, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_A_1x16(k, m); + LOAD_BT_8x2(n, k); + KERNEL_MMA_8ACC(rb0, rb2, rb0, rb2, rb0, rb2, rb0, rb2, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_8ACC(rb1, rb3, rb1, rb3, rb1, rb3, rb1, rb3, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + for (; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_BT_8x1(n, k); + KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc4, n+0, m+8); + SAVE_4x4_ACC(&acc6, n+0, m+12); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + SAVE_4x4_ACC(&acc5, n+4, m+8); + SAVE_4x4_ACC(&acc7, n+4, m+12); + } + + for (; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_8x4(n, k); + KERNEL_MMA_4ACC(rb0, rb4, rb0, rb4, ra0, ra0, ra1, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_4ACC(rb1, rb5, rb1, rb5, ra0, ra0, ra1, ra1); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_4ACC(rb2, rb6, rb2, rb6, ra0, ra0, ra1, ra1); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_4ACC(rb3, rb7, rb3, rb7, ra0, ra0, ra1, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_8x2(n, k); + KERNEL_MMA_4ACC(rb0, rb2, rb0, rb2, ra0, ra0, ra1, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_4ACC(rb1, rb3, rb1, rb3, ra0, ra0, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_8x1(n, k); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + 
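+      /* 8-column x 4-row tile: two 4x4 MMA accumulators, each k-step
+         pairing one 4-wide A quad with two transposed B quads. */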
register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_BT_8x4(n, k); + KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_2ACC(rb1, rb5, ra0, ra0); + LOAD_A_1x4(k+2, m); + KERNEL_MMA_2ACC(rb2, rb6, ra0, ra0); + LOAD_A_1x4(k+3, m); + KERNEL_MMA_2ACC(rb3, rb7, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BT_8x2(n, k); + KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_2ACC(rb1, rb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BT_8x1(n, k); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + } + + for (; m < m2; m += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x2(k, m); + LOAD_BT_8x4(n, k); + KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_2ACC(rb1, rb5, ra0, ra0); + LOAD_A_1x2(k+2, m); + KERNEL_MMA_2ACC(rb2, rb6, ra0, ra0); + LOAD_A_1x2(k+3, m); + KERNEL_MMA_2ACC(rb3, rb7, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BT_8x2(n, k); + KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_2ACC(rb1, rb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BT_8x1(n, k); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; m < M; m++) { + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(k, m); + LOAD_BT_8x4(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb4); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb5); + LOAD_A_1x1(k+2, m); + KERNEL_VMADD_2VSR(ra0, ra0, rb2, rb6); + LOAD_A_1x1(k+3, m); + KERNEL_VMADD_2VSR(ra0, ra0, rb3, rb7); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BT_8x2(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb2); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb3); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BT_8x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + } + } + + for (; n < n4; n += 4) { + for (m = 0; m < m16; m += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x16(k, m); + LOAD_BT_4x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+2, m); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+3, m); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra0, ra1, ra2, ra3); + } + for (; k < k2; k += 2) { + LOAD_A_1x16(k, m); + LOAD_BT_4x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+1, m); + 
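+        /* second k of the two-step tail: the k+1 slice of the transposed
+           B panel (rb1) updates all four 4-row accumulators of the tile. */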
KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra0, ra1, ra2, ra3); + } + for (; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_BT_4x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + } + + for (; m < m8; m += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_4x4(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_2ACC(rb1, rb1, ra0, ra1); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_2ACC(rb2, rb2, ra0, ra1); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_2ACC(rb3, rb3, ra0, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_4x2(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_2ACC(rb1, rb1, ra0, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_4x1(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + } + + for (; m < m4; m += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_BT_4x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + LOAD_A_1x4(k+2, m); + KERNEL_MMA_1ACC(rb2, ra0); + LOAD_A_1x4(k+3, m); + KERNEL_MMA_1ACC(rb3, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BT_4x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BT_4x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n, m); + } + + for (; m < m2; m += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x2(k, m); + LOAD_BT_4x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + LOAD_A_1x2(k+2, m); + KERNEL_MMA_1ACC(rb2, ra0); + LOAD_A_1x2(k+3, m); + KERNEL_MMA_1ACC(rb3, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BT_4x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BT_4x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; m < M; m++) { + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(k, m); + LOAD_BT_4x4(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_1VSR(ra0, rb1); + LOAD_A_1x1(k+2, m); + KERNEL_VMADD_1VSR(ra0, rb2); + LOAD_A_1x1(k+3, m); + KERNEL_VMADD_1VSR(ra0, rb3); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BT_4x2(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_A_1x1(k+1, m); + 
KERNEL_VMADD_1VSR(ra0, rb1); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BT_4x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n+0, m); + } + } + + for (; n < n2; n += 2) { + for (m = 0; m < m16; m += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x16(k, m); + LOAD_BT_2x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+2, m); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+3, m); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra0, ra1, ra2, ra3); + } + for (; k < k2; k += 2) { + LOAD_A_1x16(k, m); + LOAD_BT_2x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra0, ra1, ra2, ra3); + } + for (; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_BT_2x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + SAVE_2x4_ACC(&acc2, n, m+8); + SAVE_2x4_ACC(&acc3, n, m+12); + } + + for (; m < m8; m += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_2x4(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_2ACC(rb1, rb1, ra0, ra1); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_2ACC(rb2, rb2, ra0, ra1); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_2ACC(rb3, rb3, ra0, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_2x2(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_2ACC(rb1, rb1, ra0, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_2x1(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + } + + for (; m < m4; m += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_BT_2x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + LOAD_A_1x4(k+2, m); + KERNEL_MMA_1ACC(rb2, ra0); + LOAD_A_1x4(k+3, m); + KERNEL_MMA_1ACC(rb3, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BT_2x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BT_2x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m); + } + + for (; m < m2; m += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x2(k, m); + LOAD_B_2x2(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_2x2_VSR(result, n, m); + } + + for (; m < M; m++) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector 
float rb0 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_2x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_2x1_VSR(result, n, m); + } + } + + for (; n < N; n++) { + for (m = 0; m < m16; m += 16) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + SAVE_1x4_VSR(result2, n, m+8); + SAVE_1x4_VSR(result3, n, m+12); + } + + for (; m < m8; m += 8) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0, ra1; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + } + + for (; m < m4; m += 4) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + } + + for (; m < m2; m += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x2_VSR(result, n, m); + } + + for (; m < M; m++) { + FLOAT result = 0.0f; + + for (k = 0; k < K; k++) { + result += A[m+k*lda] * B[n*ldb+k]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free (packB); + + return 0; +} diff --git a/kernel/power/sgemm_small_kernel_nt_power10.c b/kernel/power/sgemm_small_kernel_nt_power10.c new file mode 100644 index 000000000..20d3c6b0e --- /dev/null +++ b/kernel/power/sgemm_small_kernel_nt_power10.c @@ -0,0 +1,887 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !defined(B0) +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ + rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, 
C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ + C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ + C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; \ + C[(N+2)*ldc+M] = result[2]; \ + C[(N+3)*ldc+M] = result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_A_1x16(K, M) \ + ra0 = vec_xl(0, A+(K*lda)+M+0); \ + ra1 = vec_xl(0, A+(K*lda)+M+4); \ + ra2 = vec_xl(0, A+(K*lda)+M+8); \ + ra3 = vec_xl(0, A+(K*lda)+M+12); + +#define LOAD_A_1x8(K, M) \ + ra0 = vec_xl(0, A+(K*lda)+M+0); \ + ra1 = vec_xl(0, A+(K*lda)+M+4); + +#define LOAD_A_1x4(K, M) ra0 = vec_xl(0, A+(K*lda)+M); + +#define LOAD_A_2x2(K, M) \ + ra0 = vec_splats(A[K*lda+M+0]); \ + ra0 = vec_insert(A[K*lda+M+1], ra0, 1); \ + ra0 = vec_insert(A[K*lda+M+1], 
ra0, 3); + +#define LOAD_A_1x2(K, M) ra0 = vec_xl_len(A+(K*lda)+M, 8); + +#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M+0]); + +#define LOAD_B_1x16(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+4); \ + rb2 = vec_xl(0, B+(K*ldb)+N+8); \ + rb3 = vec_xl(0, B+(K*ldb)+N+12); + +#define LOAD_B_1x8(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+4); + +#define LOAD_B_1x4(K, N) rb0 = vec_xl(0, B+(K*ldb)+N); + +#define LOAD_B_2x2(K, N) \ + rb0 = vec_splats(B[K*ldb+N]); \ + rb0 = vec_insert(B[K*ldb+N+1], rb0, 2); \ + rb0 = vec_insert(B[K*ldb+N+1], rb0, 3); + +#define LOAD_B_1x2(K, N) rb0 = vec_xl_len(B+(K*ldb)+N, 8); + +#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ + __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ + __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ + __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ + __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_A(ra0, ra1, ra2, ra3, offset) \ + vec_xst(ra0, 0, packA+(k*16)+0+offset); \ + vec_xst(ra1, 0, packA+(k*16)+4+offset); \ + vec_xst(ra2, 0, packA+(k*16)+8+offset); \ + vec_xst(ra3, 0, packA+(k*16)+12+offset); + +#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ + ra0 = vec_xl(0, packA+(k*16)+0+offset); \ + ra1 = vec_xl(0, packA+(k*16)+4+offset); \ + ra2 = vec_xl(0, packA+(k*16)+8+offset); \ + ra3 = vec_xl(0, packA+(k*16)+12+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + vector float valpha = vec_splats(alpha); +#if !defined(B0) + vector float vbeta = vec_splats(beta); +#endif + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 40 && N >= 40 && K >= 40) ? 
1 : 0; +#else + int has_packing = 0; +#endif + + float *packA; + if (has_packing) packA = (float *)malloc(K*16*sizeof(float)); + + for (m = 0; m < m16; m += 16) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1; + + if (has_packing) { + if (n == 0) { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_A(ra0, ra1, ra2, ra3, 0); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + } else { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc4, n+0, m+8); + SAVE_4x4_ACC(&acc6, n+0, m+12); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + SAVE_4x4_ACC(&acc5, n+4, m+8); + SAVE_4x4_ACC(&acc7, n+4, m+12); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + + if (!has_packing) { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + + if (!has_packing) { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + SAVE_2x4_ACC(&acc2, n, m+8); + SAVE_2x4_ACC(&acc3, n, m+12); + } + + for (; n < N; n++) { + vector float result = ((vector float){0., 0., 0., 0.}); + vector float result1 = ((vector float){0., 0., 0., 0.}); + vector float result2 = ((vector float){0., 0., 0., 0.}); + vector float result3 = ((vector float){0., 0., 0., 0.}); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + + if (!has_packing) { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + 
SAVE_1x4_VSR(result2, n, m+8); + SAVE_1x4_VSR(result3, n, m+12); + } + } + + for (; m < m8; m += 8) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x16(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc4, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc5, n+4, m+4); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc6, n+8, m+4); + SAVE_4x4_ACC(&acc3, n+12, m+0); + SAVE_4x4_ACC(&acc7, n+12, m+4); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x4(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x2(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0, ra1; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + } + + for (; 
n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x2(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc2, n+8, m+0); + SAVE_4x2_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x2(k, m); + LOAD_B_2x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_2x2_VSR(result, n, m); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x2_VSR(result, n, m); + } + } + + for (; m < M; m++) { + for (n = 0; n < n16; n += 16) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x16(k, n); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + + SAVE_4x1_VSR(result, n+0, m); + 
SAVE_4x1_VSR(result1, n+4, m); + SAVE_4x1_VSR(result2, n+8, m); + SAVE_4x1_VSR(result3, n+12, m); + } + + for (; n < n8; n += 8) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x8(k, n); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + } + + for (; n < n4; n += 4) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x4(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n+0, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_2x1_VSR(result, n+0, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0f; + + for (k = 0; k < K; k++) { + result += A[k*lda+m] * B[k*ldb+n]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free (packA); + + return 0; +} diff --git a/kernel/power/sgemm_small_kernel_tn_power10.c b/kernel/power/sgemm_small_kernel_tn_power10.c new file mode 100644 index 000000000..64ecddbba --- /dev/null +++ b/kernel/power/sgemm_small_kernel_tn_power10.c @@ -0,0 +1,1678 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !defined(B0) +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ + rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ + C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ + C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + 
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; \ + C[(N+2)*ldc+M] = result[2]; \ + C[(N+3)*ldc+M] = result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_AT_16x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); \ + ra4 = vec_xl(0, A+(M+4)*lda+K); \ + ra5 = vec_xl(0, A+(M+5)*lda+K); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra6 = vec_xl(0, A+(M+6)*lda+K); \ + ra7 = vec_xl(0, A+(M+7)*lda+K); \ + t2 = vec_mergeh(ra6, ra7); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = vec_xxpermdi(t0, t2, 0b00); \ + ra5 = vec_xxpermdi(t0, t2, 0b11); \ + ra6 = vec_xxpermdi(t1, t3, 0b00); \ + ra7 = vec_xxpermdi(t1, t3, 0b11); \ + ra8 = vec_xl(0, A+(M+8)*lda+K); \ + ra9 = vec_xl(0, A+(M+9)*lda+K); \ + t0 = vec_mergeh(ra8, ra9); \ + t1 = vec_mergel(ra8, ra9); \ + ra10 = vec_xl(0, A+(M+10)*lda+K); \ + ra11 = vec_xl(0, A+(M+11)*lda+K); \ + t2 = vec_mergeh(ra10, ra11); \ + t3 = vec_mergel(ra10, ra11); \ + ra8 = vec_xxpermdi(t0, t2, 0b00); \ + ra9 = vec_xxpermdi(t0, t2, 0b11); \ + 
ra10 = vec_xxpermdi(t1, t3, 0b00); \ + ra11 = vec_xxpermdi(t1, t3, 0b11); \ + ra12 = vec_xl(0, A+(M+12)*lda+K); \ + ra13 = vec_xl(0, A+(M+13)*lda+K); \ + t0 = vec_mergeh(ra12, ra13); \ + t1 = vec_mergel(ra12, ra13); \ + ra14 = vec_xl(0, A+(M+14)*lda+K); \ + ra15 = vec_xl(0, A+(M+15)*lda+K); \ + t2 = vec_mergeh(ra14, ra15); \ + t3 = vec_mergel(ra14, ra15); \ + ra12 = vec_xxpermdi(t0, t2, 0b00); \ + ra13 = vec_xxpermdi(t0, t2, 0b11); \ + ra14 = vec_xxpermdi(t1, t3, 0b00); \ + ra15 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_16x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); \ + ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \ + ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \ + t0 = vec_mergeh(ra4, ra5); \ + ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \ + ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \ + t1 = vec_mergeh(ra6, ra7); \ + ra2 = vec_xxpermdi(t0, t1, 0b00); \ + ra3 = vec_xxpermdi(t0, t1, 0b11); \ + ra8 = vec_xl_len(A+(M+8)*lda+K, 8); \ + ra9 = vec_xl_len(A+(M+9)*lda+K, 8); \ + t0 = vec_mergeh(ra8, ra9); \ + ra10 = vec_xl_len(A+(M+10)*lda+K, 8); \ + ra11 = vec_xl_len(A+(M+11)*lda+K, 8); \ + t1 = vec_mergeh(ra10, ra11); \ + ra4 = vec_xxpermdi(t0, t1, 0b00); \ + ra5 = vec_xxpermdi(t0, t1, 0b11); \ + ra12 = vec_xl_len(A+(M+12)*lda+K, 8); \ + ra13 = vec_xl_len(A+(M+13)*lda+K, 8); \ + t0 = vec_mergeh(ra12, ra13); \ + ra14 = vec_xl_len(A+(M+14)*lda+K, 8); \ + ra15 = vec_xl_len(A+(M+15)*lda+K, 8); \ + t1 = vec_mergeh(ra14, ra15); \ + ra6 = vec_xxpermdi(t0, t1, 0b00); \ + ra7 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_16x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \ + ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \ + ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); \ + ra2 = vec_xor(ra2, ra2); \ + ra2 = vec_insert(A[(M+8)*lda+K], ra2, 0); \ + ra2 = vec_insert(A[(M+9)*lda+K], ra2, 1); \ + ra2 = vec_insert(A[(M+10)*lda+K], ra2, 2); \ + ra2 = vec_insert(A[(M+11)*lda+K], ra2, 3); \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(A[(M+12)*lda+K], ra3, 0); \ + ra3 = vec_insert(A[(M+13)*lda+K], ra3, 1); \ + ra3 = vec_insert(A[(M+14)*lda+K], ra3, 2); \ + ra3 = vec_insert(A[(M+15)*lda+K], ra3, 3); + +#define LOAD_AT_8x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); \ + ra4 = vec_xl(0, A+(M+4)*lda+K); \ + ra5 = vec_xl(0, A+(M+5)*lda+K); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra6 = vec_xl(0, A+(M+6)*lda+K); \ + ra7 = vec_xl(0, A+(M+7)*lda+K); \ + t2 = vec_mergeh(ra6, ra7); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = vec_xxpermdi(t0, t2, 0b00); \ + ra5 = vec_xxpermdi(t0, t2, 0b11); \ + ra6 = vec_xxpermdi(t1, t3, 0b00); \ + ra7 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_8x2(M, K) \ + ra0 = 
vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); \ + ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \ + ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \ + t0 = vec_mergeh(ra4, ra5); \ + ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \ + ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \ + t1 = vec_mergeh(ra6, ra7); \ + ra2 = vec_xxpermdi(t0, t1, 0b00); \ + ra3 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_8x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \ + ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \ + ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); + +#define LOAD_AT_4x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_4x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_4x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); + +#define LOAD_AT_2x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeo(ra0, ra1); \ + t2 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; \ + ra2 = t2; \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(vec_extract(t2, 2), ra3, 0); \ + ra3 = vec_insert(vec_extract(t2, 3), ra3, 1); + +#define LOAD_AT_2x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergee(ra0, ra1); \ + t1 = vec_mergeo(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; + +#define LOAD_AT_2x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_2x2(M, K) \ + ra0 = vec_splats(A[(M+0)*lda+K]); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 3); + +#define LOAD_A_2x1(M, K) \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]); + +#define LOAD_BT_16x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 
0b11); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergel(rb4, rb5); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K); \ + t2 = vec_mergeh(rb6, rb7); \ + t3 = vec_mergel(rb6, rb7); \ + rb4 = vec_xxpermdi(t0, t2, 0b00); \ + rb5 = vec_xxpermdi(t0, t2, 0b11); \ + rb6 = vec_xxpermdi(t1, t3, 0b00); \ + rb7 = vec_xxpermdi(t1, t3, 0b11); \ + rb8 = vec_xl(0, B+(N+8)*ldb+K); \ + rb9 = vec_xl(0, B+(N+9)*ldb+K); \ + t0 = vec_mergeh(rb8, rb9); \ + t1 = vec_mergel(rb8, rb9); \ + rb10 = vec_xl(0, B+(N+10)*ldb+K); \ + rb11 = vec_xl(0, B+(N+11)*ldb+K); \ + t2 = vec_mergeh(rb10, rb11); \ + t3 = vec_mergel(rb10, rb11); \ + rb8 = vec_xxpermdi(t0, t2, 0b00); \ + rb9 = vec_xxpermdi(t0, t2, 0b11); \ + rb10 = vec_xxpermdi(t1, t3, 0b00); \ + rb11 = vec_xxpermdi(t1, t3, 0b11); \ + rb12 = vec_xl(0, B+(N+12)*ldb+K); \ + rb13 = vec_xl(0, B+(N+13)*ldb+K); \ + t0 = vec_mergeh(rb12, rb13); \ + t1 = vec_mergel(rb12, rb13); \ + rb14 = vec_xl(0, B+(N+14)*ldb+K); \ + rb15 = vec_xl(0, B+(N+15)*ldb+K); \ + t2 = vec_mergeh(rb14, rb15); \ + t3 = vec_mergel(rb14, rb15); \ + rb12 = vec_xxpermdi(t0, t2, 0b00); \ + rb13 = vec_xxpermdi(t0, t2, 0b11); \ + rb14 = vec_xxpermdi(t1, t3, 0b00); \ + rb15 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_16x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); \ + rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \ + rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \ + rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \ + rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergeh(rb6, rb7); \ + rb2 = vec_xxpermdi(t0, t1, 0b00); \ + rb3 = vec_xxpermdi(t0, t1, 0b11); \ + rb8 = vec_xl_len(B+(N+8)*ldb+K, 8); \ + rb9 = vec_xl_len(B+(N+9)*ldb+K, 8); \ + rb10 = vec_xl_len(B+(N+10)*ldb+K, 8); \ + rb11 = vec_xl_len(B+(N+11)*ldb+K, 8); \ + t0 = vec_mergeh(rb8, rb9); \ + t1 = vec_mergeh(rb10, rb11); \ + rb4 = vec_xxpermdi(t0, t1, 0b00); \ + rb5 = vec_xxpermdi(t0, t1, 0b11); \ + rb12 = vec_xl_len(B+(N+12)*ldb+K, 8); \ + rb13 = vec_xl_len(B+(N+13)*ldb+K, 8); \ + rb14 = vec_xl_len(B+(N+14)*ldb+K, 8); \ + rb15 = vec_xl_len(B+(N+15)*ldb+K, 8); \ + t0 = vec_mergeh(rb12, rb13); \ + t1 = vec_mergeh(rb14, rb15); \ + rb6 = vec_xxpermdi(t0, t1, 0b00); \ + rb7 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_16x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); \ + rb2 = vec_xor(rb2, rb2); \ + rb2 = vec_insert(B[(N+8)*ldb+K], rb2, 0); \ + rb2 = vec_insert(B[(N+9)*ldb+K], rb2, 1); \ + rb2 = vec_insert(B[(N+10)*ldb+K], rb2, 2); \ + rb2 = vec_insert(B[(N+11)*ldb+K], rb2, 3); \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(B[(N+12)*ldb+K], rb3, 0); \ + rb3 = vec_insert(B[(N+13)*ldb+K], rb3, 1); \ + rb3 = vec_insert(B[(N+14)*ldb+K], rb3, 2); \ + rb3 = vec_insert(B[(N+15)*ldb+K], rb3, 3); + +#define LOAD_BT_8x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, 
rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergel(rb4, rb5); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K); \ + t2 = vec_mergeh(rb6, rb7); \ + t3 = vec_mergel(rb6, rb7); \ + rb4 = vec_xxpermdi(t0, t2, 0b00); \ + rb5 = vec_xxpermdi(t0, t2, 0b11); \ + rb6 = vec_xxpermdi(t1, t3, 0b00); \ + rb7 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_8x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); \ + rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \ + rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \ + t0 = vec_mergeh(rb4, rb5); \ + rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \ + rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \ + t1 = vec_mergeh(rb6, rb7); \ + rb2 = vec_xxpermdi(t0, t1, 0b00); \ + rb3 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_8x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); + +#define LOAD_BT_4x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_4x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_4x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); + +#define LOAD_BT_2x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeo(rb0, rb1); \ + t2 = vec_mergel(rb0, rb1); \ + rb0 = t0; \ + rb1 = t1; \ + rb2 = t2; \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(vec_extract(t2,2), rb3, 0); \ + rb3 = vec_insert(vec_extract(t2,3), rb3, 1); + +#define LOAD_BT_2x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergee(rb0, rb1); \ + t1 = vec_mergeo(rb0, rb1); \ + rb0 = t0; \ + rb1 = t1; + +#define LOAD_BT_2x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); + 
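/* Aside (not part of the patch): the LOAD_AT_4x4/LOAD_BT_4x4 macros above use
 * the standard VSX 4x4 float transpose idiom: vec_mergeh/vec_mergel interleave
 * two pairs of rows, then vec_xxpermdi pairs up the 64-bit halves so the four
 * column vectors land back in the row registers.  A minimal standalone sketch
 * of the same idiom (assumes <altivec.h> and a VSX target; the function name
 * is illustrative, not part of this patch):
 */
static inline void transpose_f32_4x4(vector float *r0, vector float *r1,
                                     vector float *r2, vector float *r3)
{
  vector float t0 = vec_mergeh(*r0, *r1); /* {r0[0], r1[0], r0[1], r1[1]} */
  vector float t1 = vec_mergel(*r0, *r1); /* {r0[2], r1[2], r0[3], r1[3]} */
  vector float t2 = vec_mergeh(*r2, *r3); /* {r2[0], r3[0], r2[1], r3[1]} */
  vector float t3 = vec_mergel(*r2, *r3); /* {r2[2], r3[2], r2[3], r3[3]} */
  *r0 = vec_xxpermdi(t0, t2, 0b00);       /* column 0 of the 4x4 tile */
  *r1 = vec_xxpermdi(t0, t2, 0b11);       /* column 1 */
  *r2 = vec_xxpermdi(t1, t3, 0b00);       /* column 2 */
  *r3 = vec_xxpermdi(t1, t3, 0b11);       /* column 3 */
}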
+#define LOAD_B_2x2(N, K) \ + rb0 = vec_splats(B[(N+0)*ldb+K]); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 3); + +#define LOAD_B_2x1(N, K) \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); + +#define LOAD_B_1x1(N, K) rb0 = vec_splats(B[(N)*ldb+K]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ + __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ + __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ + __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ + __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_A(ra0, ra1, ra2, ra3, offset) \ + vec_xst(ra0, 0, packA+(k*16)+0+offset); \ + vec_xst(ra1, 0, packA+(k*16)+4+offset); \ + vec_xst(ra2, 0, packA+(k*16)+8+offset); \ + vec_xst(ra3, 0, packA+(k*16)+12+offset); + +#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ + ra0 = vec_xl(0, packA+(k*16)+0+offset); \ + ra1 = vec_xl(0, packA+(k*16)+4+offset); \ + ra2 = vec_xl(0, packA+(k*16)+8+offset); \ + ra3 = vec_xl(0, packA+(k*16)+12+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k4 = K & ~3; + BLASLONG k2 = K & ~1; + + vector float valpha = vec_splats(alpha); +#if !defined(B0) + vector float vbeta = vec_splats(beta); +#endif + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 
1 : 0; +#else + int has_packing = 0; +#endif + + float *packA; + if (has_packing) packA = (float *)malloc(K*16*sizeof(float)); + + for (m = 0; m < m16; m += 16) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + if (has_packing) { + if (n == 0) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb0, rb0, rb4, rb4, + ra0, ra4, ra0, ra4, ra8, ra12, ra8, ra12); + PACK_A(ra0, ra4, ra8, ra12, 0); + KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb1, rb1, rb5, rb5, + ra1, ra5, ra1, ra5, ra9, ra13, ra9, ra13); + PACK_A(ra1, ra5, ra9, ra13, 16); + KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb2, rb2, rb6, rb6, + ra2, ra6, ra2, ra6, ra10, ra14, ra10, ra14); + PACK_A(ra2, ra6, ra10, ra14, 32); + KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb3, rb3, rb7, rb7, + ra3, ra7, ra3, ra7, ra11, ra15, ra11, ra15); + PACK_A(ra3, ra7, ra11, ra15, 48); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb0, rb0, rb2, rb2, + ra0, ra2, ra0, ra2, ra4, ra6, ra4, ra6); + PACK_A(ra0, ra2, ra4, ra6, 0); + KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb1, rb1, rb3, rb3, + ra1, ra3, ra1, ra3, ra5, ra7, ra5, ra7); + PACK_A(ra1, ra3, ra5, ra7, 16); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb0, rb0, rb1, rb1, + ra0, ra1, ra0, ra1, ra2, ra3, ra2, ra3); + PACK_A(ra0, ra1, ra2, ra3, 0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_BT_8x4(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb0, rb0, rb4, rb4, + ra0, ra4, ra0, ra4, ra8, ra12, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb1, rb1, rb5, rb5, + ra1, ra5, ra1, ra5, ra9, ra13, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb2, rb2, rb6, rb6, + ra2, ra6, ra2, ra6, ra10, ra14, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb3, rb3, rb7, rb7, + ra3, ra7, ra3, ra7, ra11, ra15, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BT_8x2(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb0, rb0, rb2, rb2, + ra0, ra2, ra0, ra2, ra4, ra6, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb1, rb1, rb3, rb3, + ra1, ra3, ra1, ra3, ra5, ra7, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BT_8x1(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb0, rb0, rb1, rb1, + ra0, ra1, ra0, ra1, ra2, ra3, ra2, ra3); + } + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb0, rb0, rb4, rb4, + ra0, ra4, ra0, ra4, ra8, ra12, ra8, ra12); + KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb1, rb1, rb5, rb5, + ra1, ra5, ra1, ra5, ra9, ra13, ra9, ra13); + KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb2, rb2, rb6, rb6, + ra2, ra6, ra2, ra6, ra10, ra14, ra10, ra14); + KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb3, rb3, rb7, rb7, + ra3, ra7, ra3, ra7, ra11, ra15, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb0, rb0, rb2, rb2, + ra0, ra2, ra0, ra2, 
ra4, ra6, ra4, ra6); + KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb1, rb1, rb3, rb3, + ra1, ra3, ra1, ra3, ra5, ra7, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb0, rb0, rb1, rb1, + ra0, ra1, ra0, ra1, ra2, ra3, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc4, n+0, m+8); + SAVE_4x4_ACC(&acc5, n+0, m+12); + SAVE_4x4_ACC(&acc2, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + SAVE_4x4_ACC(&acc6, n+4, m+8); + SAVE_4x4_ACC(&acc7, n+4, m+12); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_BT_4x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_BT_4x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_BT_4x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BT_4x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BT_4x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_BT_2x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_BT_2x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + 
LOAD_AT_16x1(m, k); + LOAD_BT_2x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_BT_2x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BT_2x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BT_2x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n+0, m+0); + SAVE_2x4_ACC(&acc1, n+0, m+4); + SAVE_2x4_ACC(&acc2, n+0, m+8); + SAVE_2x4_ACC(&acc3, n+0, m+12); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0); + LOAD_B_1x1(n, k+2); + KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0); + LOAD_B_1x1(n, k+3); + KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x1(n, k+2); + KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x1(n, k+3); + KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+4); + SAVE_1x4_VSR(result2, n, m+8); + 
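/* Aside (not part of the patch): each KERNEL_MMA_* statement above is one
 * rank-1 MMA update: __builtin_mma_xvf32gerpp(&acc, b, a) accumulates the
 * outer product of a 4-float slice of B and a 4-float slice of A into a 4x4
 * f32 accumulator tile, acc[i][j] += b[i] * a[j].  A minimal sketch of the
 * per-tile flow under that reading (illustrative helper, not part of the
 * patch; assumes GCC >= 10 with -mcpu=power10, and that a/b point at
 * contiguous 4-element slices per k step):
 */
static inline void mma_4x4_dot(float *a, float *b, BLASLONG K,
                               vector float out[4])
{
  __vector_quad acc;
  __builtin_mma_xxsetaccz(&acc);                   /* zero the 4x4 tile */
  for (BLASLONG k = 0; k < K; k++) {
    vector float ra = vec_xl(0, a + 4*k);          /* 4 A values at step k */
    vector float rb = vec_xl(0, b + 4*k);          /* 4 B values at step k */
    __builtin_mma_xvf32gerpp(&acc, (vec_t)rb, (vec_t)ra);
  }
  __builtin_mma_disassemble_acc((void *)out, &acc); /* out[i] = tile row i */
}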
SAVE_1x4_VSR(result3, n, m+12); + } + } + + for (; m < m8; m += 8) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_BT_16x4(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb8, rb8, rb12, rb12, + ra0, ra4, ra0, ra4, ra0, ra4, ra0, ra4); + KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb9, rb9, rb13, rb13, + ra1, ra5, ra1, ra5, ra1, ra5, ra1, ra5); + KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb10, rb10, rb14, rb14, + ra2, ra6, ra2, ra6, ra2, ra6, ra2, ra6); + KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb11, rb11, rb15, rb15, + ra3, ra7, ra3, ra7, ra3, ra7, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BT_16x2(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb4, rb4, rb6, rb6, + ra0, ra2, ra0, ra2, ra0, ra2, ra0, ra2); + KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb5, rb5, rb7, rb7, + ra1, ra3, ra1, ra3, ra1, ra3, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BT_16x1(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb2, rb2, rb3, rb3, + ra0, ra1, ra0, ra1, ra0, ra1, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + SAVE_4x4_ACC(&acc4, n+8, m+0); + SAVE_4x4_ACC(&acc5, n+8, m+4); + SAVE_4x4_ACC(&acc6, n+12, m+0); + SAVE_4x4_ACC(&acc7, n+12, m+4); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb4, rb4, ra0, ra4, ra0, ra4); + KERNEL_MMA_4ACC(rb1, rb1, rb5, rb5, ra1, ra5, ra1, ra5); + KERNEL_MMA_4ACC(rb2, rb2, rb6, rb6, ra2, ra6, ra2, ra6); + KERNEL_MMA_4ACC(rb3, rb3, rb7, rb7, ra3, ra7, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb2, rb2, ra0, ra2, ra0, ra2); + KERNEL_MMA_4ACC(rb1, rb1, rb3, rb3, ra1, ra3, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb1, rb1, ra0, ra1, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_BT_4x4(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4); + KERNEL_MMA_2ACC(rb1, rb1, ra1, ra5); + KERNEL_MMA_2ACC(rb2, rb2, ra2, ra6); + KERNEL_MMA_2ACC(rb3, rb3, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BT_4x2(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2); + KERNEL_MMA_2ACC(rb1, rb1, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + 
register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_BT_2x4(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4); + KERNEL_MMA_2ACC(rb1, rb1, ra1, ra5); + KERNEL_MMA_2ACC(rb2, rb2, ra2, ra6); + KERNEL_MMA_2ACC(rb3, rb3, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BT_2x2(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2); + KERNEL_MMA_2ACC(rb1, rb1, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BT_2x1(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra4, rb0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_2VSR(ra1, ra5, rb0, rb0); + LOAD_B_1x1(n, k+2); + KERNEL_VMADD_2VSR(ra2, ra6, rb0, rb0); + LOAD_B_1x1(n, k+3); + KERNEL_VMADD_2VSR(ra3, ra7, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra2, rb0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_2VSR(ra1, ra3, rb0, rb0); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_BT_16x4(n, k); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra1, ra1, ra1, ra1); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra2, ra2, ra2, ra2); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra3, ra3, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BT_16x2(n, k); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra1, ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BT_16x1(n, k); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0); + KERNEL_MMA_2ACC(rb1, rb5, ra1, ra1); 
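/* Aside (not part of the patch): the SAVE_*_ACC macros above realize
 * C = alpha*(A*B) + beta*C four contiguous C elements at a time, after
 * __builtin_mma_disassemble_acc has split the accumulator into plain
 * vectors.  A sketch of the beta != 0 path for one such group (save_group
 * is an illustrative name; cptr points at 4 consecutive elements of C):
 */
static inline void save_group(vector float acc_row, float *cptr,
                              vector float valpha, vector float vbeta)
{
  vector float rc = vec_xl(0, cptr);    /* load 4 elements of C    */
  rc = vec_mul(rc, vbeta);              /* beta * C                */
  rc = vec_madd(acc_row, valpha, rc);   /* alpha * acc + beta * C  */
  vec_xst(rc, 0, cptr);                 /* store the updated group */
}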
+ KERNEL_MMA_2ACC(rb2, rb6, ra2, ra2); + KERNEL_MMA_2ACC(rb3, rb7, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0); + KERNEL_MMA_2ACC(rb1, rb3, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_BT_4x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + KERNEL_MMA_1ACC(rb2, ra2); + KERNEL_MMA_1ACC(rb3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BT_4x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_BT_2x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + KERNEL_MMA_1ACC(rb2, ra2); + KERNEL_MMA_1ACC(rb3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BT_2x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BT_2x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_1VSR(ra1, rb0); + LOAD_B_1x1(n, k+2); + KERNEL_VMADD_1VSR(ra2, rb0); + LOAD_B_1x1(n, k+3); + KERNEL_VMADD_1VSR(ra3, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_1VSR(ra1, rb0); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0); + KERNEL_MMA_2ACC(rb1, rb5, ra1, ra1); + KERNEL_MMA_2ACC(rb2, rb6, ra2, ra2); + KERNEL_MMA_2ACC(rb3, rb7, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0); + KERNEL_MMA_2ACC(rb1, rb3, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if 
!defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_BT_4x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + KERNEL_MMA_1ACC(rb2, ra2); + KERNEL_MMA_1ACC(rb3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BT_4x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x2(m, k); + LOAD_B_2x2(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_2x2_VSR(result, n, m); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0 = ((vector float){0.,0.,0.,0.}); + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x2_VSR(result, n, m); + } + } + + for (; m < M; m++) { + for (n = 0; n < n8; n += 8) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(m, k); + LOAD_BT_8x4(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb4); + LOAD_A_1x1(m, k+1); + KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb5); + LOAD_A_1x1(m, k+2); + KERNEL_VMADD_2VSR(ra0, ra0, rb2, rb6); + LOAD_A_1x1(m, k+3); + KERNEL_VMADD_2VSR(ra0, ra0, rb3, rb7); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(m, k); + LOAD_BT_8x2(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb2); + LOAD_A_1x1(m, k+1); + KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb3); + } + for (; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+4, m); + } + + for (; n < n4; n += 4) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(m, k); + LOAD_BT_4x4(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_A_1x1(m, k+1); + KERNEL_VMADD_1VSR(ra0, rb1); + LOAD_A_1x1(m, k+2); + KERNEL_VMADD_1VSR(ra0, rb2); + LOAD_A_1x1(m, k+3); + KERNEL_VMADD_1VSR(ra0, rb3); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(m, k); + LOAD_BT_4x2(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_A_1x1(m, k+1); + KERNEL_VMADD_1VSR(ra0, rb1); + } + for (; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + 
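/* Aside (not part of the patch): the m16/m8/m4/m2 (and n16/n8/n4/n2) bounds
 * computed at the top of CNAME drive this cascade of remainder loops:
 * M & ~15 rounds M down to a multiple of 16, and each narrower tail loop
 * picks up whatever the wider panels left behind.  Worked example with an
 * illustrative value of M:
 */
BLASLONG M  = 37;        /* illustrative value              */
BLASLONG m16 = M & ~15;  /* 32: two full 16-row panels      */
BLASLONG m8  = M & ~7;   /* 32: no separate 8-row panel     */
BLASLONG m4  = M & ~3;   /* 36: one 4-row panel at m = 32   */
BLASLONG m2  = M & ~1;   /* 36: no 2-row panel              */
                         /* row 36 is left to the scalar tail */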
LOAD_B_2x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_2x1_VSR(result, n, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0f; + + for (k = 0; k < K; k++) { + result += A[m*lda+k] * B[n*ldb+k]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free (packA); + + return 0; +} diff --git a/kernel/power/sgemm_small_kernel_tt_power10.c b/kernel/power/sgemm_small_kernel_tt_power10.c new file mode 100644 index 000000000..71bc7b937 --- /dev/null +++ b/kernel/power/sgemm_small_kernel_tt_power10.c @@ -0,0 +1,1559 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <altivec.h> + +typedef __vector unsigned char vec_t; + +#if !defined(B0) +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ + rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ + C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ + C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ +
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; \ + C[(N+2)*ldc+M] = result[2]; \ + C[(N+3)*ldc+M] = result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_AT_16x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); \ + ra4 = vec_xl(0, A+(M+4)*lda+K); \ + ra5 = vec_xl(0, A+(M+5)*lda+K); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra6 = vec_xl(0, A+(M+6)*lda+K); \ + ra7 = vec_xl(0, A+(M+7)*lda+K); \ + t2 = vec_mergeh(ra6, ra7); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = vec_xxpermdi(t0, t2, 0b00); \ + ra5 = vec_xxpermdi(t0, t2, 0b11); \ + ra6 = vec_xxpermdi(t1, t3, 0b00); \ + ra7 = vec_xxpermdi(t1, t3, 0b11); \ + ra8 = vec_xl(0, A+(M+8)*lda+K); \ + ra9 = vec_xl(0, A+(M+9)*lda+K); \ + t0 = vec_mergeh(ra8, ra9); \ + t1 = vec_mergel(ra8, ra9); \ + ra10 = vec_xl(0, A+(M+10)*lda+K); \ + ra11 = vec_xl(0, A+(M+11)*lda+K); \ + t2 = vec_mergeh(ra10, ra11); \ + t3 = vec_mergel(ra10, ra11); \ + ra8 = vec_xxpermdi(t0, t2, 0b00); \ + ra9 = vec_xxpermdi(t0, t2, 0b11); \ + 
ra10 = vec_xxpermdi(t1, t3, 0b00); \ + ra11 = vec_xxpermdi(t1, t3, 0b11); \ + ra12 = vec_xl(0, A+(M+12)*lda+K); \ + ra13 = vec_xl(0, A+(M+13)*lda+K); \ + t0 = vec_mergeh(ra12, ra13); \ + t1 = vec_mergel(ra12, ra13); \ + ra14 = vec_xl(0, A+(M+14)*lda+K); \ + ra15 = vec_xl(0, A+(M+15)*lda+K); \ + t2 = vec_mergeh(ra14, ra15); \ + t3 = vec_mergel(ra14, ra15); \ + ra12 = vec_xxpermdi(t0, t2, 0b00); \ + ra13 = vec_xxpermdi(t0, t2, 0b11); \ + ra14 = vec_xxpermdi(t1, t3, 0b00); \ + ra15 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_16x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); \ + ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \ + ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \ + t0 = vec_mergeh(ra4, ra5); \ + ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \ + ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \ + t1 = vec_mergeh(ra6, ra7); \ + ra2 = vec_xxpermdi(t0, t1, 0b00); \ + ra3 = vec_xxpermdi(t0, t1, 0b11); \ + ra8 = vec_xl_len(A+(M+8)*lda+K, 8); \ + ra9 = vec_xl_len(A+(M+9)*lda+K, 8); \ + t0 = vec_mergeh(ra8, ra9); \ + ra10 = vec_xl_len(A+(M+10)*lda+K, 8); \ + ra11 = vec_xl_len(A+(M+11)*lda+K, 8); \ + t1 = vec_mergeh(ra10, ra11); \ + ra4 = vec_xxpermdi(t0, t1, 0b00); \ + ra5 = vec_xxpermdi(t0, t1, 0b11); \ + ra12 = vec_xl_len(A+(M+12)*lda+K, 8); \ + ra13 = vec_xl_len(A+(M+13)*lda+K, 8); \ + t0 = vec_mergeh(ra12, ra13); \ + ra14 = vec_xl_len(A+(M+14)*lda+K, 8); \ + ra15 = vec_xl_len(A+(M+15)*lda+K, 8); \ + t1 = vec_mergeh(ra14, ra15); \ + ra6 = vec_xxpermdi(t0, t1, 0b00); \ + ra7 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_16x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \ + ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \ + ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); \ + ra2 = vec_xor(ra2, ra2); \ + ra2 = vec_insert(A[(M+8)*lda+K], ra2, 0); \ + ra2 = vec_insert(A[(M+9)*lda+K], ra2, 1); \ + ra2 = vec_insert(A[(M+10)*lda+K], ra2, 2); \ + ra2 = vec_insert(A[(M+11)*lda+K], ra2, 3); \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(A[(M+12)*lda+K], ra3, 0); \ + ra3 = vec_insert(A[(M+13)*lda+K], ra3, 1); \ + ra3 = vec_insert(A[(M+14)*lda+K], ra3, 2); \ + ra3 = vec_insert(A[(M+15)*lda+K], ra3, 3); + +#define LOAD_AT_8x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); \ + ra4 = vec_xl(0, A+(M+4)*lda+K); \ + ra5 = vec_xl(0, A+(M+5)*lda+K); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra6 = vec_xl(0, A+(M+6)*lda+K); \ + ra7 = vec_xl(0, A+(M+7)*lda+K); \ + t2 = vec_mergeh(ra6, ra7); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = vec_xxpermdi(t0, t2, 0b00); \ + ra5 = vec_xxpermdi(t0, t2, 0b11); \ + ra6 = vec_xxpermdi(t1, t3, 0b00); \ + ra7 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_8x2(M, K) \ + ra0 = 
vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); \ + ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \ + ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \ + t0 = vec_mergeh(ra4, ra5); \ + ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \ + ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \ + t1 = vec_mergeh(ra6, ra7); \ + ra2 = vec_xxpermdi(t0, t1, 0b00); \ + ra3 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_8x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \ + ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \ + ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); + +#define LOAD_AT_4x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_4x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_4x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); + +#define LOAD_AT_2x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeo(ra0, ra1); \ + t2 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; \ + ra2 = t2; \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(vec_extract(t2,2), ra3, 0); \ + ra3 = vec_insert(vec_extract(t2,3), ra3, 1); + +#define LOAD_AT_2x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergee(ra0, ra1); \ + t1 = vec_mergeo(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; + +#define LOAD_AT_2x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_2x2(M, K) \ + ra0 = vec_splats(A[M*lda+K]); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 3); + +#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[M*lda+K]); + +#define LOAD_B_1x16(K, N) \ + rb0 = vec_xl(0, B+((K)*ldb)+N+0); \ + rb1 = vec_xl(0, B+((K)*ldb)+N+4); \ + rb2 = vec_xl(0, B+((K)*ldb)+N+8); \ + rb3 = vec_xl(0, B+((K)*ldb)+N+12); + +#define LOAD_B_1x8(K, N) \ + rb0 = vec_xl(0, B+((K)*ldb)+N+0); \ + rb1 = vec_xl(0, B+((K)*ldb)+N+4); + +#define LOAD_B_1x4(K, N) rb0 = vec_xl(0, B+((K)*ldb)+N); + +#define LOAD_B_1x2(K, N) rb0 = vec_xl_len(B+((K)*ldb)+N, 8); + +#define LOAD_B_2x2(K, N) \ + rb0 = vec_splats(B[K*ldb+N]); \ + rb0 = vec_insert(B[K*ldb+N+1], rb0, 2); \ + rb0 = vec_insert(B[K*ldb+N+1], rb0, 3); + +#define 
LOAD_B_1x1(K, N) rb0 = vec_splats(B[(K)*ldb+N]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ + __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ + __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ + __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ + __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_A(ra0, ra1, ra2, ra3, offset) \ + vec_xst(ra0, 0, packA+(k*16)+0+offset); \ + vec_xst(ra1, 0, packA+(k*16)+4+offset); \ + vec_xst(ra2, 0, packA+(k*16)+8+offset); \ + vec_xst(ra3, 0, packA+(k*16)+12+offset); + +#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ + ra0 = vec_xl(0, packA+(k*16)+0+offset); \ + ra1 = vec_xl(0, packA+(k*16)+4+offset); \ + ra2 = vec_xl(0, packA+(k*16)+8+offset); \ + ra3 = vec_xl(0, packA+(k*16)+12+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k4 = K & ~3; + BLASLONG k2 = K & ~1; + + vector float valpha = vec_splats(alpha); +#if !defined(B0) + vector float vbeta = vec_splats(beta); +#endif + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 
1 : 0; +#else + int has_packing = 0; +#endif + + float *packA; + if (has_packing) packA = (float *)malloc(K*16*sizeof(float)); + + for (m = 0; m < m16; m += 16) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0, rb1; + register vector float t0, t1, t2, t3; + + if (has_packing) { + if (n == 0) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra4, ra8, ra12, ra0, ra4, ra8, ra12); + PACK_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra5, ra9, ra13, ra1, ra5, ra9, ra13); + PACK_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra2, ra6, ra10, ra14, ra2, ra6, ra10, ra14); + PACK_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra3, ra7, ra11, ra15, ra3, ra7, ra11, ra15); + PACK_A(ra3, ra7, ra11, ra15, 48); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); + PACK_A(ra0, ra2, ra4, ra6, 0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); + PACK_A(ra1, ra3, ra5, ra7, 16); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra1, ra2, ra3, ra0, ra1, ra2, ra3); + PACK_A(ra0, ra1, ra2, ra3, 0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra4, ra8, ra12, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra5, ra9, ra13, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra2, ra6, ra10, ra14, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra3, ra7, ra11, ra15, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra1, ra2, ra3, ra0, ra1, ra2, ra3); + } + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra4, ra8, ra12, ra0, ra4, ra8, ra12); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra5, ra9, ra13, ra1, ra5, ra9, ra13); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra2, ra6, ra10, ra14, ra2, ra6, ra10, ra14); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, 
rb1, + ra3, ra7, ra11, ra15, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra1, ra2, ra3, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + SAVE_4x4_ACC(&acc4, n+4, m+0); + SAVE_4x4_ACC(&acc5, n+4, m+4); + SAVE_4x4_ACC(&acc6, n+4, m+8); + SAVE_4x4_ACC(&acc7, n+4, m+12); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra5, ra9, ra13); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra2, ra6, ra10, ra14); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_B_1x2(k+1, n); + 
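+          /* the two B values held in rb0 are reused by all four accumulators;
+             each xvf32gerpp below updates one 4-row slice of the 16-row A tile */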
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra5, ra9, ra13); + LOAD_B_1x2(k+2, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra2, ra6, ra10, ra14); + LOAD_B_1x2(k+3, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x2(k+2, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x2(k+3, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_B_1x2(k, n); + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_B_1x2(k+1, n); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + SAVE_2x4_ACC(&acc2, n, m+8); + SAVE_2x4_ACC(&acc3, n, m+12); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0); + LOAD_B_1x1(k+2, n); + KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0); + LOAD_B_1x1(k+3, n); + KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x1(k+2, n); + KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x1(k+3, n); + KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, 
ra6, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + SAVE_1x4_VSR(result2, n, m+8); + SAVE_1x4_VSR(result3, n, m+12); + } + } + + for (; m < m8; m += 8) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra4, ra4, ra4, ra4); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra1, ra1, ra1, ra1, ra5, ra5, ra5, ra5); + LOAD_B_1x16(k+2, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra2, ra2, ra2, ra2, ra6, ra6, ra6, ra6); + LOAD_B_1x16(k+3, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra3, ra3, ra3, ra3, ra7, ra7, ra7, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra2, ra2, ra2, ra2); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra1, ra1, ra1, ra1, ra3, ra3, ra3, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc4, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc5, n+4, m+4); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc6, n+8, m+4); + SAVE_4x4_ACC(&acc3, n+12, m+0); + SAVE_4x4_ACC(&acc7, n+12, m+4); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra4, ra4); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra1, ra1, ra5, ra5); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra2, ra2, ra6, ra6); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra3, ra3, ra7, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra2, ra2); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra1, ra1, ra3, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register 
vector float rb0; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_2ACC(rb0, rb0, ra1, ra5); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_2ACC(rb0, rb0, ra2, ra6); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_2ACC(rb0, rb0, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_2ACC(rb0, rb0, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_2ACC(rb0, rb0, ra1, ra5); + LOAD_B_1x2(k+2, n); + KERNEL_MMA_2ACC(rb0, rb0, ra2, ra6); + LOAD_B_1x2(k+3, n); + KERNEL_MMA_2ACC(rb0, rb0, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_2ACC(rb0, rb0, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra4, rb0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_2VSR(ra1, ra5, rb0, rb0); + LOAD_B_1x1(k+2, n); + KERNEL_VMADD_2VSR(ra2, ra6, rb0, rb0); + LOAD_B_1x1(k+3, n); + KERNEL_VMADD_2VSR(ra3, ra7, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra2, rb0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_2VSR(ra1, ra3, rb0, rb0); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra1, ra1, ra1, ra1); + LOAD_B_1x16(k+2, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra2, ra2, ra2, ra2); + LOAD_B_1x16(k+3, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra3, ra3, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra1, 
ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_2ACC(rb0, rb1, ra1, ra1); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_2ACC(rb0, rb1, ra2, ra2); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_2ACC(rb0, rb1, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_2ACC(rb0, rb1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_1ACC(rb0, ra2); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_1ACC(rb0, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + LOAD_B_1x2(k+2, n); + KERNEL_MMA_1ACC(rb0, ra2); + LOAD_B_1x2(k+3, n); + KERNEL_MMA_1ACC(rb0, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_1VSR(ra1, rb0); + LOAD_B_1x1(k+2, n); + KERNEL_VMADD_1VSR(ra2, rb0); + LOAD_B_1x1(k+3, n); + KERNEL_VMADD_1VSR(ra3, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_1VSR(ra1, rb0); + } + for (; k < K; k++) 
{ + LOAD_AT_4x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra1, ra1, ra1, ra1); + LOAD_B_1x16(k+2, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra2, ra2, ra2, ra2); + LOAD_B_1x16(k+3, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra3, ra3, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra1, ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc2, n+8, m+0); + SAVE_4x2_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_2ACC(rb0, rb1, ra1, ra1); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_2ACC(rb0, rb1, ra2, ra2); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_2ACC(rb0, rb1, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_2ACC(rb0, rb1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_1ACC(rb0, ra2); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_1ACC(rb0, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x2(m, k); + LOAD_B_2x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_2x2_VSR(result, n, m); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + register vector float ra0; + 
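+      /* 2x1 tail: after LOAD_A_1x1 (defined earlier in this file), the two
+         vec_insert calls below put A[(m+1)*lda+k] into lanes 1 and 3 of ra0,
+         interleaving rows m and m+1 so a single vec_madd accumulates both
+         rows at once; SAVE_1x2_VSR then stores the pair of results. */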
register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + ra0 = vec_insert(A[(m+1)*lda+k], ra0, 1); + ra0 = vec_insert(A[(m+1)*lda+k], ra0, 3); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x2_VSR(result, n, m); + } + } + + for (; m < M; m++) { + for (n = 0; n < n16; n += 16) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x16(k, n); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + SAVE_4x1_VSR(result2, n+8, m); + SAVE_4x1_VSR(result3, n+12, m); + } + + for (; n < n8; n += 8) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + } + + for (; n < n4; n += 4) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_2x1_VSR(result, n, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0f; + + for (k = 0; k < K; k++) { + result += A[m*lda+k] * B[k*ldb+n]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free (packA); + + return 0; +} From 0de36f7b5ceea1c410ed98e62fd4748e9cc9324d Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 29 Nov 2021 21:25:05 +0100 Subject: [PATCH 563/681] trmm sve copy fucntions for single precision --- kernel/arm64/trmm_lncopy_sve_v1.c | 21 ++++++++++++++++++--- kernel/arm64/trmm_ltcopy_sve_v1.c | 15 +++++++++++++++ kernel/arm64/trmm_uncopy_sve_v1.c | 21 ++++++++++++++++++--- kernel/arm64/trmm_utcopy_sve_v1.c | 15 +++++++++++++++ 4 files changed, 66 insertions(+), 6 deletions(-) diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c index fc1b61325..918e945ac 100644 --- a/kernel/arm64/trmm_lncopy_sve_v1.c +++ b/kernel/arm64/trmm_lncopy_sve_v1.c @@ -48,12 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - svint64_t index = svindex_s64(0LL, lda); - - FLOAT *ao; js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif do { X = posX; @@ -68,7 +73,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON do { if (X > posY) { +#ifdef DOUBLE svfloat64_t aj_vec 
= svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif svst1(pn, b, aj_vec); ao ++; b += n_active; @@ -113,9 +122,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; +#ifdef DOUBLE pn = svwhilelt_b64(js, n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif return 0; } diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c index 14c6762d2..b76cc56de 100644 --- a/kernel/arm64/trmm_ltcopy_sve_v1.c +++ b/kernel/arm64/trmm_ltcopy_sve_v1.c @@ -50,8 +50,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; +#ifdef DOUBLE svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif do { X = posX; @@ -72,7 +77,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i ++; } else if (X < posY) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif svst1(pn, b, aj_vec); ao += lda; b += n_active; @@ -112,9 +121,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; +#ifdef DOUBLE pn = svwhilelt_b64(js, n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif return 0; diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c index b8344d474..75fa163ae 100644 --- a/kernel/arm64/trmm_uncopy_sve_v1.c +++ b/kernel/arm64/trmm_uncopy_sve_v1.c @@ -48,12 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - svint64_t index = svindex_s64(0LL, lda); - - FLOAT *ao; js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif do { X = posX; @@ -68,7 +73,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON do { if (X < posY) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif svst1(pn, b, aj_vec); ao ++; b += n_active; @@ -113,9 +122,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; +#ifdef DOUBLE pn = svwhilelt_b64(js, n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif return 0; } diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c index 9be1c0abb..36a03242a 100644 --- a/kernel/arm64/trmm_utcopy_sve_v1.c +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -50,8 +50,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; +#ifdef DOUBLE svbool_t pn = svwhilelt_b64(js, 
n); int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif do { X = posX; @@ -72,7 +77,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i ++; } else if (X > posY) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif svst1(pn, b, aj_vec); ao += lda; b += n_active; @@ -111,9 +120,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; +#ifdef DOUBLE pn = svwhilelt_b64(js, n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif return 0; } From 7b5b93037d681e3540b5ddd8344435ee9cd5ba60 Mon Sep 17 00:00:00 2001 From: kavanabhat Date: Wed, 1 Dec 2021 19:30:40 +0530 Subject: [PATCH 564/681] Fix truncated assembler checks --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 3b55fb104..43a3d7c6c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -15,7 +15,7 @@ ifeq ($(HOSTARCH), amd64) HOSTARCH=x86_64 endif -HAVE_GAS := $(shell as -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null) +HAVE_GAS := $(shell as -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?) # Catch conflicting usage of ARCH in some BSD environments ifeq ($(ARCH), amd64) From 9a45b5123f413dfa7ddc4f57ebbdd0a511e3f96c Mon Sep 17 00:00:00 2001 From: kavanabhat Date: Thu, 2 Dec 2021 13:29:38 +0530 Subject: [PATCH 565/681] Update Makefile.system --- Makefile.system | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 43a3d7c6c..c78500eb6 100644 --- a/Makefile.system +++ b/Makefile.system @@ -15,8 +15,6 @@ ifeq ($(HOSTARCH), amd64) HOSTARCH=x86_64 endif -HAVE_GAS := $(shell as -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?) - # Catch conflicting usage of ARCH in some BSD environments ifeq ($(ARCH), amd64) override ARCH=x86_64 @@ -368,6 +366,8 @@ GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) endif +HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?) 
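+# With `; echo $$?` appended, HAVE_GAS now holds grep's exit status: 0 when
+# $(AS) identifies itself as GNU gas, non-zero otherwise. The previous form
+# sent all output to /dev/null and so always expanded to an empty string.
+# Illustrative use only (the real consumers appear elsewhere in this file):
+#   ifeq ($(HAVE_GAS), 0) ... endif   # GNU assembler detected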
+ # # OS dependent settings # From 1470b7e4deef168ecc25ec02cd69abacb0bd1fe2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 3 Dec 2021 11:41:53 +0100 Subject: [PATCH 566/681] Delete test_zhemv.c --- test_zhemv.c | 85 ---------------------------------------------------- 1 file changed, 85 deletions(-) delete mode 100644 test_zhemv.c diff --git a/test_zhemv.c b/test_zhemv.c deleted file mode 100644 index 6b3df5f7c..000000000 --- a/test_zhemv.c +++ /dev/null @@ -1,85 +0,0 @@ -// reproduce segfault in zhemv() from zsymv_L_sse2.S -// - -#include -#include -#include -#include -#include -#include - -#define CALL_ZHEMV zhemv_ - -void zhemv_(char *UPLO, int *N, double *alpha, double *A, int *LDA, - double *X, int *INCX, double *beta, double *Y, int *INCY); - -int main () { - - // zhemv parameters - char uplo = 'L'; - int n = 14; - int lda = 16; - int incx = 1; - int incy = 1; - double *A, *X, *Y; - double alpha[] = {1, 0}; - double beta[] = {0, 0}; - - // other parameters - int i, j; - double *data, *data_end, *no_access; - double real, imag; - int size; - size_t len; - int A_offset; - - size = sizeof(complex double); - len = lda * lda * size; - - // allocate memory for data - // use mmap address hints to set up inaccessible memory section following data - no_access = mmap(NULL, len, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); - data = mmap(no_access, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); - data_end = data + (lda * lda * 2); - printf("data start/end: %p/%p. Blocked region starts at %p.\n", data, data_end, no_access); - - // set up pointer offsets into data - A_offset = (lda + 1) * 2; - A = data + A_offset * 2; // A starts in the third column of data matrix - X = data + A_offset + 2; // X is the second column of data matrix - Y = (double *)malloc(n * incy * size); // Y is stored elsewhere - printf("Address of data: %p; A: %p; X: %p; Y: %p.\n", data, A, X, Y); - - - // hermitian matrix - srand(lda); - for (j=0; j Date: Sat, 4 Dec 2021 22:24:02 +0100 Subject: [PATCH 567/681] Fix DYNAMIC_ARCH builds with CMAKE on OSX and add corresponding test to Azure CI (#3409) * Use linker response files and a custom link command to get around ARG_MAX limitations on OSX * Reconfigure a redundant job on Azure to test shared library builds with CMAKE and DYNAMIC_ARCH on OSX --- CMakeLists.txt | 16 ++++++++++++++++ azure-pipelines.yml | 10 ++++------ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cdbb8c306..447b32e14 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -244,6 +244,22 @@ if(ANDROID) endif() endif() +if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS) + set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) + if (NOT NOFORTRAN) + set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) + set (CMAKE_Fortran_CREATE_SHARED_LIBRARY + "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " + "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" + "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" + "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") + else () + set (CMAKE_C_CREATE_SHARED_LIBRARY + "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " + "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load 
-Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") + endif () +endif() + # Handle MSVC exports if(MSVC AND BUILD_SHARED_LIBS) if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index c3a07ffe5..56daa9e5a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -165,7 +165,7 @@ jobs: make ctest -- job: OSX_OpenMP_Clang_gf_cmake +- job: OSX_dynarch_cmake pool: vmImage: 'macOS-10.15' variables: @@ -173,14 +173,12 @@ jobs: LIBRARY_PATH: /usr/local/opt/llvm/lib steps: - script: | - brew update - brew install llvm libomp mkdir build cd build - cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNO_AVX512=1 .. - make + cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. + cmake --build . ctest - + - job: OSX_Ifort_Clang pool: vmImage: 'macOS-10.15' From abe1ce3434c3cd6df0c8d00650d1722a31bff784 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 5 Dec 2021 14:03:08 +0100 Subject: [PATCH 568/681] strmm sve v1x8 kernel --- kernel/arm64/strmm_kernel_sve_v1x8.S | 1008 ++++++++++++++++++++++++++ 1 file changed, 1008 insertions(+) create mode 100644 kernel/arm64/strmm_kernel_sve_v1x8.S diff --git a/kernel/arm64/strmm_kernel_sve_v1x8.S b/kernel/arm64/strmm_kernel_sve_v1x8.S new file mode 100644 index 000000000..3c45e3e29 --- /dev/null +++ b/kernel/arm64/strmm_kernel_sve_v1x8.S @@ -0,0 +1,1008 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA x16 +#define alpha w17 +//#define temp x18 +#define tempOffset x19 +#define tempK x20 +#define temp x21 + +#define alpha0 s10 +#define alphaZ z2.s + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x8_I + ld1w z0.s, p1/z, [pA] + ld1w z1.s, p1/z, [pA, lanes, lsl #2] // next one + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M1 + ld1w z1.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, 
z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M2 + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + fmla z16.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_E + fmla z16.s, p1/m, z1.s, z8.s + fmla z17.s, p1/m, z1.s, z9.s + fmla z18.s, p1/m, z1.s, z10.s + fmla z19.s, p1/m, z1.s, z11.s + fmla z20.s, p1/m, z1.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + fmla z22.s, p1/m, z1.s, z14.s + fmla z23.s, p1/m, z1.s, z15.s +.endm + +.macro KERNELv1x8_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + fmla z18.s, p1/m, z0.s, z10.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z19.s, p1/m, z0.s, z11.s + fmla z20.s, p1/m, z0.s, z12.s + fmla z21.s, p1/m, z0.s, z13.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.s, p1/m, z0.s, z14.s + fmla z23.s, p1/m, z0.s, z15.s + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.s, p1/m, z16.s, alphaZ + st1w z16.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z17.s, p1/m, z17.s, alphaZ + st1w z17.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z18.s, p1/m, z18.s, alphaZ + st1w z18.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z19.s, p1/m, z19.s, alphaZ + st1w z19.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z20.s, p1/m, z20.s, alphaZ + st1w z20.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z21.s, p1/m, z21.s, alphaZ + st1w z21.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z22.s, p1/m, z22.s, alphaZ + st1w z22.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z23.s, p1/m, z23.s, alphaZ + st1w z23.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x4_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z18.s, p1/m, z0.s, z10.s + fmla z19.s, p1/m, z0.s, z11.s + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, 
#C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.s, p1/m, z16.s, alphaZ + st1w z16.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z17.s, p1/m, z17.s, alphaZ + st1w z17.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z18.s, p1/m, z18.s, alphaZ + st1w z18.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z19.s, p1/m, z19.s, alphaZ + st1w z19.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z17.s, p1/m, z0.s, z9.s + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.s, p1/m, z16.s, alphaZ + st1w z16.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z17.s, p1/m, z17.s, alphaZ + st1w z17.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.s, #0 +.endm + +.macro KERNELv1x1_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + fmul z16.s, p1/m, z16.s, alphaZ + st1w z16.s, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 + dup alphaZ, alpha + + lsl LDC, LDC, #2 // ldc = ldc * 8 + ptrue p0.s // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lstrmm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Lstrmm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Lstrmm_kernel_L8_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
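+   whilelt builds a predicate covering only the rows still left in M, and
+   cntp counts its active lanes into `lanes`, so no scalar tail loop is needed.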
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lstrmm_kernel_L8_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #8 +#endif + + INITv1x8 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Lstrmm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lstrmm_kernel_L8_Mv1_22a + + .align 5 +.Lstrmm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_Mv1_22 + + .align 5 +.Lstrmm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Lstrmm_kernel_L8_Mv1_44 + + .align 5 +.Lstrmm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Lstrmm_kernel_L8_Mv1_44 + +.Lstrmm_kernel_L8_Mv1_40: + + INITv1x8 + +.Lstrmm_kernel_L8_Mv1_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L8_Mv1_100 + + .align 5 +.Lstrmm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Lstrmm_kernel_L8_Mv1_46 + +.Lstrmm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #8 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lstrmm_kernel_L8_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lstrmm_kernel_L8_Mv1_20 + +.Lstrmm_kernel_L8_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 8 * 4 + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lstrmm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Lstrmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Lstrmm_kernel_L2_BEGIN + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Lstrmm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lstrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Lstrmm_kernel_L4_Mv1_44 + + .align 5 +.Lstrmm_kernel_L4_Mv1_22: + + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_Mv1_22 + +.Lstrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L4_Mv1_100 + + .align 5 +.Lstrmm_kernel_L4_Mv1_46: + + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lstrmm_kernel_L4_Mv1_46 + +.Lstrmm_kernel_L4_Mv1_100: + + SAVEv1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lstrmm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lstrmm_kernel_L4_Mv1_20 + + +.Lstrmm_kernel_L4_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 4 * 4 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Lstrmm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Lstrmm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Lstrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lstrmm_kernel_L2_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + INITv1x2 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
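+// counterL = tempK/8: the main loop below retires eight KERNELv1x2_SUBs per
+// iteration, and any K%8 remainder is picked up by the _Mv1_44 loop.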
+ ble .Lstrmm_kernel_L2_Mv1_44 + + .align 5 +.Lstrmm_kernel_L2_Mv1_22: + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_Mv1_22 + +.Lstrmm_kernel_L2_Mv1_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L2_Mv1_100 + + .align 5 +.Lstrmm_kernel_L2_Mv1_46: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Lstrmm_kernel_L2_Mv1_46 + +.Lstrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + +.Lstrmm_kernel_L2_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lstrmm_kernel_L2_Mv1_20 + + +.Lstrmm_kernel_L2_END: + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Lstrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lstrmm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Lstrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lstrmm_kernel_L1_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempOffset, #2 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + INITv1x1 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
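+// tempK was trimmed above according to the triangular offset (LEFT/TRANSA
+// variants), so the unroll and remainder counts derive from tempK, not origK.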
+ ble .Lstrmm_kernel_L1_Mv1_44 + + .align 5 +.Lstrmm_kernel_L1_Mv1_22: + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_Mv1_22 + +.Lstrmm_kernel_L1_Mv1_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L1_Mv1_100 + + .align 5 +.Lstrmm_kernel_L1_Mv1_46: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_Mv1_46 + +.Lstrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + + +.Lstrmm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lstrmm_kernel_L1_Mv1_20 + + +.Lstrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From 8d11278e28c32dc782f46f48cb439711e9439d5b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 5 Dec 2021 14:38:41 +0100 Subject: [PATCH 569/681] Fix hardcoded library name --- cpp_thread_test/Makefile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile index 81e3470ef..be8313e65 100644 --- a/cpp_thread_test/Makefile +++ b/cpp_thread_test/Makefile @@ -1,13 +1,14 @@ -include ../Makefile.rule +TOPDIR = .. +include $(TOPDIR)/Makefile.system all :: dgemv_tester dgemm_tester dgemv_tester : - $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester + $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester ./dgemv_tester dgemm_tester : dgemv_tester - $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester + $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester ./dgemm_tester clean :: From a1fea1fe2aed7d5169d85b132195c3a80116599f Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 5 Dec 2021 18:47:29 +0100 Subject: [PATCH 570/681] sgemm v2x8 SVE kernel --- kernel/arm64/sgemm_kernel_sve_v2x8.S | 1683 ++++++++++++++++++++++++++ 1 file changed, 1683 insertions(+) create mode 100644 kernel/arm64/sgemm_kernel_sve_v2x8.S diff --git a/kernel/arm64/sgemm_kernel_sve_v2x8.S b/kernel/arm64/sgemm_kernel_sve_v2x8.S new file mode 100644 index 000000000..1cdd8253e --- /dev/null +++ b/kernel/arm64/sgemm_kernel_sve_v2x8.S @@ -0,0 +1,1683 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/* This is an SVE sgemm kernel with size 2*SVE_LEN x 8. +However, the data layout is the same as for the kernel 1*SVE_LEN x 8. +This means that we sweep two panels of packed A when iterating in a loop over K. +With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. 
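+
+As a rough illustration only (this is not the actual code; "VL" here simply
+stands for the SVE vector length in single-precision elements), one step of
+the K loop computes:
+
+    for (int j = 0; j < 8; j++)              // eight broadcast values from B
+        for (int i = 0; i < 2 * VL; i++)     // two vector-wide panels of A
+            acc[i][j] += A[k][i] * B[k][j];  // one fmla per (vector, column) pair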
*/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA1 x16 +#define pA2 x17 +#define alpha w18 +#define vec_len x19 +#define vec_lenx2 x20 + +#define alpha0 s10 +#define alphaZ z7.s + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA1 +// 17 pA1 +// 18 must save alpha +// 19 must save vec_len +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA10_0 +//v01 pA10_1 +//v02 pA20_0 +//v03 pA20_1 +//v04 +//v05 +//v06 +//v07 ALPHA0 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 +//v24 must save C8 +//v25 must save C9 +//v26 must save C10 +//v27 must save C11 +//v28 must save C12 +//v29 must save C13 +//v30 must save C14 +//v31 must save C15 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv2x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 + dup z24.s, #0 + dup z25.s, #0 + dup z26.s, #0 + dup z27.s, #0 + dup z28.s, #0 + dup z29.s, #0 + dup z30.s, #0 + dup z31.s, #0 +.endm + +.macro KERNELv2x8_I + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + ld1w z2.s, p0/z, [pA1, vec_len, lsl #2] + ld1w z3.s, p0/z, [pA2, vec_len, lsl #2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 4 *2 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 4 *2 + + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z20.s, p0/m, z0.s, z10.s + fmla z21.s, p0/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z24.s, p0/m, z0.s, z12.s + fmla z25.s, p0/m, z1.s, z12.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z26.s, p0/m, z0.s, z13.s + fmla z27.s, p0/m, z1.s, z13.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rw z13.s, p0/z, [pB, 20] + fmla z28.s, p0/m, z0.s, z14.s + fmla z29.s, p0/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z30.s, p0/m, z0.s, 
z15.s + fmla z31.s, p0/m, z1.s, z15.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + + add pB, pB, 32 +.endm + +.macro KERNELv2x8_M1 + ld1w z2.s, p0/z, [pA1] + ld1w z3.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z20.s, p0/m, z0.s, z10.s + fmla z21.s, p0/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z24.s, p0/m, z0.s, z12.s + fmla z25.s, p0/m, z1.s, z12.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z26.s, p0/m, z0.s, z13.s + fmla z27.s, p0/m, z1.s, z13.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rw z13.s, p0/z, [pB, 20] + fmla z28.s, p0/m, z0.s, z14.s + fmla z29.s, p0/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z30.s, p0/m, z0.s, z15.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + fmla z31.s, p0/m, z1.s, z15.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv2x8_M2 + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 2 * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 2 * 4 + + fmla z16.s, p0/m, z2.s, z8.s + fmla z17.s, p0/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z18.s, p0/m, z2.s, z9.s + fmla z19.s, p0/m, z3.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z20.s, p0/m, z2.s, z10.s + fmla z21.s, p0/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z22.s, p0/m, z2.s, z11.s + fmla z23.s, p0/m, z3.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z24.s, p0/m, z2.s, z12.s + fmla z25.s, p0/m, z3.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z26.s, p0/m, z2.s, z13.s + fmla z27.s, p0/m, z3.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z28.s, p0/m, z2.s, z14.s + fmla z29.s, p0/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z30.s, p0/m, z2.s, z15.s + fmla z31.s, p0/m, z3.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv2x8_E + fmla z16.s, p0/m, z2.s, z8.s + fmla z17.s, p0/m, z3.s, z8.s + fmla z18.s, p0/m, z2.s, z9.s + fmla z19.s, p0/m, z3.s, z9.s + fmla z20.s, p0/m, z2.s, z10.s + fmla z21.s, p0/m, z3.s, z10.s + fmla z22.s, p0/m, z2.s, z11.s + fmla z23.s, p0/m, z3.s, z11.s + fmla z24.s, p0/m, z2.s, z12.s + fmla z25.s, p0/m, z3.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z26.s, p0/m, z2.s, z13.s + fmla z27.s, p0/m, z3.s, z13.s + fmla z28.s, p0/m, z2.s, z14.s + fmla z29.s, p0/m, z3.s, z14.s + fmla z30.s, p0/m, z2.s, z15.s + fmla z31.s, p0/m, z3.s, z15.s +.endm + +.macro KERNELv2x8_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + fmla z20.s, p0/m, z0.s, z10.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z21.s, p0/m, z1.s, z10.s + fmla z22.s, 
p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s + fmla z24.s, p0/m, z0.s, z12.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z25.s, p0/m, z1.s, z12.s + fmla z26.s, p0/m, z0.s, z13.s + fmla z27.s, p0/m, z1.s, z13.s + fmla z28.s, p0/m, z0.s, z14.s + fmla z29.s, p0/m, z1.s, z14.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z30.s, p0/m, z0.s, z15.s + fmla z31.s, p0/m, z1.s, z15.s +.endm + +.macro SAVEv2x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z18.s, alphaZ + fmla z11.s, p0/m, z19.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z12.s, p0/z, [pCRow2] + ld1w z13.s, p0/z, [pCRow2, #1, mul vl] + fmla z12.s, p0/m, z20.s, alphaZ + fmla z13.s, p0/m, z21.s, alphaZ + st1w z12.s, p0, [pCRow2] + st1w z13.s, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z14.s, p0/z, [pCRow1] + ld1w z15.s, p0/z, [pCRow1, #1, mul vl] + fmla z14.s, p0/m, z22.s, alphaZ + fmla z15.s, p0/m, z23.s, alphaZ + st1w z14.s, p0, [pCRow1] + st1w z15.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z8.s, p0/z, [pCRow2] + ld1w z9.s, p0/z, [pCRow2, #1, mul vl] + fmla z8.s, p0/m, z24.s, alphaZ + fmla z9.s, p0/m, z25.s, alphaZ + st1w z8.s, p0, [pCRow2] + st1w z9.s, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z26.s, alphaZ + fmla z11.s, p0/m, z27.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z12.s, p0/z, [pCRow2] + ld1w z13.s, p0/z, [pCRow2, #1, mul vl] + fmla z12.s, p0/m, z28.s, alphaZ + fmla z13.s, p0/m, z29.s, alphaZ + st1w z12.s, p0, [pCRow2] + st1w z13.s, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z14.s, p0/z, [pCRow1] + ld1w z15.s, p0/z, [pCRow1, #1, mul vl] + fmla z14.s, p0/m, z30.s, alphaZ + fmla z15.s, p0/m, z31.s, alphaZ + st1w z14.s, p0, [pCRow1] + st1w z15.s, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 + +.endm + +.macro INITv2x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv2x4_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + fmla z18.s, p0/m, z0.s, z9.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.s, p0/m, z1.s, z9.s + fmla z20.s, p0/m, z0.s, z10.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z21.s, p0/m, z1.s, z10.s + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s +.endm + +.macro SAVEv2x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] 
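+ // per column: load two C vectors, accumulate alpha * result, store back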
+ ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z18.s, alphaZ + fmla z11.s, p0/m, z19.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z12.s, p0/z, [pCRow2] + ld1w z13.s, p0/z, [pCRow2, #1, mul vl] + fmla z12.s, p0/m, z20.s, alphaZ + fmla z13.s, p0/m, z21.s, alphaZ + st1w z12.s, p0, [pCRow2] + st1w z13.s, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z14.s, p0/z, [pCRow1] + ld1w z15.s, p0/z, [pCRow1, #1, mul vl] + fmla z14.s, p0/m, z22.s, alphaZ + fmla z15.s, p0/m, z23.s, alphaZ + st1w z14.s, p0, [pCRow1] + st1w z15.s, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 + +.endm + +.macro INITv2x2 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv2x2_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] +.endm + +.macro SAVEv2x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z18.s, alphaZ + fmla z11.s, p0/m, z19.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 +.endm + +.macro INITv2x1 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv2x1_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] +.endm + +.macro SAVEv2x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 + +.endm + +.macro INITv1x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x8_I + ld1w z0.s, p1/z, [pA1] + ld1w z1.s, p1/z, [pA1, lanes, lsl #2] // next one + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, 
p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M1 + ld1w z1.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M2 + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + fmla z16.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_E + fmla z16.s, p1/m, z1.s, z8.s + fmla z17.s, p1/m, z1.s, z9.s + fmla z18.s, p1/m, z1.s, z10.s + fmla z19.s, p1/m, z1.s, z11.s + fmla z20.s, p1/m, z1.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + fmla z22.s, p1/m, z1.s, z14.s + fmla z23.s, p1/m, z1.s, z15.s +.endm + +.macro KERNELv1x8_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + fmla z18.s, p1/m, z0.s, z10.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.s, p1/m, z0.s, z11.s + fmla z20.s, p1/m, z0.s, z12.s + fmla z21.s, p1/m, z0.s, z13.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.s, p1/m, z0.s, z14.s + fmla z23.s, p1/m, z0.s, z15.s + + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC 
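+ // the remaining columns repeat the load / fmla-with-alpha / store pattern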
+ ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z28.s, p1/z, [pCRow2] + fmla z28.s, p1/m, z20.s, alphaZ + st1w z28.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z29.s, p1/z, [pCRow1] + fmla z29.s, p1/m, z21.s, alphaZ + st1w z29.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z30.s, p1/z, [pCRow2] + fmla z30.s, p1/m, z22.s, alphaZ + st1w z30.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z31.s, p1/z, [pCRow1] + fmla z31.s, p1/m, z23.s, alphaZ + st1w z31.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x4_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.s, p1/m, z0.s, z10.s + fmla z19.s, p1/m, z0.s, z11.s + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z17.s, p1/m, z0.s, z9.s + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.s, #0 +.endm + +.macro KERNELv1x1_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + + +/******************************************************************************* +* End of macro definitions 
+*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 + dup alphaZ, alpha + cntw vec_len + lsl vec_lenx2, vec_len, #1 + + lsl LDC, LDC, #2 // ldc = ldc * 8 + ptrue p0.s // create true predicate + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lsgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Lsgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L8_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN + blt .Lsgemm_kernel_L8_Mv1_BEGIN + + mov counterI, origM + +/* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */ + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // pA1 = start of A array + prfm PLDL1KEEP, [pA2] + + .align 5 +.Lsgemm_kernel_L8_Mv2_20: + + mov pB, origPB + INITv2x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Lsgemm_kernel_L8_Mv2_32 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lsgemm_kernel_L8_Mv2_22a + + .align 5 +.Lsgemm_kernel_L8_Mv2_22: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_Mv2_22 + + .align 5 +.Lsgemm_kernel_L8_Mv2_22a: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + b .Lsgemm_kernel_L8_Mv2_44 + + .align 5 +.Lsgemm_kernel_L8_Mv2_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_Mv2_40 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + + b .Lsgemm_kernel_L8_Mv2_44 + +.Lsgemm_kernel_L8_Mv2_40: + + INITv2x8 + +.Lsgemm_kernel_L8_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L8_Mv2_100 + + .align 5 +.Lsgemm_kernel_L8_Mv2_46: + + KERNELv2x8_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L8_Mv2_46 + +.Lsgemm_kernel_L8_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x8 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // + +.Lsgemm_kernel_L8_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L8_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L8_END + +////////////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x8 kernel. 
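+// The predicate p1 produced by whilelt masks off the excess lanes, so the
+// M tail needs no scalar cleanup loop.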
+.Lsgemm_kernel_L8_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lsgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Lsgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lsgemm_kernel_L8_Mv1_22a + + .align 5 +.Lsgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_Mv1_22 + + .align 5 +.Lsgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Lsgemm_kernel_L8_Mv1_44 + + .align 5 +.Lsgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Lsgemm_kernel_L8_Mv1_44 + +.Lsgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Lsgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L8_Mv1_100 + + .align 5 +.Lsgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L8_Mv1_46 + +.Lsgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Lsgemm_kernel_L8_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Lsgemm_kernel_L8_Mv1_20 + +.Lsgemm_kernel_L8_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 8 * 4 + + subs counterJ, counterJ , #1 // j-- + bgt .Lsgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Lsgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Lsgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L4_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Lsgemm_kernel_L4_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // pA1 = start of A array + + .align 5 +.Lsgemm_kernel_L4_Mv2_20: + + mov pB, origPB + INITv2x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
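+ // the loop below runs 8 KERNELv2x4_SUB updates per pass; the K%8 tail is handled at _46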
+ ble .Lsgemm_kernel_L4_Mv2_44 + + .align 5 +.Lsgemm_kernel_L4_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_Mv2_22 + +.Lsgemm_kernel_L4_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L4_Mv2_100 + + .align 5 +.Lsgemm_kernel_L4_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L4_Mv2_46 + +.Lsgemm_kernel_L4_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x4 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // + +.Lsgemm_kernel_L4_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L4_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L4_END + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x4 kernel. +.Lsgemm_kernel_L4_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lsgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Lsgemm_kernel_L4_Mv1_44 + + .align 5 +.Lsgemm_kernel_L4_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_Mv1_22 + +.Lsgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L4_Mv1_100 + + .align 5 +.Lsgemm_kernel_L4_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L4_Mv1_46 + +.Lsgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lsgemm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lsgemm_kernel_L4_Mv1_20 + + +.Lsgemm_kernel_L4_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 4 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Lsgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Lsgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L2_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Lsgemm_kernel_L2_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // pA1 = start of A array + + .align 5 +.Lsgemm_kernel_L2_Mv2_20: + + mov pB, origPB + INITv2x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
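+ // same structure as the N=4 case, with two broadcast B values per update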
+ ble .Lsgemm_kernel_L2_Mv2_44 + + .align 5 +.Lsgemm_kernel_L2_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_Mv2_22 + +.Lsgemm_kernel_L2_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L2_Mv2_100 + + .align 5 +.Lsgemm_kernel_L2_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L2_Mv2_46 + +.Lsgemm_kernel_L2_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x2 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // + +.Lsgemm_kernel_L2_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L2_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L2_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x2 kernel. +.Lsgemm_kernel_L2_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lsgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Lsgemm_kernel_L2_Mv1_44 + + .align 5 +.Lsgemm_kernel_L2_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_Mv1_22 + +.Lsgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L2_Mv1_100 + + .align 5 +.Lsgemm_kernel_L2_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L2_Mv1_46 + +.Lsgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Lsgemm_kernel_L2_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lsgemm_kernel_L2_Mv1_20 + + +.Lsgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Lsgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lsgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L1_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Lsgemm_kernel_L1_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // pA1 = start of A array + + + .align 5 +.Lsgemm_kernel_L1_Mv2_20: + + mov pB, origPB + INITv2x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
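+ // N=1 tail: one broadcast B value per KERNELv2x1_SUB update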
+ ble .Lsgemm_kernel_L1_Mv2_44 + + .align 5 +.Lsgemm_kernel_L1_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv2_22 + +.Lsgemm_kernel_L1_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L1_Mv2_100 + + .align 5 +.Lsgemm_kernel_L1_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv2_46 + +.Lsgemm_kernel_L1_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x1 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // + +.Lsgemm_kernel_L1_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L1_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L1_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x1 kernel. +.Lsgemm_kernel_L1_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lsgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? + ble .Lsgemm_kernel_L1_Mv1_44 + + .align 5 +.Lsgemm_kernel_L1_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv1_22 + +.Lsgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L1_Mv1_100 + + .align 5 +.Lsgemm_kernel_L1_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv1_46 + +.Lsgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Lsgemm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lsgemm_kernel_L1_Mv1_20 + + +.Lsgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From dd1f645371b9a59e63afef45277e23b578258ab2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 6 Dec 2021 19:42:51 +0100 Subject: [PATCH 571/681] switch DGEMM unroll parameters for SkylakeX if DYNAMIC_ARCH --- param.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/param.h b/param.h index c1dff1367..e5e736622 100644 --- a/param.h +++ b/param.h @@ -1669,14 +1669,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #define SGEMM_DEFAULT_UNROLL_M 16 +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_UNROLL_M 16 +#else +#define DGEMM_DEFAULT_UNROLL_M 4 +#endif #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_M 4 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_UNROLL_N 2 +#else +#define DGEMM_DEFAULT_UNROLL_N 8 +#endif #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 From 5378046abdb8461083fb638d2470b5a7f1624a48 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 6 Dec 2021 19:43:54 +0100 Subject: [PATCH 572/681] roll back DGEMM kernels to 4x8 when compiling for DYNAMIC_ARCH --- kernel/x86_64/KERNEL.SKYLAKEX | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 6b4961bc2..d2d7de42a 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -20,6 +20,7 @@ SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_tn_skylakex.c SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_skylakex.c SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_tt_skylakex.c +ifndef DYNAMIC_ARCH DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c DGEMMINCOPY = ../generic/gemm_ncopy_16.c @@ -27,6 +28,11 @@ DGEMMITCOPY = dgemm_tcopy_16_skylakex.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +else +DGEMMKERNEL = dgemm_kernel_4x8_skylakex_2.c +DGEMMONCOPY = dgemm_ncopy_8_skylakex.c +DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c +endif DGEMM_SMALL_M_PERMIT = dgemm_small_kernel_permit_skylakex.c DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_skylakex.c DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_skylakex.c From eee3381cbecdd252177d53657ca964fd45a5f9b6 Mon Sep 17 00:00:00 2001 From: kavanabhat Date: Wed, 8 Dec 2021 03:52:23 -0600 Subject: [PATCH 573/681] Fallback for Power kernels --- Makefile.system | 13 ++++++++++--- param.h | 5 +++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/Makefile.system b/Makefile.system index 2fc37f031..b8824fe51 100644 --- a/Makefile.system +++ b/Makefile.system @@ -145,8 +145,13 @@ endif ifeq ($(TARGET), POWER8) GETARCH_FLAGS := -DFORCE_POWER6 endif +ifeq ($(TARGET), POWER9) +GETARCH_FLAGS := -DFORCE_POWER6 +endif +ifeq ($(TARGET), POWER10) +GETARCH_FLAGS := -DFORCE_POWER6 +endif endif - #TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. # @@ -267,6 +272,10 @@ endif ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 +# Determine if the assembler is GNU Assembler +HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?) +GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS) + # Generating Makefile.conf and config.h DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) @@ -368,8 +377,6 @@ GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) endif -HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?) 
- # # OS dependent settings # diff --git a/param.h b/param.h index c1dff1367..9632960b3 100644 --- a/param.h +++ b/param.h @@ -2598,8 +2598,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 +#if defined(HAVE_GAS) && (HAVE_GAS == 1) +#define DGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_N 4 +#else #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 8 +#endif #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 From 3e9a52869c70f5140f434e923153f93ec8b14a2c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 Dec 2021 22:18:44 +0100 Subject: [PATCH 574/681] Fix ar path in ARMV7 Darwin NDK build on Azure (#3473) * Adjust ar commad in ARMV7 Darwin NDK build after homebrew update to NDK 23b --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 56daa9e5a..710940924 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -220,7 +220,7 @@ jobs: brew update brew install --cask android-ndk export ANDROID_NDK_HOME=/usr/local/share/android-ndk - make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 + make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/llvm-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 - job: OSX_IOS_ARMV8 pool: From 214fbcee1549a152ec1a2ef21d3f00479f6cc299 Mon Sep 17 00:00:00 2001 From: Rafael Cardoso Fernandes Sousa Date: Thu, 9 Dec 2021 08:28:17 -0600 Subject: [PATCH 575/681] Fix cmake for power --- cmake/cc.cmake | 28 ++++++++++++++++++++++++++++ cmake/system.cmake | 34 ++++++++++++++++++++++++++++++++-- cmake/system_check.cmake | 6 +++--- driver/others/CMakeLists.txt | 2 ++ 4 files changed, 65 insertions(+), 5 deletions(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index b28209c0c..06bc14986 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -161,6 +161,34 @@ if (${CORE} STREQUAL ARMV8SVE) endif () endif () +if (${CORE} STREQUAL POWER10) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") + else () + message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10." 
) + endif() + endif () +endif () + +if (${CORE} STREQUAL POWER9) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") + message(WARNING "Compiler GCC.${GCC_VERSION} does not fully support Power9.") + endif () + endif () +endif () + +if (${CORE} STREQUAL POWER8) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") + endif () +endif () if (NOT DYNAMIC_ARCH) if (HAVE_AVX2) diff --git a/cmake/system.cmake b/cmake/system.cmake index 410cf01e5..b4cce20f5 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -42,6 +42,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55") set(TARGET "ARMV7") endif () + if (${TARGET} STREQUAL "POWER8" OR ${TARGET} STREQUAL "POWER9" OR ${TARGET} STREQUAL "POWER10") + set(TARGET "POWER6") + endif () endif () @@ -102,6 +105,12 @@ if (CMAKE_C_COMPILER STREQUAL loongcc) set(GETARCH_FLAGS "${GETARCH_FLAGS} -static") endif () +if (POWER) + set (NO_WARMUP 1) + execute_process(COMMAND bash -c "as -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null" RESULT_VARIABLE HAVE_GAS) + set(GETARCH_FLAGS "${GETARCH_FLAGS} -DHAVE_GAS=${HAVE_GAS}") +endif() + #if don't use Fortran, it will only compile CBLAS. if (ONLY_CBLAS) set(NO_LAPACK 1) @@ -222,6 +231,27 @@ if (DEFINED TARGET) if (DEFINED HAVE_SSE4_1) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") endif() + + if (${TARGET} STREQUAL POWER10) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") + else () + message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10.") + endif() + endif() + if (${TARGET} STREQUAL POWER9) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") + else () + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") + message(WARNING "Compiler GCC.${GCC_VERSION} does not support fully Power9.") + endif() + endif() + if (${TARGET} STREQUAL POWER8) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") + endif() endif() if (DEFINED BINARY) message(STATUS "Compiling a ${BINARY}-bit binary.") @@ -279,7 +309,7 @@ if (NEED_PIC) endif() endif () -if (X86_64) +if (X86_64 OR ${CORE} STREQUAL POWER10) set(SMALL_MATRIX_OPT TRUE) endif () if (SMALL_MATRIX_OPT) @@ -287,7 +317,7 @@ if (SMALL_MATRIX_OPT) endif () if (DYNAMIC_ARCH) - if (X86 OR X86_64 OR ARM64 OR PPC) + if (X86 OR X86_64 OR ARM64 OR POWER) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") if (DYNAMIC_OLDER) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index f71ec4555..86ce3dfb0 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -35,7 
+35,7 @@ if(CMAKE_CL_64 OR MINGW64) elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) set(X86 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") - set(PPC 1) + set(POWER 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") @@ -88,7 +88,7 @@ if (X86_64) set(ARCH "x86_64") elseif(X86) set(ARCH "x86") -elseif(PPC) +elseif(POWER) set(ARCH "power") elseif(MIPS32) set(ARCH "mips") @@ -103,7 +103,7 @@ else() endif () if (NOT BINARY) - if (X86_64 OR ARM64 OR PPC OR MIPS64 OR LOONGARCH64) + if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64) set(BINARY 64) else () set(BINARY 32) diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index a07e00b3b..1a38740a3 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -49,6 +49,8 @@ GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" if (DYNAMIC_ARCH) if (ARM64) list(APPEND COMMON_SOURCES dynamic_arm64.c) + elseif (POWER) + list(APPEND COMMON_SOURCES dynamic_power.c) else () list(APPEND COMMON_SOURCES dynamic.c) endif () From 23a756135388d919b3dbdcc026e4c961f6db7ef6 Mon Sep 17 00:00:00 2001 From: Rafael Cardoso Fernandes Sousa Date: Thu, 9 Dec 2021 09:57:39 -0600 Subject: [PATCH 576/681] Fix error cmake (small kernels) --- kernel/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index d49b4ea64..9849ddc93 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -596,11 +596,11 @@ endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) endif () if (BUILD_BFLOAT16) if (NOT DEFINED SBGEMM_SMALL_M_PERMIT) @@ -634,11 +634,11 @@ endif () GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16") - GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TT}" "" 
"gemm_small_kernel_tt" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16") - GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") endif () endif () From d38110a5cee593feb96e851d65e77e49c48c3702 Mon Sep 17 00:00:00 2001 From: Rafael Cardoso Fernandes Sousa Date: Fri, 10 Dec 2021 17:35:28 -0600 Subject: [PATCH 577/681] Use CMake variables instead of as --- cmake/system.cmake | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index b4cce20f5..e0e92bde7 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -106,10 +106,16 @@ if (CMAKE_C_COMPILER STREQUAL loongcc) endif () if (POWER) - set (NO_WARMUP 1) - execute_process(COMMAND bash -c "as -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null" RESULT_VARIABLE HAVE_GAS) - set(GETARCH_FLAGS "${GETARCH_FLAGS} -DHAVE_GAS=${HAVE_GAS}") -endif() + set(NO_WARMUP 1) + set(HAVE_GAS 1) + if (CMAKE_ASM_COMPILER_ID STREQUAL "GNU") + set(HAVE_GAS 0) + elseif (CMAKE_ASM_COMPILER_ID STREQUAL "Clang") + set(CCOMMON_OPT "${CCOMMON_OPT} -fno-integrated-as") + set(HAVE_GAS 0) + endif () + set(GETARCH_FLAGS "${GETARCH_FLAGS} -DHAVE_GAS=${HAVE_GAS}") +endif () #if don't use Fortran, it will only compile CBLAS. if (ONLY_CBLAS) From 774267fdac4594f027916979b064e0151c1a2b9e Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sat, 11 Dec 2021 16:35:08 +0100 Subject: [PATCH 578/681] adjust Makefile.L3 for SVE --- kernel/Makefile.L3 | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 593e33dde..d22bd46a5 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1483,29 +1483,61 @@ $(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XT $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ +ifdef STRMMUNCOPY_M +$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef STRMMLNCOPY_M +$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : 
generic/trmm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef STRMMUTCOPY_M +$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef STRMMLTCOPY_M +$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ From a8f62a347bb6a9d653f0b57bf5a05b5e3cd097a8 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sat, 11 Dec 2021 16:37:23 +0100 Subject: [PATCH 579/681] fix UNROLL_MN and add to targets for SVE --- kernel/arm64/KERNEL.A64FX | 9 ++++++-- kernel/arm64/KERNEL.ARMV8SVE | 40 +++++++++++++++--------------------- param.h | 8 ++++++++ 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index ee66fea8e..80be4ddd0 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -114,8 +114,8 @@ DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S SGEMM_BETA = sgemm_beta.S -SGEMMKERNEL = sgemm_kernel_sve_v1x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_8x$(SGEMM_UNROLL_N).S +SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S SGEMMINCOPY = sgemm_ncopy_sve_v1.c SGEMMITCOPY = sgemm_tcopy_sve_v1.c @@ -127,6 +127,11 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +STRMMUNCOPY_M = trmm_uncopy_sve_v1.c +STRMMLNCOPY_M = trmm_lncopy_sve_v1.c +STRMMUTCOPY_M = trmm_utcopy_sve_v1.c +STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + SSYMMUCOPY_M = symm_ucopy_sve.c SSYMMLCOPY_M = symm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 1f605d10b..0364a929c 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -114,35 +114,27 @@ DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S SGEMM_BETA = 
sgemm_beta.S

-SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-ifeq ($(SGEMM_UNROLL_M), 16)
-SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
-endif
-ifeq ($(SGEMM_UNROLL_M), 4)
-SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
-endif
+SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
+STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
+
+SGEMMINCOPY = sgemm_ncopy_sve_v1.c
+SGEMMITCOPY = sgemm_tcopy_sve_v1.c
+SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
+SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
+
 SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-ifeq ($(SGEMM_UNROLL_N), 16)
-SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
-else
-SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
-endif
-ifeq ($(SGEMM_UNROLL_N), 4)
-SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
-else
-SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
-endif
 SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

+STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
+STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
+STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
+STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
+
+SSYMMUCOPY_M = symm_ucopy_sve.c
+SSYMMLCOPY_M = symm_lcopy_sve.c
+
 DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
 DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S

diff --git a/param.h b/param.h
index e9419bd9d..f7b8eb07b 100644
--- a/param.h
+++ b/param.h
@@ -3296,14 +3296,22 @@ is a big desktop or server with abundant cache rather than a phone or embedded d

 #elif defined(ARMV8SVE) || defined(A64FX)

+/* When all BLAS3 routines are implemented with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".
+Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */
 #define SGEMM_DEFAULT_UNROLL_M 4
 #define SGEMM_DEFAULT_UNROLL_N 8
+/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N)
+ * Since we don't define SGEMM_UNROLL_M correctly, we have to set this macro manually.
+ * If the SVE vector size is ever more than 1024 bits, this should be increased as well. */
+#define SGEMM_DEFAULT_UNROLL_MN 32

 /* When all BLAS3 routines are implemented with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl".
Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated.
*/ #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_MN 32 + #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 From b610d2de37f24b53cf780b26b2577708f914b458 Mon Sep 17 00:00:00 2001 From: Jia-Chen Date: Sun, 12 Dec 2021 17:22:52 +0800 Subject: [PATCH 580/681] optimize cgemm on ARM cortex A53 & cortex A55 --- kernel/arm64/KERNEL.CORTEXA53 | 2 +- kernel/arm64/KERNEL.CORTEXA55 | 2 +- kernel/arm64/cgemm_kernel_8x4_cortexa53.c | 898 ++++++++++++++++++++++ 3 files changed, 900 insertions(+), 2 deletions(-) create mode 100644 kernel/arm64/cgemm_kernel_8x4_cortexa53.c diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index 22c7fd20a..e2e006770 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -169,7 +169,7 @@ endif DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55 index 22c7fd20a..e2e006770 100644 --- a/kernel/arm64/KERNEL.CORTEXA55 +++ b/kernel/arm64/KERNEL.CORTEXA55 @@ -169,7 +169,7 @@ endif DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c diff --git a/kernel/arm64/cgemm_kernel_8x4_cortexa53.c b/kernel/arm64/cgemm_kernel_8x4_cortexa53.c new file mode 100644 index 000000000..f9cd97852 --- /dev/null +++ b/kernel/arm64/cgemm_kernel_8x4_cortexa53.c @@ -0,0 +1,898 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define FMLA_RI "fmla " +#define FMLA_IR "fmla " +#define FMLA_II "fmls " +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define FMLA_RI "fmls " +#define FMLA_IR "fmla " +#define FMLA_II "fmla " +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define FMLA_RI "fmla " +#define FMLA_IR "fmls " +#define FMLA_II "fmla " +#else +#define FMLA_RI "fmls " +#define FMLA_IR "fmls " +#define FMLA_II "fmls " +#endif +#define FMLA_RR "fmla " + +static inline void store_m8n1_contracted(float *C, + float32x4_t c1r, float32x4_t c1i, float32x4_t c2r, float32x4_t c2i, + float alphar, float alphai) { + + float32x4x2_t ld1 = vld2q_f32(C), ld2 = vld2q_f32(C + 8); + ld1.val[0] = vfmaq_n_f32(ld1.val[0], c1r, alphar); + ld2.val[0] = vfmaq_n_f32(ld2.val[0], c2r, alphar); + ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1r, alphai); + ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2r, alphai); + ld1.val[0] = vfmsq_n_f32(ld1.val[0], c1i, alphai); + ld2.val[0] = vfmsq_n_f32(ld2.val[0], c2i, alphai); + ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1i, alphar); + ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2i, alphar); + vst2q_f32(C, ld1); + vst2q_f32(C + 8, ld2); +} + +static inline void kernel_8x4(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + const float *c_pref = C; + float32x4_t c1r, c1i, c2r, c2i, c3r, c3i, c4r, c4i; + float32x4_t c5r, c5i, c6r, c6i, c7r, c7i, c8r, c8i; + + /** x0 for filling A, x1-x6 for filling B (x5 and x6 for real, x2 and x4 for imag) */ + /** v0-v1 and v10-v11 for B, v2-v9 for A */ + __asm__ __volatile__( + "cmp %[K],#0; mov %[c_pref],%[C]\n\t" + "movi %[c1r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" + "movi %[c1i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" + "movi %[c2r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "movi %[c2i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" + "movi %[c3r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" + "movi %[c3i].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "movi %[c4r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" + "movi %[c4i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" + "movi %[c5r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "movi %[c5i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" + "movi %[c6r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" + "movi %[c6i].16b,#0\n\t" + "movi %[c7r].16b,#0; movi %[c7i].16b,#0\n\t" + "movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t" + "beq 4f\n\t" + "cmp %[K],#2\n\t" + "ldp x1,x2,[%[sb]],#16; ldr q2,[%[sa]],#64\n\t" + "ldp x3,x4,[%[sb]],#16; ldr d3,[%[sa],#-48]\n\t" + "mov w5,w1; mov w6,w3; ldr x0,[%[sa],#-40]\n\t" + "bfi x5,x2,#32,#32; bfi x6,x4,#32,#32; fmov d0,x5\n\t" + "bfxil x2,x1,#32,#32; bfxil x4,x3,#32,#32; fmov v0.d[1],x6\n\t" + + "blt 3f; beq 2f\n\t" + "1:\n\t" + "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" + FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" + 
FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#64\n\t" + FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" + "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" + FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" + FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t" + FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" + "fmov v5.d[1],x0; fmov d1,x2\n\t" + FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-56]\n\t" + FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-48]\n\t" + FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" + "fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t" + FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t" + FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-40]\n\t" + FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t" + "fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t" + FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t" + FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t" + FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t" + "fmov v7.d[1],x0; fmov d10,x5\n\t" + FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t" + FMLA_II "%[c1r].4s,v1.4s,v2.s[1]; ldr x1,[%[sb],#-32]\n\t" + FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t" + "fmov v10.d[1],x6; fmov d11,x2\n\t" + FMLA_II "%[c2r].4s,v1.4s,v2.s[3]; ldr x2,[%[sb],#-24]\n\t" + FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]; ldr x3,[%[sb],#-16]\n\t" + FMLA_II "%[c3r].4s,v1.4s,v3.s[1]; mov w5,w1\n\t" + "fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t" + FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t" + FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; ldr x4,[%[sb],#-8]\n\t" + FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]; bfi x5,x2,#32,#32\n\t" + "fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t" + FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t" + FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; mov w6,w3\n\t" + FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" + "fmov v9.d[1],x0; fmov d0,x5\n\t" + FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]; bfi x6,x4,#32,#32\n\t" + FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" + FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" + "fmov v0.d[1],x6; ldr d2,[%[sa],#64]\n\t" + FMLA_II "%[c8r].4s,v1.4s,v5.s[3]; ldr x0,[%[sa],#72]\n\t" + FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" + FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t" + "fmov v2.d[1],x0; ldr d3,[%[sa],#80]\n\t" + FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t" + FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]; ldr x0,[%[sa],#88]\n\t" + FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]; bfxil x2,x1,#32,#32\n\t" + FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]; bfxil x4,x3,#32,#32\n\t" + FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]; add %[sa],%[sa],#128\n\t" + FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]; prfm pldl1keep,[%[sb],#128]\n\t" + FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]; sub %[K],%[K],#2\n\t" + FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]; prfm pldl1keep,[%[sa],#128]\n\t" + FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]; prfm pldl1keep,[%[sa],#192]\n\t" + FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]; cmp %[K],#2\n\t" + FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t" + FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t" + FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t" + FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t" + FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t" + FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t" + FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t" + FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t" + FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t" + FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t" + 
FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t" + "bgt 1b; blt 3f\n\t" + "2:\n\t" + "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" + FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" + FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#32\n\t" + FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" + "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" + FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" + FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t" + FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" + "fmov v5.d[1],x0; fmov d1,x2\n\t" + FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-24]\n\t" + FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-16]\n\t" + FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" + "fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t" + FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t" + FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-8]\n\t" + FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t" + "fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t" + FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t" + FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t" + FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t" + "fmov v7.d[1],x0; fmov d10,x5\n\t" + FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t" + FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t" + FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t" + "fmov v10.d[1],x6; fmov d11,x2\n\t" + FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t" + FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t" + FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t" + "fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t" + FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t" + FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; sub %[K],%[K],#2\n\t" + FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t" + "fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t" + FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t" + FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; add %[sa],%[sa],#64\n\t" + FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" + "fmov v9.d[1],x0\n\t" + FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t" + FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" + FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" + FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t" FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t" + FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]\n\t" FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]\n\t" + FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]\n\t" FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]\n\t" + FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]\n\t" FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]\n\t" + FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]\n\t" FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]\n\t" + FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]\n\t" FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t" + FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t" + FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t" + FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t" + FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t" + FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t" + FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t" + FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t" + FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t" + FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t" + FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t" + "b 4f\n\t" + "3:\n\t" + "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" + FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" + FMLA_IR 
"%[c1i].4s,v0.4s,v2.s[1]\n\t" + FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" + "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" + FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" + FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]\n\t" + FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" + "fmov v5.d[1],x0; fmov d1,x2\n\t" + FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]\n\t" + FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]\n\t" + FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" + "fmov v1.d[1],x4\n\t" + FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; sub %[K],%[K],#1\n\t" + FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]\n\t" FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]\n\t" + FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]\n\t" FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]\n\t" + FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]\n\t" FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]\n\t" + FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t" FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]\n\t" + FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t" FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t" + FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t" FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]\n\t" + FMLA_II "%[c4r].4s,v1.4s,v3.s[3]\n\t" FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t" + FMLA_II "%[c5r].4s,v1.4s,v4.s[1]\n\t" FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]\n\t" + FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t" + FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" + FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" + "4:\n\t" + "mov %[c_pref],%[C]\n\t" + "zip1 v0.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref]]\n\t" + "zip1 v4.4s,%[c1i].4s,%[c2i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" + "zip1 v1.4s,%[c3r].4s,%[c4r].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "zip1 v5.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref]]\n\t" + "zip2 v2.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" + "zip2 v6.4s,%[c1i].4s,%[c2i].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "zip2 v3.4s,%[c3r].4s,%[c4r].4s; prfm pstl1keep,[%[c_pref]]\n\t" + "zip2 v7.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" + "zip1 %[c1r].2d,v0.2d,v1.2d; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "zip1 %[c1i].2d,v4.2d,v5.2d; prfm pstl1keep,[%[c_pref]]\n\t" + "zip2 %[c2r].2d,v0.2d,v1.2d; prfm pstl1keep,[%[c_pref],#64]\n\t" + "zip2 %[c2i].2d,v4.2d,v5.2d\n\t" + "zip1 %[c3r].2d,v2.2d,v3.2d; zip1 %[c3i].2d,v6.2d,v7.2d\n\t" + "zip2 %[c4r].2d,v2.2d,v3.2d; zip2 %[c4i].2d,v6.2d,v7.2d\n\t" + "zip1 v0.4s,%[c5r].4s,%[c6r].4s; zip1 v4.4s,%[c5i].4s,%[c6i].4s\n\t" + "zip1 v1.4s,%[c7r].4s,%[c8r].4s; zip1 v5.4s,%[c7i].4s,%[c8i].4s\n\t" + "zip2 v2.4s,%[c5r].4s,%[c6r].4s; zip2 v6.4s,%[c5i].4s,%[c6i].4s\n\t" + "zip2 v3.4s,%[c7r].4s,%[c8r].4s; zip2 v7.4s,%[c7i].4s,%[c8i].4s\n\t" + "zip1 %[c5r].2d,v0.2d,v1.2d; zip1 %[c5i].2d,v4.2d,v5.2d\n\t" + "zip2 %[c6r].2d,v0.2d,v1.2d; zip2 %[c6i].2d,v4.2d,v5.2d\n\t" + "zip1 %[c7r].2d,v2.2d,v3.2d; zip1 %[c7i].2d,v6.2d,v7.2d\n\t" + "zip2 %[c8r].2d,v2.2d,v3.2d; zip2 %[c8i].2d,v6.2d,v7.2d\n\t" + :[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i), + [c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i), + [c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i), + [c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i), + [K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb), [c_pref]"+r"(c_pref) + :[C]"r"(C), [LDC]"r"(LDC) + :"cc","memory","x0","x1","x2","x3","x4","x5","x6", + "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11"); + + store_m8n1_contracted(C, c1r, c1i, c5r, c5i, alphar, alphai); C += LDC * 2; + store_m8n1_contracted(C, c2r, c2i, c6r, c6i, alphar, alphai); C += LDC * 2; + store_m8n1_contracted(C, c3r, c3i, c7r, c7i, alphar, alphai); 
C += LDC * 2; + store_m8n1_contracted(C, c4r, c4i, c8r, c8i, alphar, alphai); +} + +static inline float32x4x4_t acc_expanded_m2n2(float32x4x4_t acc, + float32x4_t a, float32x4_t b) { + + acc.val[0] = vfmaq_laneq_f32(acc.val[0], a, b, 0); + acc.val[1] = vfmaq_laneq_f32(acc.val[1], a, b, 1); + acc.val[2] = vfmaq_laneq_f32(acc.val[2], a, b, 2); + acc.val[3] = vfmaq_laneq_f32(acc.val[3], a, b, 3); + return acc; +} + +static inline float32x4x4_t expand_alpha(float alphar, float alphai) { + float32x4x4_t ret; + const float maskp[] = { -1, 1, -1, 1 }; + const float maskn[] = { 1, -1, 1, -1 }; + const float32x4_t vrevp = vld1q_f32(maskp); + const float32x4_t vrevn = vld1q_f32(maskn); +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ret.val[0] = vdupq_n_f32(alphar); + ret.val[1] = vdupq_n_f32(-alphai); + ret.val[2] = vmulq_f32(ret.val[1], vrevn); + ret.val[3] = vmulq_f32(ret.val[0], vrevp); +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + ret.val[0] = vdupq_n_f32(alphar); + ret.val[1] = vdupq_n_f32(alphai); + ret.val[2] = vmulq_f32(ret.val[1], vrevp); + ret.val[3] = vmulq_f32(ret.val[0], vrevn); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + ret.val[2] = vdupq_n_f32(alphai); + ret.val[3] = vdupq_n_f32(alphar); + ret.val[0] = vmulq_f32(ret.val[3], vrevn); + ret.val[1] = vmulq_f32(ret.val[2], vrevp); +#else + ret.val[2] = vdupq_n_f32(alphai); + ret.val[3] = vdupq_n_f32(-alphar); + ret.val[0] = vmulq_f32(ret.val[3], vrevp); + ret.val[1] = vmulq_f32(ret.val[2], vrevn); +#endif + return ret; +} + +static inline void store_expanded_m2n2(float *C, BLASLONG LDC, + float32x4x4_t acc, float32x4x4_t expanded_alpha) { + + float32x4_t ld1 = vld1q_f32(C), ld2 = vld1q_f32(C + LDC * 2); + ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]); + ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[0]); + acc.val[0] = vrev64q_f32(acc.val[0]); + acc.val[2] = vrev64q_f32(acc.val[2]); + ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]); + ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[1]); + acc.val[1] = vrev64q_f32(acc.val[1]); + acc.val[3] = vrev64q_f32(acc.val[3]); + ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]); + ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[2]); + ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]); + ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[3]); + vst1q_f32(C, ld1); + vst1q_f32(C + LDC * 2, ld2); +} + +static inline float32x4x4_t init_expanded_m2n2() { + float32x4x4_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0), + vdupq_n_f32(0), vdupq_n_f32(0) }}; + return ret; +} + +static inline void kernel_4x4(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4), + b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c3 = acc_expanded_m2n2(c3, a1, b2); + c4 = acc_expanded_m2n2(c4, a2, b2); + c1 = acc_expanded_m2n2(c1, a3, b3); + c2 = acc_expanded_m2n2(c2, a4, b3); + c3 = acc_expanded_m2n2(c3, a3, b4); + c4 = acc_expanded_m2n2(c4, a4, b4); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = 
acc_expanded_m2n2(c2, a2, b1); + c3 = acc_expanded_m2n2(c3, a1, b2); + c4 = acc_expanded_m2n2(c4, a2, b2); + } + + float32x4x4_t e_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n2(C, LDC, c1, e_alpha); + store_expanded_m2n2(C + 4, LDC, c2, e_alpha); + C += LDC * 4; + store_expanded_m2n2(C, LDC, c3, e_alpha); + store_expanded_m2n2(C + 4, LDC, c4, e_alpha); +} + +static inline void kernel_8x2(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); + float32x4_t a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20); + float32x4_t a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c3 = acc_expanded_m2n2(c3, a3, b1); + c4 = acc_expanded_m2n2(c4, a4, b1); + c1 = acc_expanded_m2n2(c1, a5, b2); + c2 = acc_expanded_m2n2(c2, a6, b2); + c3 = acc_expanded_m2n2(c3, a7, b2); + c4 = acc_expanded_m2n2(c4, a8, b2); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); + float32x4_t b1 = vld1q_f32(sb); + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c3 = acc_expanded_m2n2(c3, a3, b1); + c4 = acc_expanded_m2n2(c4, a4, b1); + } + + float32x4x4_t e_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n2(C, LDC, c1, e_alpha); + store_expanded_m2n2(C + 4, LDC, c2, e_alpha); + store_expanded_m2n2(C + 8, LDC, c3, e_alpha); + store_expanded_m2n2(C + 12, LDC, c4, e_alpha); +} + +static inline void kernel_4x2(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2; + c1 = c2 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c1 = acc_expanded_m2n2(c1, a3, b2); + c2 = acc_expanded_m2n2(c2, a4, b2); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t b1 = vld1q_f32(sb); + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + } + + float32x4x4_t e_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n2(C, LDC, c1, e_alpha); + store_expanded_m2n2(C + 4, LDC, c2, e_alpha); +} + +static inline void kernel_2x4(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2; + c1 = c2 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); + float32x4_t b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a1, b2); + c1 = acc_expanded_m2n2(c1, a2, b3); + c2 = acc_expanded_m2n2(c2, a2, b4); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa); + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a1, b2); + } + + float32x4x4_t e_alpha = expand_alpha(alphar, alphai); + 
store_expanded_m2n2(C, LDC, c1, e_alpha); + store_expanded_m2n2(C + LDC * 4, LDC, c2, e_alpha); +} + +static inline void kernel_2x2(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2; + c1 = c2 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b2); + } + c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]); + c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]); + c1.val[2] = vaddq_f32(c1.val[2], c2.val[2]); + c1.val[3] = vaddq_f32(c1.val[3], c2.val[3]); + if (K) { + float32x4_t a1 = vld1q_f32(sa); + float32x4_t b1 = vld1q_f32(sb); + c1 = acc_expanded_m2n2(c1, a1, b1); + } + + store_expanded_m2n2(C, LDC, c1, expand_alpha(alphar, alphai)); +} + +static inline float32x4x2_t acc_expanded_m2n1(float32x4x2_t acc, + float32x4_t a, float32x2_t b) { + + acc.val[0] = vfmaq_lane_f32(acc.val[0], a, b, 0); + acc.val[1] = vfmaq_lane_f32(acc.val[1], a, b, 1); + return acc; +} + +static inline void store_expanded_m2n1(float *C, + float32x4x2_t acc, float32x4x4_t expanded_alpha) { + + float32x4_t ld1 = vld1q_f32(C); + ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]); + acc.val[0] = vrev64q_f32(acc.val[0]); + ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]); + acc.val[1] = vrev64q_f32(acc.val[1]); + ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]); + ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]); + vst1q_f32(C, ld1); +} + +static inline float32x4x2_t init_expanded_m2n1() { + float32x4x2_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0) }}; + return ret; +} + +static inline void kernel_8x1(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K) { + + float32x4x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n1(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12), + a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20), + a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32; + float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4; + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b1); + c3 = acc_expanded_m2n1(c3, a3, b1); + c4 = acc_expanded_m2n1(c4, a4, b1); + c1 = acc_expanded_m2n1(c1, a5, b2); + c2 = acc_expanded_m2n1(c2, a6, b2); + c3 = acc_expanded_m2n1(c3, a7, b2); + c4 = acc_expanded_m2n1(c4, a8, b2); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); + float32x2_t b1 = vld1_f32(sb); + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b1); + c3 = acc_expanded_m2n1(c3, a3, b1); + c4 = acc_expanded_m2n1(c4, a4, b1); + } + + float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n1(C, c1, expanded_alpha); + store_expanded_m2n1(C + 4, c2, expanded_alpha); + store_expanded_m2n1(C + 8, c3, expanded_alpha); + store_expanded_m2n1(C + 12, c4, expanded_alpha); +} + +static inline void kernel_4x1(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K) { + + float32x4x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n1(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; + float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb 
+= 4; + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b1); + c3 = acc_expanded_m2n1(c3, a3, b2); + c4 = acc_expanded_m2n1(c4, a4, b2); + } + c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]); + c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]); + c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]); + c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]); + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x2_t b1 = vld1_f32(sb); + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b1); + } + + float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n1(C, c1, expanded_alpha); + store_expanded_m2n1(C + 4, c2, expanded_alpha); +} + +static inline void kernel_2x1(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K) { + + float32x4x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n1(); + + for (; K > 3; K -= 4) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; + float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2), + b3 = vld1_f32(sb + 4), b4 = vld1_f32(sb + 6); sb += 8; + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b2); + c3 = acc_expanded_m2n1(c3, a3, b3); + c4 = acc_expanded_m2n1(c4, a4, b4); + } + c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]); + c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]); + c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]); + c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]); + c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]); + c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]); + for (; K; K--) { + float32x4_t a1 = vld1q_f32(sa); sa += 4; + float32x2_t b1 = vld1_f32(sb); sb += 2; + c1 = acc_expanded_m2n1(c1, a1, b1); + } + + float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n1(C, c1, expanded_alpha); +} + +static inline float32x2x4_t expand_alpha_d(float alphar, float alphai) { + float32x2x4_t ret; + const float maskp[] = { -1, 1 }; + const float maskn[] = { 1, -1 }; + const float32x2_t vrevp = vld1_f32(maskp); + const float32x2_t vrevn = vld1_f32(maskn); +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ret.val[0] = vdup_n_f32(alphar); + ret.val[1] = vdup_n_f32(-alphai); + ret.val[2] = vmul_f32(ret.val[1], vrevn); + ret.val[3] = vmul_f32(ret.val[0], vrevp); +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + ret.val[0] = vdup_n_f32(alphar); + ret.val[1] = vdup_n_f32(alphai); + ret.val[2] = vmul_f32(ret.val[1], vrevp); + ret.val[3] = vmul_f32(ret.val[0], vrevn); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + ret.val[2] = vdup_n_f32(alphai); + ret.val[3] = vdup_n_f32(alphar); + ret.val[0] = vmul_f32(ret.val[3], vrevn); + ret.val[1] = vmul_f32(ret.val[2], vrevp); +#else + ret.val[2] = vdup_n_f32(alphai); + ret.val[3] = vdup_n_f32(-alphar); + ret.val[0] = vmul_f32(ret.val[3], vrevp); + ret.val[1] = vmul_f32(ret.val[2], vrevn); +#endif + return ret; +} + +static inline float32x2x2_t acc_expanded_m1n1(float32x2x2_t acc, + float32x2_t a, float32x2_t b) { + + acc.val[0] = vfma_lane_f32(acc.val[0], a, b, 0); + acc.val[1] = vfma_lane_f32(acc.val[1], a, b, 1); + return acc; +} + +static inline void store_expanded_m1n1(float *C, + float32x2x2_t acc, float32x2x4_t expanded_alpha) { + + float32x2_t ld1 = vld1_f32(C); + ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[0]); + acc.val[0] = vrev64_f32(acc.val[0]); + ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[1]); + acc.val[1] = vrev64_f32(acc.val[1]); + ld1 = vfma_f32(ld1, 
acc.val[0], expanded_alpha.val[2]); + ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[3]); + vst1_f32(C, ld1); +} + +static inline float32x2x2_t init_expanded_m1n1() { + float32x2x2_t ret = {{ vdup_n_f32(0), vdup_n_f32(0) }}; + return ret; +} + +static inline void kernel_1x4(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m1n1(); + + for (; K; K--) { + float32x2_t a1 = vld1_f32(sa); sa += 2; + c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); + c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); + c3 = acc_expanded_m1n1(c3, a1, vld1_f32(sb + 4)); + c4 = acc_expanded_m1n1(c4, a1, vld1_f32(sb + 6)); + sb += 8; + } + + float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai); + store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2; + store_expanded_m1n1(C, c2, expanded_alpha); C += LDC * 2; + store_expanded_m1n1(C, c3, expanded_alpha); C += LDC * 2; + store_expanded_m1n1(C, c4, expanded_alpha); +} + +static inline void kernel_1x2(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m1n1(); + + for (; K > 1; K -= 2) { + float32x2_t a1 = vld1_f32(sa), a2 = vld1_f32(sa + 2); sa += 4; + c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); + c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); + c3 = acc_expanded_m1n1(c3, a2, vld1_f32(sb + 4)); + c4 = acc_expanded_m1n1(c4, a2, vld1_f32(sb + 6)); + sb += 8; + } + c1.val[0] = vadd_f32(c1.val[0], c3.val[0]); + c1.val[1] = vadd_f32(c1.val[1], c3.val[1]); + c2.val[0] = vadd_f32(c2.val[0], c4.val[0]); + c2.val[1] = vadd_f32(c2.val[1], c4.val[1]); + if (K) { + float32x2_t a1 = vld1_f32(sa); + c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); + c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); + } + + float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai); + store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2; + store_expanded_m1n1(C, c2, expanded_alpha); +} + +static inline void kernel_1x1(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K) { + + float32x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m1n1(); + + for (; K > 3; K -= 4) { + c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb)); + c2 = acc_expanded_m1n1(c2, vld1_f32(sa + 2), vld1_f32(sb + 2)); + c3 = acc_expanded_m1n1(c3, vld1_f32(sa + 4), vld1_f32(sb + 4)); + c4 = acc_expanded_m1n1(c4, vld1_f32(sa + 6), vld1_f32(sb + 6)); + sa += 8; sb += 8; + } + c1.val[0] = vadd_f32(c1.val[0], c3.val[0]); + c1.val[1] = vadd_f32(c1.val[1], c3.val[1]); + c2.val[0] = vadd_f32(c2.val[0], c4.val[0]); + c2.val[1] = vadd_f32(c2.val[1], c4.val[1]); + c1.val[0] = vadd_f32(c1.val[0], c2.val[0]); + c1.val[1] = vadd_f32(c1.val[1], c2.val[1]); + for (; K; K--) { + c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb)); + sa += 2; sb += 2; + } + + store_expanded_m1n1(C, c1, expand_alpha_d(alphar, alphai)); +} + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, + FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { + + BLASLONG n_left = N; + for (; n_left >= 8; n_left -= 8) { + const FLOAT *a_ = sa; + FLOAT *c1_ = C; + FLOAT *c2_ = C + LDC * 8; + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + K * 8; + BLASLONG m_left = M; + for (; m_left >= 8; m_left -= 8) { + kernel_8x4(a_, b1_, c1_, alphar, alphai, K, LDC); + kernel_8x4(a_, b2_, c2_, alphar, alphai, K, LDC); + a_ += 16 * K; + c1_ += 16; + c2_ += 16; + } + if 
(m_left >= 4) { + m_left -= 4; + kernel_4x4(a_, b1_, c1_, alphar, alphai, K, LDC); + kernel_4x4(a_, b2_, c2_, alphar, alphai, K, LDC); + a_ += 8 * K; + c1_ += 8; + c2_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x4(a_, b1_, c1_, alphar, alphai, K, LDC); + kernel_2x4(a_, b2_, c2_, alphar, alphai, K, LDC); + a_ += 4 * K; + c1_ += 4; + c2_ += 4; + } + if (m_left) { + kernel_1x4(a_, b1_, c1_, alphar, alphai, K, LDC); + kernel_1x4(a_, b2_, c2_, alphar, alphai, K, LDC); + } + C += 16 * LDC; + sb += 16 * K; + } + + if (n_left >= 4) { + n_left -= 4; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 8; m_left -= 8) { + kernel_8x4(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 16 * K; + c_ += 16; + } + if (m_left >= 4) { + m_left -= 4; + kernel_4x4(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x4(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x4(a_, sb, c_, alphar, alphai, K, LDC); + } + C += 8 * LDC; + sb += 8 * K; + } + + if (n_left >= 2) { + n_left -= 2; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 8; m_left -= 8) { + kernel_8x2(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 16 * K; + c_ += 16; + } + if (m_left >= 4) { + m_left -= 4; + kernel_4x2(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x2(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x2(a_, sb, c_, alphar, alphai, K, LDC); + } + C += 4 * LDC; + sb += 4 * K; + } + + if (n_left) { + BLASLONG m_left = M; + for (; m_left >= 8; m_left -= 8) { + kernel_8x1(sa, sb, C, alphar, alphai, K); + sa += 16 * K; + C += 16; + } + if (m_left >= 4) { + m_left -= 4; + kernel_4x1(sa, sb, C, alphar, alphai, K); + sa += 8 * K; + C += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x1(sa, sb, C, alphar, alphai, K); + sa += 4 * K; + C += 4; + } + if (m_left) { + kernel_1x1(sa, sb, C, alphar, alphai, K); + } + } + return 0; +} + From c8d05aa7a5967745aa93c20740127063436bd663 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Dec 2021 08:34:52 +0100 Subject: [PATCH 581/681] Move the threads overflow flag under the protection of the local blas lock (#3476) * Move accesses to the overflow flag into the scope of the blas lock --- driver/others/memory.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 0a0b0eb3d..bd0553ca9 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2871,32 +2871,28 @@ void *blas_memory_alloc(int procpos){ position ++; } while (position < NUM_BUFFERS); -#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif + if (memory_overflowed) { -#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif - do { - RMB; + + do { + RMB; #if defined(USE_OPENMP) - if (!newmemory[position-NUM_BUFFERS].used) { - blas_lock(&newmemory[position-NUM_BUFFERS].lock); + if (!newmemory[position-NUM_BUFFERS].used) { + blas_lock(&newmemory[position-NUM_BUFFERS].lock); #endif - if (!newmemory[position-NUM_BUFFERS].used) goto allocation2; + if (!newmemory[position-NUM_BUFFERS].used) goto allocation2; #if defined(USE_OPENMP) - blas_unlock(&newmemory[position-NUM_BUFFERS].lock); - } + blas_unlock(&newmemory[position-NUM_BUFFERS].lock); + } #endif - 
position ++;
+      position ++;

-  } while (position < 512+NUM_BUFFERS);
+    } while (position < 512+NUM_BUFFERS);
+  }
 #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
   UNLOCK_COMMAND(&alloc_lock);
 #endif
-}
   goto error;

 allocation :
@@ -3001,6 +2997,9 @@ void *blas_memory_alloc(int procpos){
   return (void *)memory[position].addr;

  error:
+#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
+  LOCK_COMMAND(&alloc_lock);
+#endif
   if (memory_overflowed) goto terminate;
   fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
   memory_overflowed=1;
@@ -3014,7 +3013,6 @@ void *blas_memory_alloc(int procpos){
     newmemory[i].used = 0;
     newmemory[i].lock = 0;
   }
-  newmemory[position-NUM_BUFFERS].used = 1;

 allocation2:
   newmemory[position-NUM_BUFFERS].used = 1;
@@ -3086,6 +3084,9 @@ allocation2:
   return (void *)newmemory[position-NUM_BUFFERS].addr;

  terminate:
+#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
+  UNLOCK_COMMAND(&alloc_lock);
+#endif
   printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
   printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
   printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");

From a0cc119f2642778f63142514393db7f923a57445 Mon Sep 17 00:00:00 2001
From: Thomas De Schampheleire
Date: Tue, 14 Dec 2021 23:36:16 +0100
Subject: [PATCH 582/681] Makefile: also consider -O, -Og and -Os when
 stripping flags

gcc also supports -O, -Og and -Os as optimization flags. They may be
given on the make command-line by users. For the calculation of
LAPACK_NOOPT, all such flags should be considered.

Signed-off-by: Thomas De Schampheleire
[Retrieved from:
https://git.buildroot.net/buildroot/tree/package/openblas/0003-Makefile-also-consider-Os-when-determining-LAPACK_NO.patch]
Signed-off-by: Fabrice Fontaine
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 49fd57ff2..1bb3f6b90 100644
--- a/Makefile
+++ b/Makefile
@@ -32,7 +32,7 @@ export NOFORTRAN
 export NO_LAPACK
 endif

-LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
+LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))

 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test

From 92b7b949dd491136314b1870bbf16a692a68a1a4 Mon Sep 17 00:00:00 2001
From: Wu Zhigang
Date: Wed, 15 Dec 2021 00:22:19 -0800
Subject: [PATCH 583/681] fix bug in zscal function

memset cannot be used in zscal because of the stride parameter: when
inc_x is greater than 1 the elements to be cleared are not contiguous
in memory, so the zero fill has to be done with strided vector stores.

Signed-off-by: Wu Zhigang
---
 kernel/riscv64/zscal_vector.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c
index 02a76f168..64323aa3a 100644
--- a/kernel/riscv64/zscal_vector.c
+++ b/kernel/riscv64/zscal_vector.c
@@ -35,6 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFMULVF_FLOAT vfmul_vf_f32m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
#define VFMACCVF_FLOAT vfmacc_vf_f64m4 #define VFMULVF_FLOAT vfmul_vf_f64m4 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -58,7 +60,26 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F unsigned int gvl = 0; FLOAT_V_T vt, v0, v1; if(da_r == 0.0 && da_i == 0.0){ - memset(&x[0], 0, n * 2 * sizeof(FLOAT)); + gvl = VSETVL(n); + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG inc_xv = inc_x * 2 * gvl; + vt = VFMVVF_FLOAT(0.0, gvl); + for(i=0,j=0; i < n/(gvl*2); i++){ + VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); + VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); + VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl); + VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl); + + j += gvl*2; + ix += inc_xv*2; + } + for(; j < n; ){ + gvl = VSETVL(n-j); + VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); + VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); + j += gvl; + ix += inc_x * 2 * gvl; + } }else if(da_r == 0.0){ gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); From f3b51ec6086837de4a9a4102354eaaaaa43342cd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 16 Dec 2021 09:37:58 +0100 Subject: [PATCH 584/681] move brace inside the ifdef block --- cpuid_mips64.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 8d6a1d93d..97743bc43 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -104,17 +104,17 @@ int detect(void){ } } fclose(infile); - if(p != NULL){ - if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ - return CPU_LOONGSON3R3; - }else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ - return CPU_LOONGSON3R4; - } else{ - return CPU_SICORTEX; + if (p != NULL){ + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ + return CPU_LOONGSON3R3; + } else if (strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ + return CPU_LOONGSON3R4; + } else{ + return CPU_SICORTEX; + } } #endif return CPU_UNKNOWN; - } } char *get_corename(void){ From b31349c22ad86f8aff3da5a5915b8d20861a1cd6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 16 Dec 2021 16:58:12 +0100 Subject: [PATCH 585/681] Open up delayed (re)init to non-Cygwin OS as well --- driver/others/blas_server_win32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 42f289441..874590a2b 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -391,8 +391,9 @@ int blas_thread_init(void){ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ -#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) +#if defined(SMP_SERVER) // Handle lazy re-init of the thread-pool after a POSIX fork + // on Cygwin or as delayed init when a static library is used if (unlikely(blas_server_avail == 0)) blas_thread_init(); #endif From 7a7fbb11c36db9dcc0348c9267a5b9957d85aeea Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 16 Dec 2021 17:28:28 +0100 Subject: [PATCH 586/681] define "unlikely" on non-cygwin too --- driver/others/blas_server_win32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 874590a2b..33b58f134 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ 
-40,7 +40,7 @@ #include #include "common.h" -#if defined(OS_CYGWIN_NT) && !defined(unlikely) +#if !defined(unlikely) #ifdef __GNUC__ #define unlikely(x) __builtin_expect(!!(x), 0) #else From 6ed52576f8d68d1476329160ed32cc0ff46ec751 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 16 Dec 2021 22:02:49 +0100 Subject: [PATCH 587/681] Add feature-based fallback for unknown x86_64 cpus --- driver/others/dynamic.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 653915aab..b12fb069a 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -1061,7 +1061,13 @@ void gotoblas_dynamic_init(void) { #ifdef ARCH_X86 if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; #else - if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; + if (gotoblas == NULL) { + if (support_avx512_bf16()) gotoblas = &gotoblas_COOPERLAKE; + else if (support_avx512()) gotoblas = &gotoblas_SKYLAKEX; + else if (support_avx2()) gotoblas = &gotoblas_HASWELL; + else if (support_avx()) gotoblas = &gotoblas_SANDYBRIDGE; + else gotoblas = &gotoblas_PRESCOTT; + } /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ if (sizeof(void*) == 8) { if (gotoblas == &gotoblas_KATMAI || From ed430cd963c177a895777819a4df88c4215da092 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Dec 2021 21:56:26 +0100 Subject: [PATCH 588/681] Update -tp option for recent nvfortran on x86_64 --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index b8824fe51..97fdc3f91 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1122,8 +1122,12 @@ FCOMMON_OPT += -i8 endif endif ifeq ($(ARCH), x86_64) +ifneq ($(NEWPGI2),1) FCOMMON_OPT += -tp p7-64 else +FCOMMON_OPT += -tp px +endif +else ifeq ($(ARCH), power) ifeq ($(CORE), POWER6) $(warning NVIDIA HPC compilers do not support POWER6.) 
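
The three preceding patches are small, but together they change how a statically
linked or dynamically dispatched OpenBLAS starts up: patch 585 lets
exec_blas_async create the Windows thread pool lazily on first use, patch 586
makes the unlikely() branch hint that guards that check available outside
Cygwin, and patch 587 lets DYNAMIC_ARCH builds fall back from an unknown x86_64
CPUID to the closest supported kernel set instead of the Prescott baseline.
The following is a minimal, self-contained C sketch of the combined idiom, not
OpenBLAS source: the support_*() functions are stubs standing in for the
CPUID-based helpers that driver/others/dynamic.c already provides, and the core
names simply echo the fallback order chosen in patch 587.

#include <stdio.h>

#if !defined(unlikely)
#ifdef __GNUC__
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define unlikely(x) (x)
#endif
#endif

/* Stubs for the real CPUID-based feature checks; the values here just
   simulate a machine with AVX2 but no AVX-512. */
static int support_avx512_bf16(void) { return 0; }
static int support_avx512(void)      { return 0; }
static int support_avx2(void)        { return 1; }
static int support_avx(void)         { return 1; }

static const char *core = NULL;  /* kernel set selected at runtime */

/* Newest-first ladder, mirroring the fallback added in patch 587. */
static void select_core(void) {
  if (support_avx512_bf16())  core = "COOPERLAKE";
  else if (support_avx512())  core = "SKYLAKEX";
  else if (support_avx2())    core = "HASWELL";
  else if (support_avx())     core = "SANDYBRIDGE";
  else                        core = "PRESCOTT";
}

/* Hot entry point: the initialization test is almost always false, so
   it is hinted cold, as exec_blas_async does after patches 585/586. */
static void blas_call(void) {
  if (unlikely(core == NULL)) select_core();
  printf("dispatching to the %s kernels\n", core);
}

int main(void) {
  blas_call();  /* first call pays for the probe */
  blas_call();  /* later calls see one predicted-not-taken branch */
  return 0;
}

The probe order is the design point of patch 587: each rung of the ladder
implies all the rungs below it (a CPU with AVX-512 BF16 also has AVX-512, AVX2
and AVX), so testing newest-first always selects the most capable kernels the
machine can actually execute before giving up and using the generic Prescott
code.
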
From b6001a2ee342f7b8e2c7f8d92e3487b82653b3ec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Dec 2021 14:34:14 +0100 Subject: [PATCH 589/681] Update with 0.3.19 changes --- Changelog.txt | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 59fe1d45e..180f7adec 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,51 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.19 + 19-Dec-2021 + + general: + - reverted unsafe TRSV/ZRSV optimizations introduced in 0.3.16 + - fixed a potential thread race in the thread buffer reallocation routines + that were introduced in 0.3.18 + - fixed miscounting of thread pool size on Linux with OMP_PROC_BIND=TRUE + - fixed CBLAS interfaces for CSROT/ZSROT and CROTG/ZROTG + - made automatic library suffix for CMAKE builds with INTERFACE64 available + to CBLAS-only builds + +x86_64: + - DYNAMIC_ARCH builds now fall back to the cpu with most similar capabilities + when an unknown CPUID is encountered, instead of defaulting to Prescott + - added cpu detection for Intel Alder Lake + - added cpu detection for Intel Sapphire Rapids + - added an optimized SBGEMM kernel for Sapphire Rapids + - fixed DYNAMIC_ARCH builds on OSX with CMAKE + - worked around DYNAMIC_ARCH builds made on Sandybridge failing on SkylakeX + - fixed missing thread initialization for static builds on Windows/MSVC + - fixed an excessive read in ZSYMV + +POWER: + - added support for POWER10 in big-endian mode + - added support for building with CMAKE + - added optimized SGEMM and DGEMM kernels for small matrix sizes + +ARMV8: + - added basic support and cputype detection for Fujitsu A64FX + - added a generic ARMV8SVE target + - added SVE-enabled SGEMM and DGEMM kernels for ARMV8SVE and A64FX + - added optimized CGEMM and ZGEMM kernels for Cortex A53 and A55 cpus + - fixed cpuid detection for Apple M1 and improved performance + - improved compiler flag setting in CMAKE builds + +RISCV64: + - fixed improper initialization in CSCAL/ZSCAL for strided access patterns + +MIPS: + - added a GENERIC target for MIPS32 + - added support for cross-compiling to MIPS32 on x86_64 using CMAKE + +MIPS64: + - fixed misdetection of MSA capability + ==================================================================== Version 0.3.18 02-Oct-2021 From 6025daca63a14af14d8ce7714c8abac953b26bf7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Dec 2021 16:32:04 +0100 Subject: [PATCH 590/681] Update version to 0.3.19 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e63f7e04c..6f2046a3d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 18) +set(OpenBLAS_PATCH_VERSION 19) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 2480e5046e3b0120da8a7fd1442eca628df55f87 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Dec 2021 20:55:57 +0100 Subject: [PATCH 591/681] Update version to 0.3.19 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 500b7c44f..3359860b9 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 
0.3.18.dev +VERSION = 0.3.19 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 894fde9bfe36fe1988b595d3529a7f808a5a6534 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Dec 2021 21:21:47 +0100 Subject: [PATCH 592/681] Update version to 0.3.19.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c1d69da13..913017c63 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 19) +set(OpenBLAS_PATCH_VERSION 19.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") From 8cec83bdfb82effda2075309af5ca36df79f1a8e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 19 Dec 2021 21:22:19 +0100 Subject: [PATCH 593/681] Update version to 0.3.19.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 3359860b9..4b4b9bcf9 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.19 +VERSION = 0.3.19.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 8d9b9c6b2a6f015cafcf3e0e568874a1aabcc223 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 21 Dec 2021 09:22:59 +0800 Subject: [PATCH 594/681] loongarch64: Optimize dgemm_kernel --- kernel/loongarch64/KERNEL.LOONGSON3R5 | 15 +- kernel/loongarch64/dgemm_kernel_16x4.S | 4250 ++++++++++++++++++++++++ kernel/loongarch64/dgemm_ncopy_16.S | 691 ++++ kernel/loongarch64/dgemm_ncopy_4.S | 237 ++ kernel/loongarch64/dgemm_tcopy_16.S | 710 ++++ kernel/loongarch64/dgemm_tcopy_4.S | 270 ++ param.h | 10 +- 7 files changed, 6177 insertions(+), 6 deletions(-) create mode 100644 kernel/loongarch64/dgemm_kernel_16x4.S create mode 100644 kernel/loongarch64/dgemm_ncopy_16.S create mode 100644 kernel/loongarch64/dgemm_ncopy_4.S create mode 100644 kernel/loongarch64/dgemm_tcopy_16.S create mode 100644 kernel/loongarch64/dgemm_tcopy_4.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index cce4093e3..bb0441ab2 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -1 +1,14 @@ -#TODO: Add loongarch64 SIMD optimizations +DGEMMKERNEL = dgemm_kernel_16x4.S +DGEMMINCOPY = dgemm_ncopy_16.S +DGEMMITCOPY = dgemm_tcopy_16.S +DGEMMONCOPY = dgemm_ncopy_4.S +DGEMMOTCOPY = dgemm_tcopy_4.S +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c diff --git a/kernel/loongarch64/dgemm_kernel_16x4.S b/kernel/loongarch64/dgemm_kernel_16x4.S new file mode 100644 index 000000000..13faa977e --- /dev/null +++ b/kernel/loongarch64/dgemm_kernel_16x4.S @@ -0,0 +1,4250 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA $f0 // param 4: alpha +#define A $r7 // param 5: ba +#define B $r8 // param 6: bb +#define C $r9 // param 7: bc +#define LDC $r10 // param 8: ldc + +#ifdef TRMMKERNEL +#define OFFSET $r11 // param 9: offset +#endif +#define OFF $r12 + +/* Loop control parameters */ +#define I $r13 +#define J $r14 +#define L $r15 +#define TL $r16 +/* Matrix address */ +#define A0 $r17 +#define B0 $r18 +#define C0 $r19 +#define C1 $r20 +#define C2 $r23 +#define C3 $r24 +#define T0 $r25 /* !! DO NOT USE $r21 and $r22 !! */ +#define T1 $r26 +#define T2 $r27 +#define ZERO $r0 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define D0 $xr7 +#define D1 $xr8 +#define D2 $xr9 +#define D3 $xr10 +#define D4 $xr11 +#define D5 $xr12 +#define D6 $xr13 +#define D7 $xr14 +#define D8 $xr15 +#define D9 $xr16 +#define D10 $xr17 +#define D11 $xr18 +#define D12 $xr19 +#define D13 $xr20 +#define D14 $xr21 +#define D15 $xr22 +#define VALPHA $xr23 + +/* Prefetch interval */ +#define A_PRE 0x200 +#define B_PRE 0x100 + + PROLOGUE + + addi.d $sp, $sp, -56 + /* Store regs */ + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST ALPHA, $sp, 48 + + /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */ + xvld VALPHA, $sp, 48 + xvreplve0.d VALPHA, VALPHA + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif + + /* if (!(N >> 2)) goto L_N3 */ + srai.d J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + beq ZERO, J, .L_N3 + +.L_J1: /* J-- && This loop includes Condition 1 */ + +/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! 
************************* +* dgemm_core_16x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M8 + +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoiding a separate set-to-zero pass + * Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + preld 0, C0, 0x40 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U4, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + preld 0, C1, 0x40 + xvfmul.d D6, U2, U4 + xvfmul.d D7, U3, U4 + + xvldrepl.d U4, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + xvfmul.d D8, U0, U4 + xvfmul.d D9, U1, U4 + preld 0, C2, 0x40 + xvfmul.d D10, U2, U4 + xvfmul.d D11, U3, U4 + + xvldrepl.d U4, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + preld 0, C3, 0x40 + xvfmul.d D14, U2, U4 + xvfmul.d D15, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + /* Calculate 8 sets of D0~D15 */ +.L_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, 
D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-7***/ + /* Load 16 * 64 from 
A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_TL1 + + /* Maybe we need to calculate the last + * 7 sets of D0~D15? + */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_L0 + +.L_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_L71 + +.L_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D10, D10, VALPHA + xvfmul.d D11, D11, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA + xvfmul.d D14, D14, VALPHA + xvfmul.d D15, D15, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvld U2, C1, 0x40 + xvld U3, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + xvfmadd.d D6, D6, VALPHA, U2 + xvfmadd.d D7, 
D7, VALPHA, U3 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvld U2, C2, 0x40 + xvld U3, C2, 0x60 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + xvfmadd.d D10, D10, VALPHA, U2 + xvfmadd.d D11, D11, VALPHA, U3 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvld U2, C3, 0x40 + xvld U3, C3, 0x60 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 + xvfmadd.d D14, D14, VALPHA, U2 + xvfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + xvst D10, C2, 0x40 + xvst D11, C2, 0x60 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + xvst D14, C3, 0x40 + xvst D15, C3, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + addi.d C2, C2, 0x80 + addi.d C3, C3, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -16 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 + +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_M0 + + andi I, M, 8 + beq ZERO,I, .L_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif // #if defined(TRMMKERNEL) + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + xvfmul.d D9, U1, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_M8_L7 + +.L_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + 
xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M8_TL1 + +.L_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M8_L0 + +.L_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M8_L71 + +.L_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, 
VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 8)) End************/ + +.L_M4: + andi I, M, 4 + beq ZERO,I, .L_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 + +.L_M4_TL1: /* TL-- */ + /***8-1***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, 
B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M4_TL1 + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 + +.L_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 + +.L_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + /* Store C2 */ + xvst D8, C2, 0x00 + /* Store C3 */ + xvst D12, C3, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ + +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 
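+ /* In C-style pseudocode (illustrative sketch only): the two shifts above skip the OFF k-steps of the packed panels, i.e. roughly A0 += OFF * 2 * sizeof(double) (OFF << 4 bytes, two doubles per k-step of the 2-wide A panel) and B0 = B + OFF * 4 * sizeof(double) (OFF << 5 bytes, four doubles per k-step of the 4-wide B panel). */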
+#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + +.L_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 + +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 + +.L_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, 
U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 + +.L_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + xvstelm.d D4, C1, 0x08, 0x01 + xvstelm.d D8, C2, 0x08, 0x01 + xvstelm.d D12, C3, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ + +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 + +.L_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + 
xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 + +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 + +.L_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M1_L71 + +.L_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 1) ) End************/ + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + 
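/* Illustrative C sketch (hypothetical names, pointers counted in doubles): b += k * 4; c += ldc * 4; which is the K << 5 and LDC << 5 byte arithmetic below. */ +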
/* since the array type is double, + * so we must mul 32 + */ + slli.d T0, K, 5 + slli.d T1, LDC, 5 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x04 +#endif + + blt ZERO, J, .L_J1 + +//////////////// go back to L_J1 ///////////////// +///////////////////////////////////////////////// +/************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/ + +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +/************************* Condition 2 if((N & 2) && (M >> 4)) START !!! ************************* +* dgemm_core_16x2 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N3_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N3_M8 + +.L_N3_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + xvfmul.d D6, U2, U4 + xvfmul.d D7, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 + +.L_N3_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 
0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-7***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 + +.L_N3_L7: + /* if (!(L & 7)) goto L_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_L0 + +.L_N3_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_L71 + +.L_N3_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvld U2, C1, 0x40 + xvld U3, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + xvfmadd.d D6, D6, VALPHA, U2 + xvfmadd.d D7, D7, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 
0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -16 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N3_I1 + +.L_N3_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N3_M0 + + andi I, M, 8 + beq ZERO,I, .L_N3_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M8_L7 */ + beq ZERO,TL, .L_N3_M8_L7 + +.L_N3_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + 
xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M8_TL1 + +.L_N3_M8_L7: + /* if (!(L & 7)) goto L_N3_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M8_L0 + +.L_N3_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M8_L71 + +.L_N3_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2) && (M & 8) ) End************/ + +.L_N3_M4: + andi I, M, 4 + beq ZERO,I, .L_N3_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 + +.L_N3_M4_TL1: /* TL-- */ + /***8-1***/ + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + 
xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 + +.L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M4_L0 + +.L_N3_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M4_L71 + +.L_N3_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 4) ) End************/ + +.L_N3_M2: + andi I, M, 2 + beq ZERO,I, .L_N3_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + +.L_N3_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 
0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 + +.L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M2_L0 + +.L_N3_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M2_L71 + +.L_N3_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + xvstelm.d D4, C1, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 2) ) End************/ + +.L_N3_M1: + andi I, M, 1 + beq ZERO,I, .L_N3_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + 
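/* Clarifying comment: the first k-step was peeled into the xvfmul initializers above, so the main loop below makes (L-1) >> 3 passes over an 8x-unrolled body and .L_N3_M1_L71 finishes the remaining (L-1) & 7 k-steps one at a time. */ +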
srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 + +.L_N3_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 + +.L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M1_L0 + +.L_N3_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M1_L71 + +.L_N3_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 1) ) End************/ + +.L_N3_M0: + /* Add stride for B and C + * B += (K * 16) + * C += (LDC * 16) + */ + /* since the array type is double, + * so we must mul 16 + */ + slli.d T0, K, 4 + slli.d T1, LDC, 4 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x02 +#endif + + /* We must reinit I */ + srai.d I, M, 4 /* I = bm >> 4 */ + +/************************* Condition 2 if((N & 2) && (M >> 4)) End !!! 
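+* Note (editorial): the slli.d/add.d pair at .L_N3_M0 above steps past the
+* two columns just processed; the shift by 4 is 2 columns times
+* sizeof(double) == 8 bytes.  Equivalent C (pointer names illustrative):
+*     B += 2 * K;    // next 2-wide packed panel of B
+*     C += 2 * LDC;  // next two output columns of C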
************************* +* dgemm_core_16x2 */ + +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + +/************************* Condition 3 if((N & 1) && (M >> 4)) START !!! ************************* +* dgemm_core_16x1 */ + + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N1_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N1_M8 + +.L_N1_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_L7 */ + beq ZERO,TL, .L_N1_L7 + +.L_N1_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-7***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d 
D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_TL1 + +.L_N1_L7: + /* if (!(L & 7)) goto L_N1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_L0 + +.L_N1_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_L71 + +.L_N1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -16 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 + +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N1_M0 + + andi I, M, 8 + beq ZERO,I, .L_N1_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M8_L7 */ + beq ZERO,TL, .L_N1_M8_L7 + +.L_N1_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, 
D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M8_TL1 + +.L_N1_M8_L7: + /* if (!(L & 7)) goto L_N1_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M8_L0 + +.L_N1_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M8_L71 + +.L_N1_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 8) ) End************/ + +.L_N1_M4: + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 + +.L_N1_M4_TL1: /* TL-- */ + /***8-1***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 
0x20 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 + +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 + +.L_N1_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 + +.L_N1_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 4) ) End************/ + +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + +.L_N1_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + 
addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 + +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 + +.L_N1_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 + +.L_N1_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 2) ) End************/ + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + +.L_N1_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 + +.L_N1_M1_L7: + /* 
if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 + +.L_N1_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 1) ) End************/ + +.L_N1_M0: + +/************************* Condition 3 if((N & 1) && (M >> 4)) End !!! ************************* +* dgemm_core_16x1 */ + +.L_N0: + /* Restore regs */ + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LD $f23, $sp, 40 + addi.d $sp, $sp, 56 + + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_16.S b/kernel/loongarch64/dgemm_ncopy_16.S new file mode 100644 index 000000000..95c879031 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_16.S @@ -0,0 +1,691 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
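+
+Note on this file (illustrative sketch, names not from the source): it
+packs a column-major double-precision matrix into the 16-wide "ncopy"
+panel layout consumed by the dgemm kernel above.  A reference sketch of
+the same layout in C (cf. the generic kernel/generic/gemm_ncopy_16.c):
+
+    for (j = 0; j + 16 <= n; j += 16)
+        for (i = 0; i < m; i++)
+            for (k = 0; k < 16; k++)
+                *b++ = a[i + (j + k) * lda];
+
+Remaining columns fall through to the n & 8 / 4 / 2 / 1 paths below, and
+the 4-wide variant dgemm_ncopy_4.S later in this patch follows the same
+scheme with 4-column groups.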
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define S11 $r24 +#define S12 $r25 +#define S13 $r26 +#define S14 $r27 +#define S15 $r28 +#define S16 $r29 +#define TD $r30 +#define TS $r31 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 + + PROLOGUE + + addi.d $sp, $sp, -0x90 + SDARG $r23, $sp, 0x00 + SDARG $r24, $sp, 0x08 + SDARG $r25, $sp, 0x10 + SDARG $r26, $sp, 0x18 + SDARG $r27, $sp, 0x20 + SDARG $r28, $sp, 0x28 + SDARG $r29, $sp, 0x30 + SDARG $r30, $sp, 0x38 + SDARG $r31, $sp, 0x40 + ST $f23, $sp, 0x48 + ST $f24, $sp, 0x50 + ST $f25, $sp, 0x58 + ST $f26, $sp, 0x60 + ST $f27, $sp, 0x68 + ST $f28, $sp, 0x70 + ST $f29, $sp, 0x78 + ST $f30, $sp, 0x80 + ST $f31, $sp, 0x88 + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x04 + beq J, ZERO, .L_N8 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + addi.d J, J, -1 + add.d S4, S3, TL + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S9, S7, T0 + add.d S10, S8, T0 + add.d S11, S9, T0 + add.d S12, S10, T0 + add.d S13, S11, T0 + add.d S14, S12, T0 + add.d S15, S13, T0 + add.d S16, S14, T0 + add.d TS, S15, T0 + beq I, ZERO, .L_I7 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + xvld U8, S9, 0x00 + xvld U9, S10, 0x00 + xvld U10, S11, 0x00 + xvld U11, S12, 0x00 + xvld U12, S13, 0x00 + xvld U13, S14, 0x00 + xvld U14, S15, 0x00 + xvld U15, S16, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvpackev.d D8, U9, U8 + xvpackod.d D9, U9, U8 + xvpackev.d D10, U11, U10 + xvpackod.d D11, U11, U10 + xvpackev.d D12, U13, U12 + xvpackod.d D13, U13, U12 + xvpackev.d D14, U15, U14 + xvpackod.d D15, U15, U14 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 4 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 5 + xvpermi.q D2, U0, 0x31 // 8 + xvpermi.q D6, U4, 0x31 // 9 + xvpermi.q D3, U1, 0x31 // 12 + xvpermi.q D7, U5, 0x31 // 13 + + xvand.v U8, D8, D8 + xvpermi.q D8, D10, 0x02 // 2 + 
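+ /* Note (editorial): xvand.v Ux, Dx, Dx serves as a plain 256-bit register
+  * copy, preserving the value that the following xvpermi.q overwrites; the
+  * paired xvpermi.q ..., 0x02 and ..., 0x31 operations then splice the low
+  * and high 128-bit halves of two registers, completing the transpose of
+  * each 4x4 block of doubles begun by xvpackev.d/xvpackod.d. */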
xvand.v U12, D12, D12 + xvpermi.q D12, D14, 0x02 // 3 + xvand.v U9, D9, D9 + xvpermi.q D9, D11, 0x02 // 6 + xvand.v U13, D13, D13 + xvpermi.q D13, D15, 0x02 // 7 + xvpermi.q D10, U8, 0x31 // 10 + xvpermi.q D14, U12, 0x31 // 11 + xvpermi.q D11, U9, 0x31 // 14 + xvpermi.q D15, U13, 0x31 // 15 + + xvst D0, TD, 0x00 // 0 + xvst D4, TD, 0x20 // 1 + xvst D8, TD, 0x40 // 2 + xvst D12, TD, 0x60 // 3 + xvst D1, TD, 0x80 // 4 + xvst D5, TD, 0xA0 // 5 + xvst D9, TD, 0xC0 // 6 + xvst D13, TD, 0xE0 // 7 + addi.d TD, TD, 0x100 + xvst D2, TD, 0x00 // 8 + xvst D6, TD, 0x20 // 9 + xvst D10, TD, 0x40 // 10 + xvst D14, TD, 0x60 // 11 + xvst D3, TD, 0x80 // 12 + xvst D7, TD, 0xA0 // 13 + xvst D11, TD, 0xC0 // 14 + xvst D15, TD, 0xE0 // 15 + addi.d TD, TD, 0x100 + + xvld U0, S1, 0x20 + xvld U1, S2, 0x20 + xvld U2, S3, 0x20 + xvld U3, S4, 0x20 + xvld U4, S5, 0x20 + xvld U5, S6, 0x20 + xvld U6, S7, 0x20 + xvld U7, S8, 0x20 + xvld U8, S9, 0x20 + xvld U9, S10, 0x20 + xvld U10, S11, 0x20 + xvld U11, S12, 0x20 + xvld U12, S13, 0x20 + xvld U13, S14, 0x20 + xvld U14, S15, 0x20 + xvld U15, S16, 0x20 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvpackev.d D8, U9, U8 + xvpackod.d D9, U9, U8 + xvpackev.d D10, U11, U10 + xvpackod.d D11, U11, U10 + xvpackev.d D12, U13, U12 + xvpackod.d D13, U13, U12 + xvpackev.d D14, U15, U14 + xvpackod.d D15, U15, U14 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 4 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 5 + xvpermi.q D2, U0, 0x31 // 8 + xvpermi.q D6, U4, 0x31 // 9 + xvpermi.q D3, U1, 0x31 // 12 + xvpermi.q D7, U5, 0x31 // 13 + + xvand.v U8, D8, D8 + xvpermi.q D8, D10, 0x02 // 2 + xvand.v U12, D12, D12 + xvpermi.q D12, D14, 0x02 // 3 + xvand.v U9, D9, D9 + xvpermi.q D9, D11, 0x02 // 6 + xvand.v U13, D13, D13 + xvpermi.q D13, D15, 0x02 // 7 + xvpermi.q D10, U8, 0x31 // 10 + xvpermi.q D14, U12, 0x31 // 11 + xvpermi.q D11, U9, 0x31 // 14 + xvpermi.q D15, U13, 0x31 // 15 + + xvst D0, TD, 0x00 // 0 + xvst D4, TD, 0x20 // 1 + xvst D8, TD, 0x40 // 2 + xvst D12, TD, 0x60 // 3 + xvst D1, TD, 0x80 // 4 + xvst D5, TD, 0xA0 // 5 + xvst D9, TD, 0xC0 // 6 + xvst D13, TD, 0xE0 // 7 + addi.d TD, TD, 0x100 + xvst D2, TD, 0x00 // 8 + xvst D6, TD, 0x20 // 9 + xvst D10, TD, 0x40 // 10 + xvst D14, TD, 0x60 // 11 + xvst D3, TD, 0x80 // 12 + xvst D7, TD, 0xA0 // 13 + xvst D11, TD, 0xC0 // 14 + xvst D15, TD, 0xE0 // 15 + addi.d TD, TD, 0x100 + + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d S9, S9, 0x40 + addi.d S10, S10, 0x40 + addi.d S11, S11, 0x40 + addi.d S12, S12, 0x40 + addi.d S13, S13, 0x40 + addi.d S14, S14, 0x40 + addi.d S15, S15, 0x40 + addi.d S16, S16, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I7: + andi I, M, 0x07 + beq I, ZERO, .L_I0 + +.L_II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 
0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + fld.d F0, S9, 0x00 + fld.d F1, S10, 0x00 + fld.d F2, S11, 0x00 + fld.d F3, S12, 0x00 + fld.d F4, S13, 0x00 + fld.d F5, S14, 0x00 + fld.d F6, S15, 0x00 + fld.d F7, S16, 0x00 + + fst.d F0, TD, 0x00 + addi.d S9, S9, 0x08 + fst.d F1, TD, 0x08 + addi.d S10, S10, 0x08 + fst.d F2, TD, 0x10 + addi.d S11, S11, 0x08 + fst.d F3, TD, 0x18 + addi.d S12, S12, 0x08 + fst.d F4, TD, 0x20 + addi.d S13, S13, 0x08 + fst.d F5, TD, 0x28 + addi.d S14, S14, 0x08 + fst.d F6, TD, 0x30 + addi.d S15, S15, 0x08 + fst.d F7, TD, 0x38 + addi.d S16, S16, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_II1 + +.L_I0: + blt ZERO, J, .L_J1 + +.L_N8: + andi J, N, 0x08 + beq ZERO, J, .L_N4 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d TS, S7, T0 + beq I, ZERO, .L_8I3 + +.L_8I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 2 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 3 + xvpermi.q D2, U0, 0x31 // 4 + xvpermi.q D6, U4, 0x31 // 5 + xvpermi.q D3, U1, 0x31 // 6 + xvpermi.q D7, U5, 0x31 // 7 + + xvst D0, TD, 0x00 + xvst D4, TD, 0x20 + xvst D1, TD, 0x40 + xvst D5, TD, 0x60 + xvst D2, TD, 0x80 + xvst D6, TD, 0xA0 + xvst D3, TD, 0xC0 + xvst D7, TD, 0xE0 + addi.d TD, TD, 0x100 + + xvld U0, S1, 0x20 + xvld U1, S2, 0x20 + xvld U2, S3, 0x20 + xvld U3, S4, 0x20 + xvld U4, S5, 0x20 + xvld U5, S6, 0x20 + xvld U6, S7, 0x20 + xvld U7, S8, 0x20 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 2 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 3 + xvpermi.q D2, U0, 0x31 // 4 + xvpermi.q D6, U4, 0x31 // 5 + xvpermi.q D3, U1, 0x31 // 6 + xvpermi.q D7, U5, 0x31 // 7 + + xvst D0, TD, 0x00 + xvst D4, TD, 0x20 + xvst D1, TD, 0x40 + xvst D5, TD, 0x60 + xvst D2, TD, 0x80 + xvst D6, TD, 0xA0 + xvst D3, TD, 0xC0 + xvst D7, TD, 0xE0 + addi.d TD, TD, 0x100 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_8I1 + +.L_8I3: + andi I, M, 0x07 + beq I, ZERO, .L_N4 + +.L_8I11: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + + addi.d TD, TD, 
0x40 + addi.d I, I, -1 + blt ZERO, I, .L_8I11 + +.L_N4: + andi J, N, 0x04 + beq ZERO, J, .L_N2 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + beq I, ZERO, .L_I3 + +.L_4I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 1 + xvpermi.q D2, U0, 0x31 // 2 + xvpermi.q D3, U1, 0x31 // 3 + + xvst D0, TD, 0x00 + xvst D1, TD, 0x20 + xvst D2, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_4I1 + +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_N2 + +.L_4II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_4II1 + +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_NI1 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + + xvpermi.q D0, D1, 0x02 // 0 + + xvst D0, TD, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 + +.L_NI1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 + +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 + +.L_N0: + LDARG $r23, $sp, 0x00 + LDARG $r24, $sp, 0x08 + LDARG $r25, $sp, 0x10 + LDARG $r26, $sp, 0x18 + LDARG $r27, $sp, 0x20 + LDARG $r28, $sp, 0x28 + LDARG $r29, $sp, 0x30 + LDARG $r30, $sp, 0x38 + LDARG $r31, $sp, 0x40 + LD $f23, $sp, 0x48 + LD $f24, $sp, 0x50 + LD $f25, $sp, 0x58 + LD $f26, $sp, 0x60 + LD $f27, $sp, 0x68 + LD $f28, $sp, 0x70 + LD $f29, $sp, 0x78 + LD $f30, $sp, 0x80 + LD $f31, $sp, 0x88 + addi.d $sp, $sp, 0x90 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_4.S b/kernel/loongarch64/dgemm_ncopy_4.S new file mode 100644 index 000000000..b1f322a06 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_4.S @@ -0,0 +1,237 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr14 +#define D1 $xr8 +#define D2 $xr9 +#define D3 $xr10 +#define D4 $xr11 +#define D5 $xr12 +#define D6 $xr13 +#define D7 $xr15 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x02 + beq J, ZERO, .L_N2 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + addi.d J, J, -1 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 1 + xvpermi.q D2, U0, 0x31 // 2 + xvpermi.q D3, U1, 0x31 // 3 + + xvst D0, TD, 0x00 + xvst D1, TD, 0x20 + xvst D2, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_I0 + +.L_II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_II1 + +.L_I0: + blt ZERO, J, .L_J1 + +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d TS, S2, TL + beq I, ZERO, .L_2I3 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + + xvand.v U0, D0, D0 + xvpermi.q D0, D1, 0x02 // 0 + xvpermi.q D1, U0, 0x31 // 1 + + xvst D0, TD, 0x00 + xvst D1, TD, 0x20 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d TD, TD, 
0x40 + addi.d I, I, -1 + blt ZERO, I, .L_2I1 + +.L_2I3: + andi I, M, 0x03 + beq ZERO, I, .L_N1 + +.L_2II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + fst.d F1, TD, 0x08 + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + blt ZERO, I, .L_2II1 + +.L_N1: + andi J, N, 0x01 + beq ZERO, J, .L_N0 + + move S1, TS + srai.d I, M, 0x02 + beq ZERO, I, .L_1I3 + +.L_1I1: + xvld U0, S1, 0x00 + addi.d S1, S1, 0x20 + xvst U0, TD, 0x00 + addi.d I, I, -1 + addi.d TD, TD, 0x20 + blt ZERO, I, .L_1I1 + +.L_1I3: + andi I, M, 0x03 + beq ZERO, I, .L_N0 + +.L_1II1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + addi.d TD, TD, 0x08 + blt ZERO, I, .L_1II1 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_16.S b/kernel/loongarch64/dgemm_tcopy_16.S new file mode 100644 index 000000000..afafe5b37 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_16.S @@ -0,0 +1,710 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
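+
+Note on this file (illustrative sketch, names not from the source): it
+packs the transposed ("tcopy") panels for the dgemm kernel.  Rows of the
+source matrix are lda elements apart and contiguous along n; each 16-wide
+column block is copied row by row.  A C sketch of the layout (cf. the
+generic kernel/generic/gemm_tcopy_16.c):
+
+    for (j = 0; j + 16 <= n; j += 16)
+        for (i = 0; i < m; i++)
+            memcpy(&b[j * m + i * 16], &a[i * lda + j],
+                   16 * sizeof(double));
+
+The n & 8 / 4 / 2 / 1 leftovers go to separate output sections; their base
+pointers (P2..P5 below) start at b + m * (n & ~15), b + m * (n & ~7), and
+so on.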
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + + PROLOGUE + + addi.d $sp, $sp, -56 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x04 + srai.d T1, N, 0x03 + slli.d T0, T0, 0x04 + slli.d T1, T1, 0x03 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x02 + srai.d T1, N, 0x01 + slli.d T0, T0, 0x02 + slli.d T1, T1, 0x01 + mul.d P4, M, T0 + mul.d P5, M, T1 + slli.d P4, P4, 0x03 + slli.d P5, P5, 0x03 + add.d P4, DST, P4 + add.d P5, DST, P5 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x07 + beq ZERO, J, .L_M7 + +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x400 + + srai.d I, N, 0x04 + addi.d J, J, -1 + beq ZERO, I, .L_N15 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S5, 0x40 + xvld U3, S5, 0x60 + xvld U4, S6, 0x00 + xvld U5, S6, 0x20 + xvld U6, S6, 0x40 + xvld U7, S6, 0x60 + + xvst U0, P1, 0x200 + xvst U1, P1, 0x220 + xvst U2, P1, 0x240 + xvst U3, P1, 0x260 + xvst U4, P1, 0x280 + xvst U5, P1, 0x2A0 + xvst U6, P1, 0x2C0 + xvst U7, P1, 0x2E0 + + xvld U0, S7, 0x00 + xvld U1, S7, 0x20 + xvld U2, S7, 0x40 + xvld U3, S7, 0x60 + xvld U4, S8, 0x00 + xvld U5, S8, 0x20 + xvld U6, S8, 0x40 + xvld U7, S8, 0x60 + + xvst U0, P1, 0x300 + xvst U1, P1, 0x320 + xvst U2, P1, 0x340 + xvst U3, P1, 0x360 + xvst U4, P1, 0x380 + xvst U5, P1, 0x3A0 + xvst U6, P1, 0x3C0 + xvst U7, P1, 0x3E0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d S5, S5, 0x80 + addi.d S6, S6, 0x80 + addi.d S7, S7, 0x80 + addi.d S8, S8, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, 
.L_I1 + +.L_N15: + andi I, N, 0x08 + beq ZERO, I, .L_N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S6, 0x00 + xvld U3, S6, 0x20 + xvld U4, S7, 0x00 + xvld U5, S7, 0x20 + xvld U6, S8, 0x00 + xvld U7, S8, 0x20 + + xvst U0, P2, 0x100 + xvst U1, P2, 0x120 + xvst U2, P2, 0x140 + xvst U3, P2, 0x160 + xvst U4, P2, 0x180 + xvst U5, P2, 0x1A0 + xvst U6, P2, 0x1C0 + xvst U7, P2, 0x1E0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d P2, P2, 0x200 + +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + xvst U4, P3, 0x80 + xvst U5, P3, 0xA0 + xvst U6, P3, 0xC0 + xvst U7, P3, 0xE0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P3, P3, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + xvpermi.q U4, U5, 0x02 + xvpermi.q U6, U7, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + xvst U4, P4, 0x40 + xvst U6, P4, 0x60 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P4, P4, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + fst.d F2, P5, 0x10 + fst.d F3, P5, 0x18 + fst.d F4, P5, 0x20 + fst.d F5, P5, 0x28 + fst.d F6, P5, 0x30 + fst.d F7, P5, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P5, P5, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x04 + beq ZERO, I, .L_4N15 + +.L_4I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 
0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 + +.L_4N15: + andi I, N, 0x08 + beq ZERO, I, .L_4N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d P2, P2, 0x100 + +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P3, P3, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P4, P4, 0x40 + +.L_4N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + fst.d F2, P5, 0x10 + fst.d F3, P5, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P5, P5, 0x20 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x04 + beq ZERO, I, .L_2N15 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 + +.L_2N15: + andi I, N, 0x08 + beq ZERO, I, .L_2N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d P2, P2, 0x80 + +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P3, P3, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpermi.q U0, U1, 0x02 + + xvst U0, P4, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P4, P4, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P5, P5, 0x10 + +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x04 + beq ZERO, I, .L_1N15 + +.L_1I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 
+ xvst U3, P1, 0x60 + + addi.d S1, S1, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N15: + andi I, N, 0x08 + beq ZERO, I, .L_1N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + + addi.d S1, S1, 0x40 + addi.d P2, P2, 0x40 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + xvld U0, S1, 0x00 + + xvst U0, P3, 0x00 + + addi.d S1, S1, 0x20 + addi.d P3, P3, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x10 + addi.d P4, P4, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P5, 0x00 + + addi.d S1, S1, 0x08 + addi.d P5, P5, 0x08 + +.L_M0: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + addi.d $sp, $sp, 56 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_4.S b/kernel/loongarch64/dgemm_tcopy_4.S new file mode 100644 index 000000000..700989ca1 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_4.S @@ -0,0 +1,270 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
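+
+Note on this file (illustrative): the 4-wide counterpart of
+dgemm_tcopy_16.S above, packing b[j * m + i * 4 + k] = a[i * lda + j + k]
+for each 4-column block, with the n & 2 and n & 1 leftovers based at
+b + m * (n & ~3) and b + m * (n & ~1) (P2 and P3 below).  One apparent
+typo worth flagging: the .L_2N1 remainder test below reads
+"addi.d I, N, 0x01" where every comparable path uses
+"andi I, N, 0x01"; with addi.d the branch to .L_M1 can never be taken.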
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define P0 $r16 +#define P1 $r17 +#define P2 $r18 +#define P3 $r19 +#define T0 $r20 +#define T1 $r23 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x02 + slli.d T0, T0, 0x02 + srai.d T1, N, 0x01 + slli.d T1, T1, 0x01 + mul.d T0, M, T0 + mul.d T1, M, T1 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x03 + add.d P2, DST, T0 + add.d P3, DST, T1 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x02 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x05 + beq ZERO, J, .L_M3 + +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x02 + addi.d J, J, -1 + beq ZERO, I, .L_N3 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + add.d P1, P1, T1 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + + xvst U0, P2, 0x00 + xvst U2, P2, 0x20 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P2, P2, 0x40 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + fst.d F2, P3, 0x10 + fst.d F3, P3, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P3, P3, 0x20 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_2N3 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_2I1 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpermi.q U0, U1, 0x02 + + xvst U0, P2, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P2, P2, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P3, P3, 0x10 + +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + move P1, P0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_1N3 + +.L_1I1: + xvld U0, S1, 0x00 + + xvst U0, P1, 0x00 + + addi.d S1, S1, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_1I1 + +.L_1N3: + andi I, N, 0x02 + beq I, ZERO, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P2,
0x00 + fst.d F1, P2, 0x08 + + addi.d S1, S1, 0x10 + addi.d P2, P2, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq I, ZERO, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P3, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/param.h b/param.h index 8dd2a7461..2dffaae3c 100644 --- a/param.h +++ b/param.h @@ -2852,35 +2852,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 #define SGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 16 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_P sgemm_p -#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_P 32 #define QGEMM_DEFAULT_P qgemm_p #define CGEMM_DEFAULT_P cgemm_p #define ZGEMM_DEFAULT_P zgemm_p #define XGEMM_DEFAULT_P xgemm_p #define SGEMM_DEFAULT_R sgemm_r -#define DGEMM_DEFAULT_R dgemm_r +#define DGEMM_DEFAULT_R 858 #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r #define XGEMM_DEFAULT_R xgemm_r #define SGEMM_DEFAULT_Q 128 -#define DGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 152 #define QGEMM_DEFAULT_Q 128 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 128 From e3c9947c0f4338abc437126283576b63a2203623 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Tue, 21 Dec 2021 11:19:27 +0100 Subject: [PATCH 595/681] prepare kernel for sve zgemm --- kernel/arm64/KERNEL.A64FX | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 80be4ddd0..04be0fab9 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -169,15 +169,24 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + From 07fe5b19a4957cafe3864e4af0296eb575a2e2f3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 12:31:54 +0100 Subject: [PATCH 596/681] typecast function pointers --- driver/others/blas_server.c | 40 
++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index fa07a1ea4..ec79075fe 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -209,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, - double *, BLASLONG, void *) = func; + double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], @@ -220,7 +221,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, - float *, BLASLONG, void *) = func; + float *, BLASLONG, void *) = (void (*) + (BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], @@ -232,7 +236,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / BFLOAT16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, - bfloat16 *, BLASLONG, void *) = func; + bfloat16 *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, bfloat16, + bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, + bfloat16 *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((bfloat16 *)args -> alpha)[0], @@ -243,7 +249,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / BLAS_STOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, bfloat16 *, BLASLONG, - float *, BLASLONG, void *) = func; + float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, bfloat16 *, BLASLONG, + float *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], @@ -254,7 +262,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / BLAS_DTOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, bfloat16 *, BLASLONG, - double *, BLASLONG, void *) = func; + double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, bfloat16 *, BLASLONG, + double *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], @@ -271,7 +281,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, - xdouble *, BLASLONG, void *) = func; + xdouble *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((xdouble *)args -> alpha)[0], @@ -285,7 +297,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, - double *, BLASLONG, void *) = func; + double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, + 
double *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], @@ -297,7 +311,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, - float *, BLASLONG, void *) = func; + float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], @@ -425,7 +441,7 @@ blas_queue_t *tscq; #endif if (queue) { - int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; + int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine; atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1); @@ -503,7 +519,7 @@ blas_queue_t *tscq; legacy_exec(routine, queue -> mode, queue -> args, sb); } else if (queue -> mode & BLAS_PTHREAD) { - void (*pthreadcompat)(void *) = queue -> routine; + void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine; (pthreadcompat)(queue -> args); } else (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); @@ -871,13 +887,13 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ fprintf(STDERR, "\n"); #endif - routine = queue -> routine; + routine = (int (*)(blas_arg_t *, void *, void *, double *, double *, BLASLONG))queue -> routine; if (queue -> mode & BLAS_LEGACY) { legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); } else if (queue -> mode & BLAS_PTHREAD) { - void (*pthreadcompat)(void *) = queue -> routine; + void (*pthreadcompat)(void *) = (void (*)(void*))queue -> routine; (pthreadcompat)(queue -> args); } else (routine)(queue -> args, queue -> range_m, queue -> range_n, From d1ee6ff73fca6eecfb679d2a91c39ce91e80231b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:45:28 +0100 Subject: [PATCH 597/681] fix function typecasts --- kernel/x86_64/dasum.c | 2 +- kernel/x86_64/ddot.c | 2 +- kernel/x86_64/drot.c | 2 +- kernel/x86_64/sasum.c | 2 +- kernel/x86_64/srot.c | 2 +- kernel/x86_64/zdot.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c index 8af9e798b..a9c40f38f 100644 --- a/kernel/x86_64/dasum.c +++ b/kernel/x86_64/dasum.c @@ -114,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #else mode = BLAS_DOUBLE | BLAS_REAL; #endif - blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 5d0c32234..f3b9ee701 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -190,7 +190,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #endif blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, result, 0, - ( void *)dot_thread_function, nthreads); + (int (*)(void)) dot_thread_function, nthreads); ptr = (RETURN_TYPE *)result; for (i = 0; i < nthreads; i++) { diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c index ab5048bd1..40c9cf19d 
100644 --- a/kernel/x86_64/drot.c +++ b/kernel/x86_64/drot.c @@ -196,7 +196,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT #else int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)(void))rot_thread_function, nthreads); } #else rot_compute(n, x, inc_x, y, inc_y, c, s); diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c index a021741c7..37a92468f 100644 --- a/kernel/x86_64/sasum.c +++ b/kernel/x86_64/sasum.c @@ -123,7 +123,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #else mode = BLAS_DOUBLE | BLAS_REAL; #endif - blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c index 587cf8e40..a49544616 100644 --- a/kernel/x86_64/srot.c +++ b/kernel/x86_64/srot.c @@ -198,7 +198,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT #else int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)(void))rot_thread_function, nthreads); } #else rot_compute(n, x, inc_x, y, inc_y, c, s); diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 50c8a2678..c52575d07 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -215,7 +215,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, result, 0, - ( void *)zdot_thread_function, nthreads); + (int (*)(void))zdot_thread_function, nthreads); ptr = (OPENBLAS_COMPLEX_FLOAT *)result; for (i = 0; i < nthreads; i++) { From 64365c919e63baaef31f5c52d39ae53d77a98c85 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:47:35 +0100 Subject: [PATCH 598/681] fix function typecasts --- interface/axpy.c | 2 +- interface/scal.c | 2 +- interface/zaxpy.c | 4 ++-- interface/zscal.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/interface/axpy.c b/interface/axpy.c index eaa19f4df..5304ebec3 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -115,7 +115,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc #endif blas_level1_thread(mode, n, 0, 0, &alpha, - x, incx, y, incy, NULL, 0, (void *)AXPYU_K, nthreads); + x, incx, y, incy, NULL, 0, (int (*)(void))AXPYU_K, nthreads); } #endif diff --git a/interface/scal.c b/interface/scal.c index 6d07b1650..0a7fee640 100644 --- a/interface/scal.c +++ b/interface/scal.c @@ -102,7 +102,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ #else &alpha, #endif - x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); + x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); } #endif diff --git a/interface/zaxpy.c b/interface/zaxpy.c index da3b48ead..0e168606d 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -128,9 +128,9 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, 
FLOAT *y, blasint in blas_level1_thread(mode, n, 0, 0, ALPHA, x, incx, y, incy, NULL, 0, #ifndef CONJ - (void *)AXPYU_K, + (int (*)(void))AXPYU_K, #else - (void *)AXPYC_K, + (int (*)(void))AXPYC_K, #endif nthreads); } diff --git a/interface/zscal.c b/interface/zscal.c index bfaddc260..498377343 100644 --- a/interface/zscal.c +++ b/interface/zscal.c @@ -108,7 +108,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ mode = BLAS_SINGLE | BLAS_COMPLEX; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); } #endif From c49d46f25f9c4f626f4a197b01bad749a9d5a7a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:49:18 +0100 Subject: [PATCH 599/681] fix function typecast --- lapack/getrf/getrf_parallel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index fc410b0e7..fed5c1de5 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -662,7 +662,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, - ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); + ipiv, 1, (int (*)(void))LASWP_PLUS, args -> nthreads); is += bk; } From aecb4a5e8daab1b50ae34636001dfdb234948765 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:50:22 +0100 Subject: [PATCH 600/681] fix function typecasts --- lapack/lauum/lauum_L_parallel.c | 4 ++-- lapack/lauum/lauum_U_parallel.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack/lauum/lauum_L_parallel.c b/lapack/lauum/lauum_L_parallel.c index 0ebe3f069..1b32e4519 100644 --- a/lapack/lauum/lauum_L_parallel.c +++ b/lapack/lauum/lauum_L_parallel.c @@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.c = a; syrk_thread(mode | BLAS_TRANSA_T | BLAS_TRANSB_N | BLAS_UPLO, - &newarg, NULL, NULL, (void *)HERK_LC, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_LC, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = i; @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i ) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (void *)TRMM_LCLN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRMM_LCLN, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; diff --git a/lapack/lauum/lauum_U_parallel.c b/lapack/lauum/lauum_U_parallel.c index 7214c9731..f5ea54c88 100644 --- a/lapack/lauum/lauum_U_parallel.c +++ b/lapack/lauum/lauum_U_parallel.c @@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.c = a; syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (void *)HERK_UN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_UN, sa, sb, args -> nthreads); newarg.m = i; newarg.n = bk; @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + ( i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_TRANSA_T | BLAS_RSIDE, - &newarg, NULL, NULL, (void *)TRMM_RCUN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRMM_RCUN, sa, sb, args -> nthreads); newarg.m = bk; 
newarg.n = bk; From 6b407a16cb089492d3ad1e2a1f5fdb71f4ffdd94 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:51:28 +0100 Subject: [PATCH 601/681] fix function typecasts --- lapack/potrf/potrf_L_parallel.c | 4 ++-- lapack/potrf/potrf_U_parallel.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c index 68ec8e22a..986816d1a 100644 --- a/lapack/potrf/potrf_L_parallel.c +++ b/lapack/potrf/potrf_L_parallel.c @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i + bk + i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRSM_RCLN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; @@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_LN, sa, sb, args -> nthreads); #endif } } diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c index 3b5d39511..cc6ff9912 100644 --- a/lapack/potrf/potrf_U_parallel.c +++ b/lapack/potrf/potrf_U_parallel.c @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRSM_LCUN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; @@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_UC, sa, sb, args -> nthreads); #endif } } From 9809931eb46c483ff3e6ab301a262eb879072450 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 21 Dec 2021 18:53:55 +0100 Subject: [PATCH 602/681] clean up unused variables and unreachable statements --- cpuid_x86.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 72e95214e..6466bd148 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -323,9 +323,11 @@ int get_vendor(void){ int get_cputype(int gettype){ int eax, ebx, ecx, edx; +/* int extend_family, family; int extend_model, model; int type, stepping; +*/ int feature = 0; cpuid(1, &eax, &ebx, &ecx, &edx); @@ -428,7 +430,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ cpuid(0, &cpuid_level, &ebx, &ecx, &edx); if (cpuid_level > 1) { - int numcalls =0 ; + int numcalls; + cpuid(2, &eax, &ebx, &ecx, &edx); numcalls = BITMASK(eax, 0, 0xff); //FIXME some systems may require repeated calls to read all entries info[ 0] = BITMASK(eax, 8, 0xff); @@ -1637,7 +1640,6 @@ int get_cpuname(void){ else return CPUTYPE_BARCELONA; } - break; case 10: // Zen3 if(support_avx()) #ifndef NO_AVX2 @@ -2193,7 +2195,6 @@ int get_coretype(void){ else return CORE_NEHALEM; #endif - break; case 7: if (model == 10) @@ -2582,4 +2583,4 @@ void get_sse(void){ if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n"); } -//} \ No newline at end of file +//} From 
2db0b2e4453b0a502cf336f6288688c23246d202 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Thu, 23 Dec 2021 20:04:27 +0800 Subject: [PATCH 603/681] Fixed MSA enabled optimization on Loongson-3A4000 --- cpuid_mips.c | 6 +++--- cpuid_mips64.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpuid_mips.c b/cpuid_mips.c index 1946455d8..d787e7120 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -165,7 +165,7 @@ void get_cpuconfig(void){ }else{ printf("#define UNKNOWN\n"); } - if (!get_feature(msa)) printf("#define NO_MSA\n"); + if (!get_feature("msa")) printf("#define NO_MSA\n"); } void get_libname(void){ @@ -193,7 +193,7 @@ int get_feature(char *search) while (fgets(buffer, sizeof(buffer), infile)) { - if (!strncmp("Features", buffer, 8)) + if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16)) { p = strchr(buffer, ':') + 2; break; @@ -207,7 +207,7 @@ int get_feature(char *search) t = strtok(p," "); while( t = strtok(NULL," ")) { - if (!strcmp(t, search)) { return(1); } + if (strstr(t, search)) { return(1); } } #endif diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 97743bc43..8753ee3f0 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -201,7 +201,7 @@ void get_cpuconfig(void){ printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); } - if (!get_feature(msa)) printf("#define NO_MSA\n"); + if (!get_feature("msa")) printf("#define NO_MSA\n"); } void get_libname(void){ @@ -233,7 +233,7 @@ int get_feature(char *search) while (fgets(buffer, sizeof(buffer), infile)) { - if (!strncmp("Features", buffer, 8)) + if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16)) { p = strchr(buffer, ':') + 2; break; @@ -247,7 +247,7 @@ int get_feature(char *search) t = strtok(p," "); while( t = strtok(NULL," ")) { - if (!strcmp(t, search)) { return(1); } + if (strstr(t, search)) { return(1); } } #endif From e9a0e52201282ee1caec67475307aa7717b2bc31 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 Dec 2021 20:00:50 +0100 Subject: [PATCH 604/681] fix function typecast --- kernel/x86_64/casum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c index a1bd76f33..60feec0ce 100644 --- a/kernel/x86_64/casum.c +++ b/kernel/x86_64/casum.c @@ -130,7 +130,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) mode = BLAS_DOUBLE | BLAS_COMPLEX; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, - NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); From 7b146e590c1d93c62cb0a7590a3ca287bcde52c6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 Dec 2021 20:01:52 +0100 Subject: [PATCH 605/681] fix function typecast --- kernel/x86_64/zasum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c index 6e758e2e3..80e95a2c8 100644 --- a/kernel/x86_64/zasum.c +++ b/kernel/x86_64/zasum.c @@ -130,7 +130,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) mode = BLAS_DOUBLE | BLAS_COMPLEX; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, - NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); From 683a7548bf34f610f5bdedfac5c1dac425c66a59 Mon Sep 17 
00:00:00 2001 From: Bine Brank Date: Sat, 25 Dec 2021 11:46:41 +0100 Subject: [PATCH 606/681] added macros for sve zgemm kernels --- kernel/arm64/zgemm_kernel_sve_v1x4.S | 1159 ++++++++++++++++++++++++++ 1 file changed, 1159 insertions(+) create mode 100644 kernel/arm64/zgemm_kernel_sve_v1x4.S diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S new file mode 100644 index 000000000..0fc966f8c --- /dev/null +++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S @@ -0,0 +1,1159 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define alphaR x19 +#define alphaI x20 + +#define alphaz_R z10.d +#define alphaz_I z11.d +#define alpha0_R d10 +#define alphaV0_R v10.d[0] +#define alpha0_I d11 +#define alphaV0_I v11.d[0] + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x4_I + ld2d {z0.d, z1.d}, p1/z, [pA] + ld2d {z2.d, z3.d}, p1/z, [pA, lanes, lsl #4] // next one + add pA, pA, lanes, lsl #5 // pA += lanes*2*2*8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] +#if 
defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.d, p1/m, z0.d, z9.d +#else + fmla z17.d, p1/m, z0.d, z9.d +#endif + OP_ii z16.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + + fmla z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.d, p1/m, z0.d, z11.d +#else + fmla z19.d, p1/m, z0.d, z11.d +#endif + ld1rd z11.d, p0/z, [pB, 24] + + + fmla z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.d, p1/m, z0.d, z13.d +#else + fmla z21.d, p1/m, z0.d, z13.d +#endif + OP_ii z20.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + + fmla z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.d, p1/m, z0.d, z15.d +#else + fmla z23.d, p1/m, z0.d, z15.d +#endif + OP_ii z22.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2d {z2.d, z3.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2d {z2.d, z3.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8 + + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.d, 
p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 64 + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, #32 + + ld2d {z26.d, z27.d}, p1/z, [pCRow0] + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld2d {z28.d, z29.d}, p1/z, [pCRow1] + fmla z28.d, p1/m, z20.d, alphaz_R + fmls z28.d, p1/m, z21.d, alphaz_I + fmla z29.d, p1/m, z20.d, alphaz_I + fmla z29.d, p1/m, z21.d, alphaz_R + st2d {z28.d, z29.d}, p1, [pCRow1] + + add pCRow1, pCRow1, #32 + + ld2d {z30.d, z31.d}, p1/z, [pCRow1] + fmla z30.d, p1/m, z22.d, alphaz_R + fmls z30.d, p1/m, z23.d, alphaz_I + fmla z31.d, p1/m, z22.d, alphaz_I + fmla z31.d, p1/m, z23.d, alphaz_R + st2d {z30.d, z31.d}, p1, [pCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 32 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+ + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, #32 + + ld2d {z26.d, z27.d}, p1/z, [pCRow0] + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, #32 + + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + fmov alphaI, d1 + + lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lzgemm_kernel_L2_BEGIN + +.Lzgemm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + + mov pA, origPA // pA = start of A array + +.Lzgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble .Lzgemm_kernel_L4_M2_BEGIN + + .align 5 +.Lzgemm_kernel_L4_M4_20: + + mov pB, origPB + asr counterL , origK, #3 + cmp counterL , #2 + blt .Lzgemm_kernel_L4_M4_32 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lzgemm_kernel_L4_M4_22a + + .align 5 +.Lzgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_M4_22 + + .align 5 +.Lzgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + 
KERNEL4x4_E + + b .Lzgemm_kernel_L4_M4_44 + + .align 5 +.Lzgemm_kernel_L4_M4_32: + + tst counterL, #1 + ble .Lzgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b .Lzgemm_kernel_L4_M4_44 + + +.Lzgemm_kernel_L4_M4_40: + + INIT4x4 + +.Lzgemm_kernel_L4_M4_44: + + ands counterL , origK, #7 + ble .Lzgemm_kernel_L4_M4_100 + + .align 5 +.Lzgemm_kernel_L4_M4_46: + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bne .Lzgemm_kernel_L4_M4_46 + +.Lzgemm_kernel_L4_M4_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVE4x4 + +.Lzgemm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne .Lzgemm_kernel_L4_M4_20 + +.Lzgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lzgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lzgemm_kernel_L4_M1_BEGIN + +.Lzgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lzgemm_kernel_L4_M2_40 + +.Lzgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_M2_22 + + +.Lzgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L4_M2_100 + +.Lzgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_M2_42 + +.Lzgemm_kernel_L4_M2_100: + + SAVE2x4 + +.Lzgemm_kernel_L4_M2_END: + + +.Lzgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lzgemm_kernel_L4_END + +.Lzgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lzgemm_kernel_L4_M1_40 + +.Lzgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_M1_22 + + +.Lzgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L4_M1_100 + +.Lzgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_M1_42 + +.Lzgemm_kernel_L4_M1_100: + + SAVE1x4 + + +.Lzgemm_kernel_L4_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + + subs counterJ, counterJ , #1 // j-- + bgt .Lzgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lzgemm_kernel_L999 + + tst counterJ , #2 + ble .Lzgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +.Lzgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble .Lzgemm_kernel_L2_M2_BEGIN + +.Lzgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lzgemm_kernel_L2_M4_40 + .align 5 + +.Lzgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M4_22 + + +.Lzgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble 
.Lzgemm_kernel_L2_M4_100 + +.Lzgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M4_42 + +.Lzgemm_kernel_L2_M4_100: + + SAVE4x2 + +.Lzgemm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt .Lzgemm_kernel_L2_M4_20 + + +.Lzgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lzgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lzgemm_kernel_L2_M1_BEGIN + +.Lzgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lzgemm_kernel_L2_M2_40 + +.Lzgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M2_22 + + +.Lzgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L2_M2_100 + +.Lzgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M2_42 + +.Lzgemm_kernel_L2_M2_100: + + SAVE2x2 + +.Lzgemm_kernel_L2_M2_END: + + +.Lzgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lzgemm_kernel_L2_END + +.Lzgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble .Lzgemm_kernel_L2_M1_40 + +.Lzgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M1_22 + + +.Lzgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L2_M1_100 + +.Lzgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_M1_42 + +.Lzgemm_kernel_L2_M1_100: + + SAVE1x2 + + +.Lzgemm_kernel_L2_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +.Lzgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lzgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + + + +.Lzgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble .Lzgemm_kernel_L1_M2_BEGIN + +.Lzgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lzgemm_kernel_L1_M4_40 + .align 5 + +.Lzgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M4_22 + + +.Lzgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L1_M4_100 + +.Lzgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M4_42 + +.Lzgemm_kernel_L1_M4_100: + + SAVE4x1 + +.Lzgemm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt .Lzgemm_kernel_L1_M4_20 + + +.Lzgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble .Lzgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble .Lzgemm_kernel_L1_M1_BEGIN + +.Lzgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble 
.Lzgemm_kernel_L1_M2_40 + +.Lzgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M2_22 + + +.Lzgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L1_M2_100 + +.Lzgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M2_42 + +.Lzgemm_kernel_L1_M2_100: + + SAVE2x1 + +.Lzgemm_kernel_L1_M2_END: + + +.Lzgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble .Lzgemm_kernel_L1_END + +.Lzgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lzgemm_kernel_L1_M1_40 + +.Lzgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M1_22 + + +.Lzgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L1_M1_100 + +.Lzgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_M1_42 + +.Lzgemm_kernel_L1_M1_100: + + SAVE1x1 + + +.Lzgemm_kernel_L1_END: + + +.Lzgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From 878064f39463631e0daf78395248083f1c8b251f Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 26 Dec 2021 08:44:05 +0100 Subject: [PATCH 607/681] sve zgemm kernel --- kernel/arm64/zgemm_kernel_sve_v1x4.S | 542 +++++++-------------------- 1 file changed, 131 insertions(+), 411 deletions(-) diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S index 0fc966f8c..1201d6dac 100644 --- a/kernel/arm64/zgemm_kernel_sve_v1x4.S +++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S @@ -48,6 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow2 x14 #define pCRow3 x15 #define pA x16 +#define lanes x17 + #define alphaR x19 #define alphaI x20 @@ -168,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x4_I ld2d {z0.d, z1.d}, p1/z, [pA] - ld2d {z2.d, z3.d}, p1/z, [pA, lanes, lsl #4] // next one + ld2d {z2.d, z3.d}, p1/z, [pA, #2, mul vl] // next one add pA, pA, lanes, lsl #5 // pA += lanes*2*2*8 ld1rd z8.d, p0/z, [pB] @@ -561,17 +563,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prfm PLDL1KEEP, [origPA] fmov alphaR, d0 + dup alphaz_R, alphaR fmov alphaI, d1 + dup alphaz_I, alphaI lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + ptrue p0.d // create true predicate mov pB, origPB +// Loop over N mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble .Lzgemm_kernel_L2_BEGIN +/******************************************************************************/ .Lzgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC @@ -582,204 +589,112 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mov pA, origPA // pA = start of A array -.Lzgemm_kernel_L4_M4_BEGIN: +.Lzgemm_kernel_L4_Mv1_BEGIN: - mov counterI, origM - asr counterI, counterI, #2 // counterI = counterI / 4 - cmp counterI, #0 - ble .Lzgemm_kernel_L4_M2_BEGIN +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension .align 5 -.Lzgemm_kernel_L4_M4_20: +.Lzgemm_kernel_L4_Mv1_20: mov pB, origPB + INITv1x4 // fill with zeros + asr counterL , origK, #3 cmp counterL , #2 - blt .Lzgemm_kernel_L4_M4_32 + blt .Lzgemm_kernel_L4_Mv1_32 - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 subs counterL, counterL, #2 // subtract 2 - ble .Lzgemm_kernel_L4_M4_22a + ble .Lzgemm_kernel_L4_Mv1_22a .align 5 -.Lzgemm_kernel_L4_M4_22: +.Lzgemm_kernel_L4_Mv1_22: - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L4_M4_22 + bgt .Lzgemm_kernel_L4_Mv1_22 .align 5 -.Lzgemm_kernel_L4_M4_22a: +.Lzgemm_kernel_L4_Mv1_22a: - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E - b .Lzgemm_kernel_L4_M4_44 + b .Lzgemm_kernel_L4_Mv1_44 .align 5 -.Lzgemm_kernel_L4_M4_32: +.Lzgemm_kernel_L4_Mv1_32: tst counterL, #1 - ble .Lzgemm_kernel_L4_M4_40 + ble .Lzgemm_kernel_L4_Mv1_40 - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E - b .Lzgemm_kernel_L4_M4_44 + b .Lzgemm_kernel_L4_Mv1_44 -.Lzgemm_kernel_L4_M4_40: +.Lzgemm_kernel_L4_Mv1_40: - INIT4x4 + INITv1x4 -.Lzgemm_kernel_L4_M4_44: +.Lzgemm_kernel_L4_Mv1_44: ands counterL , origK, #7 - ble .Lzgemm_kernel_L4_M4_100 + ble .Lzgemm_kernel_L4_Mv1_100 .align 5 -.Lzgemm_kernel_L4_M4_46: - KERNEL4x4_SUB +.Lzgemm_kernel_L4_Mv1_46: + KERNELv1x4_SUB subs counterL, counterL, #1 - bne .Lzgemm_kernel_L4_M4_46 + bne .Lzgemm_kernel_L4_Mv1_46 -.Lzgemm_kernel_L4_M4_100: +.Lzgemm_kernel_L4_Mv1_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] - SAVE4x4 - -.Lzgemm_kernel_L4_M4_END: - subs counterI, counterI, #1 - bne .Lzgemm_kernel_L4_M4_20 - -.Lzgemm_kernel_L4_M2_BEGIN: - - mov counterI, origM - tst counterI , #3 - ble .Lzgemm_kernel_L4_END - - tst counterI, #2 // counterI = counterI / 2 - ble .Lzgemm_kernel_L4_M1_BEGIN - -.Lzgemm_kernel_L4_M2_20: - - INIT2x4 - - mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL , #0 - ble .Lzgemm_kernel_L4_M2_40 - -.Lzgemm_kernel_L4_M2_22: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L4_M2_22 - - -.Lzgemm_kernel_L4_M2_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble 
.Lzgemm_kernel_L4_M2_100 - -.Lzgemm_kernel_L4_M2_42: - - KERNEL2x4_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L4_M2_42 - -.Lzgemm_kernel_L4_M2_100: - - SAVE2x4 - -.Lzgemm_kernel_L4_M2_END: - - -.Lzgemm_kernel_L4_M1_BEGIN: - - tst counterI, #1 // counterI = counterI % 2 - ble .Lzgemm_kernel_L4_END - -.Lzgemm_kernel_L4_M1_20: - - INIT1x4 - - mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL , #0 - ble .Lzgemm_kernel_L4_M1_40 - -.Lzgemm_kernel_L4_M1_22: - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L4_M1_22 - - -.Lzgemm_kernel_L4_M1_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L4_M1_100 - -.Lzgemm_kernel_L4_M1_42: + SAVEv1x4 - KERNEL1x4_SUB +.Lzgemm_kernel_L4_Mv1_END: - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L4_M1_42 - -.Lzgemm_kernel_L4_M1_100: + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Lzgemm_kernel_L4_Mv1_20 - SAVE1x4 .Lzgemm_kernel_L4_END: @@ -810,157 +725,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -.Lzgemm_kernel_L2_M4_BEGIN: +.Lzgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d - mov counterI, origM - asr counterI, counterI, #2 // counterI = counterI / 4 - cmp counterI,#0 - ble .Lzgemm_kernel_L2_M2_BEGIN -.Lzgemm_kernel_L2_M4_20: +.Lzgemm_kernel_L2_Mv1_20: - INIT4x2 + INITv1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble .Lzgemm_kernel_L2_M4_40 + ble .Lzgemm_kernel_L2_Mv1_40 .align 5 -.Lzgemm_kernel_L2_M4_22: - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB +.Lzgemm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M4_22 + bgt .Lzgemm_kernel_L2_Mv1_22 -.Lzgemm_kernel_L2_M4_40: +.Lzgemm_kernel_L2_Mv1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L2_M4_100 - -.Lzgemm_kernel_L2_M4_42: - - KERNEL4x2_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M4_42 - -.Lzgemm_kernel_L2_M4_100: - - SAVE4x2 + ble .Lzgemm_kernel_L2_Mv1_100 -.Lzgemm_kernel_L2_M4_END: +.Lzgemm_kernel_L2_Mv1_42: - subs counterI, counterI, #1 - bgt .Lzgemm_kernel_L2_M4_20 - - -.Lzgemm_kernel_L2_M2_BEGIN: - - mov counterI, origM - tst counterI , #3 - ble .Lzgemm_kernel_L2_END - - tst counterI, #2 // counterI = counterI / 2 - ble .Lzgemm_kernel_L2_M1_BEGIN - -.Lzgemm_kernel_L2_M2_20: - - INIT2x2 - - mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL,#0 - ble .Lzgemm_kernel_L2_M2_40 - -.Lzgemm_kernel_L2_M2_22: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB + KERNELv1x2_SUB subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M2_22 + bgt .Lzgemm_kernel_L2_Mv1_42 +.Lzgemm_kernel_L2_Mv1_100: -.Lzgemm_kernel_L2_M2_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L2_M2_100 - -.Lzgemm_kernel_L2_M2_42: - - KERNEL2x2_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M2_42 + SAVEv1x2 -.Lzgemm_kernel_L2_M2_100: 
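+/* End-of-sweep bookkeeping used by every Mv1 loop: incd bumps counterI by
+   the vector length in doubles, whilelt rebuilds p1 against origM, and
+   b.any loops back while any lane remains active, so the partial final
+   sweep reuses the full kernel body. */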
+.Lzgemm_kernel_L2_Mv1_END: - SAVE2x2 -.Lzgemm_kernel_L2_M2_END: - - -.Lzgemm_kernel_L2_M1_BEGIN: - - tst counterI, #1 // counterI = counterI % 2 - ble .Lzgemm_kernel_L2_END - -.Lzgemm_kernel_L2_M1_20: - - INIT1x2 - - mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL, #0 - ble .Lzgemm_kernel_L2_M1_40 - -.Lzgemm_kernel_L2_M1_22: - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M1_22 - - -.Lzgemm_kernel_L2_M1_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L2_M1_100 - -.Lzgemm_kernel_L2_M1_42: - - KERNEL1x2_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L2_M1_42 - -.Lzgemm_kernel_L2_M1_100: - - SAVE1x2 + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lzgemm_kernel_L2_Mv1_20 .Lzgemm_kernel_L2_END: @@ -981,163 +800,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov pA, origPA // pA = A +.Lzgemm_kernel_L1_Mv1_BEGIN: + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d -.Lzgemm_kernel_L1_M4_BEGIN: - mov counterI, origM - asr counterI, counterI, #2 // counterI = counterI / 4 - cmp counterI, #0 - ble .Lzgemm_kernel_L1_M2_BEGIN +.Lzgemm_kernel_L1_Mv1_20: -.Lzgemm_kernel_L1_M4_20: - - INIT4x1 + INITv1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble .Lzgemm_kernel_L1_M4_40 + ble .Lzgemm_kernel_L1_Mv1_40 .align 5 -.Lzgemm_kernel_L1_M4_22: - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB +.Lzgemm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M4_22 + bgt .Lzgemm_kernel_L1_Mv1_22 -.Lzgemm_kernel_L1_M4_40: +.Lzgemm_kernel_L1_Mv1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L1_M4_100 + ble .Lzgemm_kernel_L1_Mv1_100 -.Lzgemm_kernel_L1_M4_42: +.Lzgemm_kernel_L1_Mv1_42: - KERNEL4x1_SUB + KERNELv1x1_SUB subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M4_42 - -.Lzgemm_kernel_L1_M4_100: - - SAVE4x1 - -.Lzgemm_kernel_L1_M4_END: - - subs counterI, counterI, #1 - bgt .Lzgemm_kernel_L1_M4_20 - + bgt .Lzgemm_kernel_L1_Mv1_42 -.Lzgemm_kernel_L1_M2_BEGIN: +.Lzgemm_kernel_L1_Mv1_100: - mov counterI, origM - tst counterI , #3 - ble .Lzgemm_kernel_L1_END + SAVEv1x1 - tst counterI, #2 // counterI = counterI / 2 - ble .Lzgemm_kernel_L1_M1_BEGIN - -.Lzgemm_kernel_L1_M2_20: - - INIT2x1 - - mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL , #0 - ble .Lzgemm_kernel_L1_M2_40 - -.Lzgemm_kernel_L1_M2_22: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M2_22 - - -.Lzgemm_kernel_L1_M2_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L1_M2_100 - -.Lzgemm_kernel_L1_M2_42: - - KERNEL2x1_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M2_42 - -.Lzgemm_kernel_L1_M2_100: - - SAVE2x1 - -.Lzgemm_kernel_L1_M2_END: - - -.Lzgemm_kernel_L1_M1_BEGIN: - - tst counterI, #1 // counterI = counterI % 2 - ble .Lzgemm_kernel_L1_END - -.Lzgemm_kernel_L1_M1_20: - - INIT1x1 - - mov pB, origPB - 
asr counterL , origK, #3 // counterL = counterL / 8 - cmp counterL , #0 - ble .Lzgemm_kernel_L1_M1_40 - -.Lzgemm_kernel_L1_M1_22: - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M1_22 - - -.Lzgemm_kernel_L1_M1_40: - - ands counterL , origK, #7 // counterL = counterL % 8 - ble .Lzgemm_kernel_L1_M1_100 - -.Lzgemm_kernel_L1_M1_42: - - KERNEL1x1_SUB - - subs counterL, counterL, #1 - bgt .Lzgemm_kernel_L1_M1_42 - -.Lzgemm_kernel_L1_M1_100: - - SAVE1x1 +.Lzgemm_kernel_L1_Mv1_END: + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lzgemm_kernel_L1_Mv1_20 .Lzgemm_kernel_L1_END: +/******************************************************************************/ .Lzgemm_kernel_L999: mov x0, #0 // set return value From 6ec4aab8754b4c0fa5a6dd359fe56ee755e04ee3 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 26 Dec 2021 17:05:46 +0100 Subject: [PATCH 608/681] zgemm sve copy routines --- kernel/arm64/zgemm_ncopy_sve_v1.c | 80 +++++++++++++++++++++++++++++++ kernel/arm64/zgemm_tcopy_sve_v1.c | 77 +++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 kernel/arm64/zgemm_ncopy_sve_v1.c create mode 100644 kernel/arm64/zgemm_tcopy_sve_v1.c diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..be18e9708 --- /dev/null +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -0,0 +1,80 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint64_t lda_vec = svindex_s64(0LL, lda * 2); + uint64_t sve_size = svcntd(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec_real = svld1_gather_index(pg, (double *) aoffset1, lda_vec); + svfloat64_t a_vec_imag = svld1_gather_index(pg, ((double *) aoffset1) + 1, lda_vec); + svst2_f64(pg, (double *) boffset, svcreate2(a_vec_real, a_vec_imag)); + aoffset1 += 2; + boffset += active; + } + aoffset += sve_size * lda * 2; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c new file mode 100644 index 000000000..085e1fa40 --- /dev/null +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -0,0 +1,77 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + uint64_t sve_size = svcntd(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64x2_t a_vec = svld2(pg, (double *)aoffset1); + svst2_f64(pg, (double *) boffset, a_vec); + aoffset1 += lda * 2; + boffset += active * 2; + } + aoffset += sve_size * 2; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} From 6cae44d4f7afc6352a6521e717eff80f0220aded Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Dec 2021 19:06:55 +0100 Subject: [PATCH 609/681] Ensure that the right xerbla gets included in OSX DYNAMIC_ARCH builds --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 913017c63..decd8cc2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -251,12 +251,14 @@ if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS) set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) set (CMAKE_Fortran_CREATE_SHARED_LIBRARY "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " + "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") else () set (CMAKE_C_CREATE_SHARED_LIBRARY "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " + "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") endif () endif() From 40b14e4957b9a5d9bbda30fc10aeeba485755f3c Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 29 Dec 2021 11:42:04 +0100 Subject: [PATCH 610/681] fix zgemm kernel --- kernel/arm64/zgemm_kernel_sve_v1x4.S | 59 +++++++++++++--------------- kernel/arm64/zgemm_ncopy_sve_v1.c | 2 +- kernel/arm64/zgemm_tcopy_sve_v1.c | 2 +- 3 files changed, 29 insertions(+), 34 deletions(-) diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S index 1201d6dac..d5b35775c 100644 --- a/kernel/arm64/zgemm_kernel_sve_v1x4.S +++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S @@ -53,12 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
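+// alpha is kept in z6/z7 rather than z10/z11: the KERNELv1x4 macros
+// broadcast B values into z8-z15, so an alpha register in that range
+// would be clobbered before the SAVE macros consume it.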
#define alphaR x19 #define alphaI x20 -#define alphaz_R z10.d -#define alphaz_I z11.d -#define alpha0_R d10 -#define alphaV0_R v10.d[0] -#define alpha0_I d11 -#define alphaV0_I v11.d[0] +#define alphaz_R z6.d +#define alphaz_I z7.d +#define alpha0_R d6 +#define alpha0_I d7 #define A_PRE_SIZE 2560 @@ -170,8 +168,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x4_I ld2d {z0.d, z1.d}, p1/z, [pA] - ld2d {z2.d, z3.d}, p1/z, [pA, #2, mul vl] // next one - add pA, pA, lanes, lsl #5 // pA += lanes*2*2*8 + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + ld2d {z2.d, z3.d}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 ld1rd z8.d, p0/z, [pB] ld1rd z9.d, p0/z, [pB, 8] @@ -283,7 +282,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNELv1x4_M2 - ld2d {z2.d, z3.d}, p1/z, [pA] + ld2d {z0.d, z1.d}, p1/z, [pA] add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8 OP_rr z16.d, p1/m, z2.d, z8.d @@ -396,39 +395,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmls z24.d, p1/m, z17.d, alphaz_I fmla z25.d, p1/m, z16.d, alphaz_I fmla z25.d, p1/m, z17.d, alphaz_R - st2d {z25.d, z26.d}, p1, [pCRow0] + st2d {z24.d, z25.d}, p1, [pCRow0] - add pCRow0, pCRow0, #32 + add pCRow0, pCRow0, lanes, lsl #4 - ld2d {z26.d, z27.d}, p1/z, [pCRow0] + ld2d {z26.d, z27.d}, p1/z, [pCRow1] fmla z26.d, p1/m, z18.d, alphaz_R fmls z26.d, p1/m, z19.d, alphaz_I fmla z27.d, p1/m, z18.d, alphaz_I fmla z27.d, p1/m, z19.d, alphaz_R - st2d {z26.d, z27.d}, p1, [pCRow0] + st2d {z26.d, z27.d}, p1, [pCRow1] - add pCRow0, pCRow0, #32 + add pCRow1, pCRow1, lanes, lsl #4 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - ld2d {z28.d, z29.d}, p1/z, [pCRow1] + ld2d {z28.d, z29.d}, p1/z, [pCRow2] fmla z28.d, p1/m, z20.d, alphaz_R fmls z28.d, p1/m, z21.d, alphaz_I fmla z29.d, p1/m, z20.d, alphaz_I fmla z29.d, p1/m, z21.d, alphaz_R - st2d {z28.d, z29.d}, p1, [pCRow1] + st2d {z28.d, z29.d}, p1, [pCRow2] - add pCRow1, pCRow1, #32 + add pCRow2, pCRow2, lanes, lsl #4 - ld2d {z30.d, z31.d}, p1/z, [pCRow1] + ld2d {z30.d, z31.d}, p1/z, [pCRow3] fmla z30.d, p1/m, z22.d, alphaz_R fmls z30.d, p1/m, z23.d, alphaz_I fmla z31.d, p1/m, z22.d, alphaz_I fmla z31.d, p1/m, z23.d, alphaz_R - st2d {z30.d, z31.d}, p1, [pCRow1] + st2d {z30.d, z31.d}, p1, [pCRow3] - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] @@ -474,24 +473,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
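+// The SAVE macros apply complex alpha = (aR, aI) to each accumulated
+// product p = (pR, pI) before the interleaved st2d store:
+//   cR += aR*pR - aI*pI   (fmla with alphaz_R, fmls with alphaz_I)
+//   cI += aI*pR + aR*pI   (fmla with alphaz_I, fmla with alphaz_R)
+// Each row pointer then advances by lanes << 4 bytes: one (re, im)
+// double pair, 16 bytes, per active lane.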
fmls z24.d, p1/m, z17.d, alphaz_I fmla z25.d, p1/m, z16.d, alphaz_I fmla z25.d, p1/m, z17.d, alphaz_R - st2d {z25.d, z26.d}, p1, [pCRow0] + st2d {z24.d, z25.d}, p1, [pCRow0] - add pCRow0, pCRow0, #32 + add pCRow0, pCRow0, lanes, lsl #4 - ld2d {z26.d, z27.d}, p1/z, [pCRow0] + ld2d {z26.d, z27.d}, p1/z, [pCRow1] fmla z26.d, p1/m, z18.d, alphaz_R fmls z26.d, p1/m, z19.d, alphaz_I fmla z27.d, p1/m, z18.d, alphaz_I fmla z27.d, p1/m, z19.d, alphaz_R - st2d {z26.d, z27.d}, p1, [pCRow0] + st2d {z26.d, z27.d}, p1, [pCRow1] - add pCRow0, pCRow0, #32 + add pCRow1, pCRow1, lanes, lsl #4 prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 - .endm /******************************************************************************/ @@ -526,10 +523,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmls z24.d, p1/m, z17.d, alphaz_I fmla z25.d, p1/m, z16.d, alphaz_I fmla z25.d, p1/m, z17.d, alphaz_R - st2d {z25.d, z26.d}, p1, [pCRow0] - - add pCRow0, pCRow0, #32 - + st2d {z24.d, z25.d}, p1, [pCRow0] add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 @@ -718,6 +712,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble .Lzgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC add pC,pC,LDC, lsl #1 diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c index be18e9708..57035f4ff 100644 --- a/kernel/arm64/zgemm_ncopy_sve_v1.c +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -65,7 +65,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ svfloat64_t a_vec_imag = svld1_gather_index(pg, ((double *) aoffset1) + 1, lda_vec); svst2_f64(pg, (double *) boffset, svcreate2(a_vec_real, a_vec_imag)); aoffset1 += 2; - boffset += active; + boffset += active * 2; } aoffset += sve_size * lda * 2; diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c index 085e1fa40..32f217d7a 100644 --- a/kernel/arm64/zgemm_tcopy_sve_v1.c +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -65,7 +65,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset1 += lda * 2; boffset += active * 2; } - aoffset += sve_size * 2; + aoffset += active * 2; j += svcntd(); pg = svwhilelt_b64(j, n); From ea3db69faa99dc4f7ad6641c9590f77ace7d6b03 Mon Sep 17 00:00:00 2001 From: jgillis Date: Wed, 29 Dec 2021 22:50:20 +0100 Subject: [PATCH 611/681] Fix cmake crosscompilation for core2 target Missing HAVE_SSE* cmake variables cause cc.cmake to forget about `-msse*` flags --- cmake/prebuild.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 259d9c738..232a6cc35 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -127,6 +127,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define DLOCAL_BUFFER_SIZE\t16384\n" "#define CLOCAL_BUFFER_SIZE\t16384\n" "#define ZLOCAL_BUFFER_SIZE\t16384\n") + set(HAVE_SSE 1) + set(HAVE_SSE2 1) + set(HAVE_SSE3 1) + set(HAVE_SSSE3 1) set(SGEMM_UNROLL_M 8) set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 4) From f7b69128680323ae30ff5992c2ea9f7cc8db8973 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Thu, 30 Dec 2021 21:00:16 +0100 Subject: [PATCH 612/681] ztrmm sve copy kernels --- kernel/arm64/ztrmm_lncopy_sve_v1.c | 145 +++++++++++++++++++++++++++++ kernel/arm64/ztrmm_ltcopy_sve_v1.c | 143 ++++++++++++++++++++++++++++ kernel/arm64/ztrmm_uncopy_sve_v1.c | 145 
+++++++++++++++++++++++++++++ kernel/arm64/ztrmm_utcopy_sve_v1.c | 141 ++++++++++++++++++++++++++++ 4 files changed, 574 insertions(+) create mode 100644 kernel/arm64/ztrmm_lncopy_sve_v1.c create mode 100644 kernel/arm64/ztrmm_ltcopy_sve_v1.c create mode 100644 kernel/arm64/ztrmm_uncopy_sve_v1.c create mode 100644 kernel/arm64/ztrmm_utcopy_sve_v1.c diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c new file mode 100644 index 000000000..19c34ff41 --- /dev/null +++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c @@ -0,0 +1,145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda*2); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda*2); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY * 2 + posX * lda; + } else { + ao = a + posX * 2 + posY * lda; + } + + i = 0; + do + { + if (X > posY) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X < posY) { + ao += lda * 2; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+k*lda+j); + b[temp++] = *(ao+k*lda+j+1); + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+k*lda+j); + b[temp++] = *(ao+k*lda+j+1); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#endif + ao += n_active * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c new file mode 100644 index 000000000..c272db602 --- /dev/null +++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c @@ -0,0 +1,143 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + FLOAT *ao; + js = 0; +#ifdef DOUBLE + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY * 2 + posX * lda; + } else { + ao = a + posX * 2 + posY * lda; + } + + i = 0; + do + { + if (X > posY) { + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X < posY) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + ao += lda * 2; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k); + b[temp++] = *(ao+j*lda+k+1); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k); + b[temp++] = *(ao+j*lda+k+1); + } + } +#endif + ao += n_active * lda * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + + return 0; +} diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c new file mode 100644 index 000000000..aaa217063 --- /dev/null +++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c @@ -0,0 +1,145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda * 2); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda * 2); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX * 2 + posY * lda; + } else { + ao = a + posY * 2 + posX * lda; + } + + i = 0; + do + { + if (X < posY) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X > posY) { + ao += lda * 2; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. 
*/ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j); + b[temp++] = *(ao+k*lda+j+1); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j); + b[temp++] = *(ao+k*lda+j+1); + } + } +#endif + ao += n_active * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c new file mode 100644 index 000000000..c3e1f1b42 --- /dev/null +++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c @@ -0,0 +1,141 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + FLOAT *ao; + js = 0; +#ifdef DOUBLE + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX * 2 + posY * lda; + } else { + ao = a + posY * 2 + posX * lda; + } + + i = 0; + do + { + if (X < posY) { + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X > posY) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + ao += lda * 2; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+j*lda+k); + b[temp++] = *(ao+j*lda+k+1); + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+j*lda+k); + b[temp++] = *(ao+j*lda+k+1); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#endif + ao += n_active * lda * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} From b329e45288c2e7fc0ef15c4e8a7b3c8dfd74a930 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 Jan 2022 00:46:23 +0100 Subject: [PATCH 613/681] Guard against omp_get_num_places returning zero --- driver/others/memory.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index bd0553ca9..0f4cbb24d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -232,11 +232,11 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; - + int ret; #if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; - int ret; + #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) @@ -249,7 +249,8 @@ int get_num_procs(void) { #if defined(USE_OPENMP) #if _OPENMP >= 201511 - nums = omp_get_num_places(); + ret = omp_get_num_places(); + if (ret >0 ) nums = ret; #endif return nums; #endif @@ -1800,11 +1801,12 @@ int get_num_procs(void); int get_num_procs(void) { static int nums = 0; - + int ret; + #if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; - int ret; + #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) @@ -1818,7 +1820,8 @@ int get_num_procs(void) { #if defined(USE_OPENMP) /* if (omp_get_proc_bind() != omp_proc_bind_false) */ #if _OPENMP >= 201511 - nums = omp_get_num_places(); + ret = omp_get_num_places(); + if (ret >0 ) nums = ret; #endif return nums; #endif From 0140373802db2d910baa92bc7b31dba076fc205b Mon Sep 17 00:00:00 2001 From: 
Bine Brank Date: Sun, 2 Jan 2022 19:15:33 +0100 Subject: [PATCH 614/681] add sve ztrmm --- kernel/Makefile.L3 | 32 + kernel/arm64/KERNEL.A64FX | 12 +- kernel/arm64/ztrmm_kernel_sve_v1x4.S | 1006 ++++++++++++++++++++++++++ 3 files changed, 1044 insertions(+), 6 deletions(-) create mode 100644 kernel/arm64/ztrmm_kernel_sve_v1x4.S diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index d22bd46a5..da279b185 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1739,29 +1739,61 @@ $(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_ $(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRMMUNCOPY_M +$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRMMLNCOPY_M +$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef ZTRMMUTCOPY_M +$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRMMLTCOPY_M +$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ 
$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 04be0fab9..986b7ab47 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -182,11 +182,11 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c -DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c -DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c -DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c +ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c -DSYMMUCOPY_M = symm_ucopy_sve.c -DSYMMLCOPY_M = symm_lcopy_sve.c +ZSYMMUCOPY_M = symm_ucopy_sve.c +ZSYMMLCOPY_M = symm_lcopy_sve.c diff --git a/kernel/arm64/ztrmm_kernel_sve_v1x4.S b/kernel/arm64/ztrmm_kernel_sve_v1x4.S new file mode 100644 index 000000000..1a81b4da0 --- /dev/null +++ b/kernel/arm64/ztrmm_kernel_sve_v1x4.S @@ -0,0 +1,1006 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR x19 +#define alphaI x20 +#define temp x21 +#define tempOffset x22 +#define tempK x23 + +#define alphaz_R z6.d +#define alphaz_I z7.d +#define alpha0_R d6 +#define alpha0_I d7 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x4_I + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + ld2d {z2.d, z3.d}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, 
p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.d, p1/m, z0.d, z9.d +#else + fmla z17.d, p1/m, z0.d, z9.d +#endif + OP_ii z16.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + + fmla z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.d, p1/m, z0.d, z11.d +#else + fmla z19.d, p1/m, z0.d, z11.d +#endif + ld1rd z11.d, p0/z, [pB, 24] + + + fmla z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.d, p1/m, z0.d, z13.d +#else + fmla z21.d, p1/m, z0.d, z13.d +#endif + OP_ii z20.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + + fmla z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.d, p1/m, z0.d, z15.d +#else + fmla z23.d, p1/m, z0.d, z15.d +#endif + OP_ii z22.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2d {z2.d, z3.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8 + + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pB, 
#B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 64 + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + eor z28.d, z16.d, z16.d + eor z29.d, z16.d, z16.d + fmla z28.d, p1/m, z20.d, alphaz_R + fmls z28.d, p1/m, z21.d, alphaz_I + fmla z29.d, p1/m, z20.d, alphaz_I + fmla z29.d, p1/m, z21.d, alphaz_R + st2d {z28.d, z29.d}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #4 + + eor z30.d, z16.d, z16.d + eor z31.d, z16.d, z16.d + fmla z30.d, p1/m, z22.d, alphaz_R + fmls z30.d, p1/m, z23.d, alphaz_I + fmla z31.d, p1/m, z22.d, alphaz_I + fmla z31.d, p1/m, z23.d, alphaz_R + st2d {z30.d, z31.d}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + 
OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 32 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + dup alphaz_R, alphaR + fmov alphaI, d1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + ptrue p0.d // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lztrmm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lztrmm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +.Lztrmm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
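Here whilelt gives a full predicate while at least a vector's worth of
rows remains and a partial predicate for the final pass, so no scalar
tail loop is needed.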
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lztrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 + cmp counterL , #2 + blt .Lztrmm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lztrmm_kernel_L4_Mv1_22a + + .align 5 +.Lztrmm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L4_Mv1_22 + + .align 5 +.Lztrmm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lztrmm_kernel_L4_Mv1_44 + + .align 5 +.Lztrmm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lztrmm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lztrmm_kernel_L4_Mv1_44 + + +.Lztrmm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lztrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Lztrmm_kernel_L4_Mv1_100 + + .align 5 +.Lztrmm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lztrmm_kernel_L4_Mv1_46 + +.Lztrmm_kernel_L4_Mv1_100: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lztrmm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Lztrmm_kernel_L4_Mv1_20 + + + +.Lztrmm_kernel_L4_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lztrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lztrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lztrmm_kernel_L999 + + tst counterJ , #2 + ble .Lztrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + + + +.Lztrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lztrmm_kernel_L2_Mv1_20: + + 
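// One SVE vector of M rows against the two remaining columns of B.
// As in the v1x4 block, tempK below is the effective K range of this
// triangular panel: origK - tempOffset, tempOffset + lanes (LEFT), or
// tempOffset + 2 (RIGHT), depending on side and transposition.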
INITv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lztrmm_kernel_L2_Mv1_40 + .align 5 + +.Lztrmm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L2_Mv1_22 + + +.Lztrmm_kernel_L2_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lztrmm_kernel_L2_Mv1_100 + +.Lztrmm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L2_Mv1_42 + +.Lztrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +.Lztrmm_kernel_L2_Mv1_END: + + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lztrmm_kernel_L2_Mv1_20 + + +.Lztrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +.Lztrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lztrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +.Lztrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lztrmm_kernel_L1_Mv1_20: + + INITv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , temp, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lztrmm_kernel_L1_Mv1_40 + .align 5 + +.Lztrmm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L1_Mv1_22 + + +.Lztrmm_kernel_L1_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lztrmm_kernel_L1_Mv1_100 + +.Lztrmm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L1_Mv1_42 + +.Lztrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if 
defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +.Lztrmm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lztrmm_kernel_L1_Mv1_20 + +.Lztrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lztrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From ce329ab6869bd958cde05c1dcd39ce7c6bc02cd9 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 3 Jan 2022 15:56:05 +0100 Subject: [PATCH 615/681] add sve zhemm copy routines --- kernel/arm64/KERNEL.A64FX | 4 +- kernel/arm64/zhemm_ltcopy_sve.c | 106 +++++++++++++++++++++++++++++++ kernel/arm64/zhemm_utcopy_sve.c | 107 ++++++++++++++++++++++++++++++++ 3 files changed, 215 insertions(+), 2 deletions(-) create mode 100644 kernel/arm64/zhemm_ltcopy_sve.c create mode 100644 kernel/arm64/zhemm_utcopy_sve.c diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 986b7ab47..ff5d3aa0e 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -187,6 +187,6 @@ ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c -ZSYMMUCOPY_M = symm_ucopy_sve.c -ZSYMMLCOPY_M = symm_lcopy_sve.c +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c new file mode 100644 index 000000000..58e9ff589 --- /dev/null +++ b/kernel/arm64/zhemm_ltcopy_sve.c @@ -0,0 +1,106 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG offset, i; + + lda *= 2; + + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint64_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b64(offset, 0LL); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c new file mode 100644 index 000000000..9ddbf6cbd --- /dev/null +++ b/kernel/arm64/zhemm_utcopy_sve.c @@ -0,0 +1,107 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG offset, i; + + lda *= 2; + + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, lda); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint64_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + data_vec_imag = svneg_z(pg, data_vec_imag); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b64(offset, 0LL); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} From 68c414d3a6d9af7f8a686868feeddcd237977b05 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Tue, 4 Jan 2022 14:40:59 +0100 Subject: [PATCH 616/681] ztrmm sve copy functions --- kernel/arm64/ztrmm_lncopy_sve_v1.c | 14 
+++++++------- kernel/arm64/ztrmm_ltcopy_sve_v1.c | 12 ++++++------ kernel/arm64/ztrmm_uncopy_sve_v1.c | 14 +++++++------- kernel/arm64/ztrmm_utcopy_sve_v1.c | 12 ++++++------ 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c index 19c34ff41..d34f607ab 100644 --- a/kernel/arm64/ztrmm_lncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c @@ -53,11 +53,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON js = 0; FLOAT *ao; #ifdef DOUBLE - svint64_t index = svindex_s64(0LL, lda*2); + svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svint32_t index = svindex_s32(0, lda*2); + svint32_t index = svindex_s32(0, lda); svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif @@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i ++; } else if (X < posY) { - ao += lda * 2; + ao += lda; b += n_active * 2; X ++; i ++; @@ -99,8 +99,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int temp = 0; for (int j = 0; j < n_active; j++) { for (int k = 0 ; k < j; k++) { - b[temp++] = *(ao+k*lda+j); - b[temp++] = *(ao+k*lda+j+1); + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); } b[temp++] = ONE; b[temp++] = ZERO; @@ -113,8 +113,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int temp = 0; for (int j = 0; j < n_active; j++) { for (int k = 0 ; k <= j; k++) { - b[temp++] = *(ao+k*lda+j); - b[temp++] = *(ao+k*lda+j+1); + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); } for (int k = j+1; k < n_active; k++) { b[temp++] = ZERO; diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c index c272db602..7f34c9857 100644 --- a/kernel/arm64/ztrmm_ltcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c @@ -85,7 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svfloat32x2_t aj_vec = svld2(pn, ao); #endif svst2(pn, b, aj_vec); - ao += lda * 2; + ao += lda; b += n_active * 2; X ++; i ++; @@ -101,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[temp++] = ONE; b[temp++] = ZERO; for (int k = j+1; k < n_active; k++) { - b[temp++] = *(ao+j*lda+k); - b[temp++] = *(ao+j*lda+k+1); + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); } } #else @@ -113,12 +113,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[temp++] = ZERO; } for (int k = j; k < n_active; k++) { - b[temp++] = *(ao+j*lda+k); - b[temp++] = *(ao+j*lda+k+1); + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); } } #endif - ao += n_active * lda * 2; + ao += n_active * lda; b += n_active*n_active * 2; X += n_active; i += n_active; diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c index aaa217063..7eb9452c9 100644 --- a/kernel/arm64/ztrmm_uncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c @@ -53,11 +53,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON js = 0; FLOAT *ao; #ifdef DOUBLE - svint64_t index = svindex_s64(0LL, lda * 2); + svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svint32_t index = svindex_s32(0, lda * 2); + svint32_t index = svindex_s32(0, lda); 
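// lda here counts FLOAT elements (two per complex entry), so the
// gather index strides by lda directly and the scalar accesses below
// move to j*2 element offsets.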
svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif @@ -89,7 +89,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON i ++; } else if (X > posY) { - ao += lda * 2; + ao += lda; b += n_active * 2; X ++; i ++; @@ -105,8 +105,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[temp++] = ONE; b[temp++] = ZERO; for (int k = j+1; k < n_active; k++) { - b[temp++] = *(ao+k*lda+j); - b[temp++] = *(ao+k*lda+j+1); + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); } } #else @@ -117,8 +117,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b[temp++] = ZERO; } for (int k = j; k < n_active; k++) { - b[temp++] = *(ao+k*lda+j); - b[temp++] = *(ao+k*lda+j+1); + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); } } #endif diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c index c3e1f1b42..60c8ff3b4 100644 --- a/kernel/arm64/ztrmm_utcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c @@ -85,7 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svfloat32x2_t aj_vec = svld2(pn, ao); #endif svst2(pn, b, aj_vec); - ao += lda * 2; + ao += lda; b += n_active * 2; X ++; i ++; @@ -95,8 +95,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int temp = 0; for (int j = 0; j < n_active; j++) { for (int k = 0 ; k < j; k++) { - b[temp++] = *(ao+j*lda+k); - b[temp++] = *(ao+j*lda+k+1); + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); } b[temp++] = ONE; b[temp++] = ZERO; @@ -109,8 +109,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int temp = 0; for (int j = 0; j < n_active; j++) { for (int k = 0 ; k <= j; k++) { - b[temp++] = *(ao+j*lda+k); - b[temp++] = *(ao+j*lda+k+1); + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); } for (int k = j+1; k < n_active; k++) { b[temp++] = ZERO; @@ -118,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } #endif - ao += n_active * lda * 2; + ao += n_active * lda; b += n_active*n_active * 2; X += n_active; i += n_active; From 2e2c02b762afd67fe3cfb49620ab9df721f1a8ea Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Tue, 4 Jan 2022 14:42:07 +0100 Subject: [PATCH 617/681] fix sve ztrmm kernel --- kernel/arm64/ztrmm_kernel_sve_v1x4.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/ztrmm_kernel_sve_v1x4.S b/kernel/arm64/ztrmm_kernel_sve_v1x4.S index 1a81b4da0..b71a3d39e 100644 --- a/kernel/arm64/ztrmm_kernel_sve_v1x4.S +++ b/kernel/arm64/ztrmm_kernel_sve_v1x4.S @@ -723,7 +723,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pB, pB, temp #endif #if defined(LEFT) - add tempOffset, tempOffset, #4 + add tempOffset, tempOffset, lanes #endif prfm PLDL1KEEP, [pA] @@ -856,7 +856,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pB, pB, temp #endif #if defined(LEFT) - add tempOffset, tempOffset, #4 + add tempOffset, tempOffset, lanes #endif .Lztrmm_kernel_L2_Mv1_END: @@ -923,7 +923,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
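// The remaining hunks correct the L1 (N%2==1) path: the loop count is
// derived from tempK, the panel's adjusted K range, rather than the
// scratch register temp, and the LEFT-side offset advance steps by
// lanes instead of a fixed #4.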
add tempK, tempOffset, #1 #endif - asr counterL , temp, #3 // counterL = counterL / 8 + asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 ble .Lztrmm_kernel_L1_Mv1_40 .align 5 @@ -972,7 +972,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pB, pB, temp #endif #if defined(LEFT) - add tempOffset, tempOffset, #4 + add tempOffset, tempOffset, lanes #endif .Lztrmm_kernel_L1_Mv1_END: From 07fa6fa3b192f525f5bb8f36e7fc694095f53593 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 08:57:51 +0100 Subject: [PATCH 618/681] configure Makefile for sve --- kernel/Makefile.L3 | 84 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 6 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index da279b185..1c0931d96 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1691,29 +1691,61 @@ $(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N $(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef CTRMMUNCOPY_M +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif -$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c +ifdef CTRMMLNCOPY_M +$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif -$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c +ifdef CTRMMUTCOPY_M +$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + 
+$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif -$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c +ifdef CTRMMLTCOPY_M +$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif $(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -1929,11 +1961,21 @@ $(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N) $(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef CSYMMUCOPY_M +$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef CSYMMLCOPY_M +$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1941,11 +1983,21 @@ $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N) $(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef ZSYMMUCOPY_M +$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef ZSYMMLCOPY_M +$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : 
generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1965,11 +2017,21 @@ $(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N $(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ +ifdef CHEMMUTCOPY_M +$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +else $(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +endif +ifdef CHEMMLTCOPY_M +$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +else $(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +endif $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ @@ -1977,11 +2039,21 @@ $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N $(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ +ifdef ZHEMMUTCOPY_M +$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +else $(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +endif +ifdef ZHEMMLTCOPY_M +$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +else $(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +endif $(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ From d30157d8914c812f97d1b4de7631ead7440b3d3e Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 09:00:54 +0100 Subject: [PATCH 619/681] update configuration of kernels for A64FX and ARMV8SVE --- kernel/arm64/KERNEL.A64FX | 29 +++++++++++++------ kernel/arm64/KERNEL.ARMV8SVE | 54 +++++++++++++++++++++++++----------- 2 files changed, 59 insertions(+), 24 deletions(-) diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index ff5d3aa0e..76dda0c65 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -156,19 +156,30 @@ DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c DSYMMUCOPY_M = symm_ucopy_sve.c DSYMMLCOPY_M = symm_lcopy_sve.c -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = 
ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = chemm_ltcopy_sve.c +CHEMMUTCOPY_M = chemm_utcopy_sve.c + +CSYMMUCOPY_M = zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S @@ -190,3 +201,5 @@ ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c ZHEMMUTCOPY_M = zhemm_utcopy_sve.c +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 0364a929c..63dfde22f 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -156,28 +156,50 @@ DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c DSYMMUCOPY_M = symm_ucopy_sve.c DSYMMLCOPY_M = symm_lcopy_sve.c -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = chemm_ltcopy_sve.c +CHEMMUTCOPY_M = chemm_utcopy_sve.c + +CSYMMUCOPY_M = zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = 
zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c + +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c From 87537b8c553a3d79ae2123b36716cc22a20280b1 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 09:07:28 +0100 Subject: [PATCH 620/681] modify sve zgemmcopy kernels --- kernel/arm64/zgemm_ncopy_sve_v1.c | 3 +-- kernel/arm64/zgemm_tcopy_sve_v1.c | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c index 57035f4ff..8f9b4268a 100644 --- a/kernel/arm64/zgemm_ncopy_sve_v1.c +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ IFLOAT *aoffset, *aoffset1, *boffset; svint64_t lda_vec = svindex_s64(0LL, lda * 2); - uint64_t sve_size = svcntd(); aoffset = a; boffset = b; @@ -67,7 +66,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset1 += 2; boffset += active * 2; } - aoffset += sve_size * lda * 2; + aoffset += active * lda * 2; j += svcntd(); pg = svwhilelt_b64(j, n); diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c index 32f217d7a..c6e50bc1c 100644 --- a/kernel/arm64/zgemm_tcopy_sve_v1.c +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -46,8 +46,6 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG j; IFLOAT *aoffset, *aoffset1, *boffset; - uint64_t sve_size = svcntd(); - aoffset = a; boffset = b; From 18102ae8c317c0e2ba371ecff2d35b72132976e3 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 09:09:18 +0100 Subject: [PATCH 621/681] add cgemm ctrmm sve kernels --- kernel/arm64/cgemm_kernel_sve_v1x4.S | 874 ++++++++++++++++++++++ kernel/arm64/ctrmm_kernel_sve_v1x4.S | 1006 ++++++++++++++++++++++++++ 2 files changed, 1880 insertions(+) create mode 100644 kernel/arm64/cgemm_kernel_sve_v1x4.S create mode 100644 kernel/arm64/ctrmm_kernel_sve_v1x4.S diff --git a/kernel/arm64/cgemm_kernel_sve_v1x4.S b/kernel/arm64/cgemm_kernel_sve_v1x4.S new file mode 100644 index 000000000..38770f66b --- /dev/null +++ b/kernel/arm64/cgemm_kernel_sve_v1x4.S @@ -0,0 +1,874 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR w19 +#define alphaI w20 + +#define alphaz_R z6.s +#define alphaz_I z7.s +#define alpha0_R s4 +#define alpha0_I s5 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x4_I + ld2w {z0.s, 
z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + ld2w {z2.s, z3.s}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.s, p1/m, z0.s, z9.s +#else + fmla z17.s, p1/m, z0.s, z9.s +#endif + OP_ii z16.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + + fmla z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.s, p1/m, z0.s, z11.s +#else + fmla z19.s, p1/m, z0.s, z11.s +#endif + ld1rw z11.s, p0/z, [pB, 12] + + + fmla z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.s, p1/m, z0.s, z13.s +#else + fmla z21.s, p1/m, z0.s, z13.s +#endif + OP_ii z20.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + + fmla z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.s, p1/m, z0.s, z15.s +#else + fmla z23.s, p1/m, z0.s, z15.s +#endif + OP_ii z22.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2w {z2.s, z3.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes *2 * 4 + + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z2.s, 
z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 32 + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + ld2w {z26.s, z27.s}, p1/z, [pCRow1] + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld2w {z28.s, z29.s}, p1/z, [pCRow2] + fmla z28.s, p1/m, z20.s, alphaz_R + fmls z28.s, p1/m, z21.s, alphaz_I + fmla z29.s, p1/m, z20.s, alphaz_I + fmla z29.s, p1/m, z21.s, alphaz_R + st2w {z28.s, z29.s}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #3 + + ld2w {z30.s, z31.s}, p1/z, [pCRow3] + fmla z30.s, p1/m, z22.s, alphaz_R + fmls z30.s, p1/m, z23.s, alphaz_I + fmla z31.s, p1/m, z22.s, alphaz_I + fmla z31.s, p1/m, z23.s, alphaz_R + st2w {z30.s, z31.s}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld2w {z0.s, 
z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 16 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + ld2w {z26.s, z27.s}, p1/z, [pCRow1] + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.s, #0 + dup z17.s, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + dup alphaz_R, alphaR + fmov alphaI, s1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #3 // ldc = ldc * 2 * 4 + ptrue p0.s // create true predicate + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lcgemm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lcgemm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + + mov pA, origPA // pA = start of A array + +.Lcgemm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
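A pattern shared by all of the SVE kernels and copy routines in these
patches is the predicated M loop: whilelt produces a full predicate
while at least a vector's worth of rows remains and a partial predicate
for the last M % VL rows, so the tail runs through the same vector code
with no scalar cleanup. A minimal standalone sketch of that idiom in C
intrinsics (illustrative only; copy_column and its signature are
hypothetical, not part of these patches):

#include <arm_sve.h>

/* Hypothetical example of the whilelt/predicate idiom used by the
   sve_v1 copy routines: copy n doubles, letting the predicate absorb
   the final partial vector instead of a scalar tail loop. */
void copy_column(const double *src, double *dst, int64_t n)
{
    int64_t i = 0;
    svbool_t pg = svwhilelt_b64(i, n);       /* lanes i..min(i+VL,n)-1 active */
    while (svptest_any(svptrue_b64(), pg)) {
        svfloat64_t v = svld1(pg, src + i);  /* inactive lanes read as zero */
        svst1(pg, dst + i, v);               /* inactive lanes are not stored */
        i += svcntd();                       /* vector length in doubles */
        pg = svwhilelt_b64(i, n);
    }
}

The kernels extend this with cntp to turn the predicate into the
active-lane count (the lanes register) used for pointer arithmetic, and
with incd/incw to advance the M counter by the vector length.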
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lcgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 + cmp counterL , #2 + blt .Lcgemm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lcgemm_kernel_L4_Mv1_22a + + .align 5 +.Lcgemm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L4_Mv1_22 + + .align 5 +.Lcgemm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lcgemm_kernel_L4_Mv1_44 + + .align 5 +.Lcgemm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lcgemm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lcgemm_kernel_L4_Mv1_44 + + +.Lcgemm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lcgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Lcgemm_kernel_L4_Mv1_100 + + .align 5 +.Lcgemm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lcgemm_kernel_L4_Mv1_46 + +.Lcgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lcgemm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Lcgemm_kernel_L4_Mv1_20 + + + +.Lcgemm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 4 * 2 + + subs counterJ, counterJ , #1 // j-- + bgt .Lcgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lcgemm_kernel_L999 + + tst counterJ , #2 + ble .Lcgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +.Lcgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lcgemm_kernel_L2_Mv1_20: + + INITv1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lcgemm_kernel_L2_Mv1_40 + .align 5 + +.Lcgemm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L2_Mv1_22 + + +.Lcgemm_kernel_L2_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lcgemm_kernel_L2_Mv1_100 + +.Lcgemm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L2_Mv1_42 + +.Lcgemm_kernel_L2_Mv1_100: + + SAVEv1x2 + +.Lcgemm_kernel_L2_Mv1_END: + + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lcgemm_kernel_L2_Mv1_20 + + +.Lcgemm_kernel_L2_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 2 * 4 * 2 + 
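+
+// Note: the predicated M loops in this kernel all follow the same
+// pattern; as an illustrative C-style sketch using SVE ACLE intrinsics
+// (variable names here are hypothetical, not part of this kernel):
+//
+//   for (int32_t i = 0; i < M; i += svcntw()) {
+//       svbool_t p1 = svwhilelt_b32(i, M);     // partial at the tail
+//       uint64_t lanes = svcntp_b32(p0, p1);   // active lanes, used
+//                                              // for pointer strides
+//       /* ... one predicated panel of `lanes` rows of C ... */
+//   }
+//
+// so the final M % SVE_LEN rows are handled by predication instead of
+// a scalar tail loop.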
+/******************************************************************************/ + +.Lcgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lcgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +.Lcgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lcgemm_kernel_L1_Mv1_20: + + INITv1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lcgemm_kernel_L1_Mv1_40 + .align 5 + +.Lcgemm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L1_Mv1_22 + + +.Lcgemm_kernel_L1_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lcgemm_kernel_L1_Mv1_100 + +.Lcgemm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L1_Mv1_42 + +.Lcgemm_kernel_L1_Mv1_100: + + SAVEv1x1 + +.Lcgemm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lcgemm_kernel_L1_Mv1_20 + +.Lcgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lcgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/ctrmm_kernel_sve_v1x4.S b/kernel/arm64/ctrmm_kernel_sve_v1x4.S new file mode 100644 index 000000000..242968f63 --- /dev/null +++ b/kernel/arm64/ctrmm_kernel_sve_v1x4.S @@ -0,0 +1,1006 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR w19 +#define alphaI w20 +#define temp x21 +#define tempOffset x22 +#define tempK x23 + +#define alphaz_R z6.s +#define alphaz_I z7.s +#define alpha0_R s6 +#define alpha0_I s7 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 
+ dup z23.s, #0 +.endm + +.macro KERNELv1x4_I + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + ld2w {z2.s, z3.s}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.s, p1/m, z0.s, z9.s +#else + fmla z17.s, p1/m, z0.s, z9.s +#endif + OP_ii z16.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + + fmla z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.s, p1/m, z0.s, z11.s +#else + fmla z19.s, p1/m, z0.s, z11.s +#endif + ld1rw z11.s, p0/z, [pB, 12] + + + fmla z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.s, p1/m, z0.s, z13.s +#else + fmla z21.s, p1/m, z0.s, z13.s +#endif + OP_ii z20.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + + fmla z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.s, p1/m, z0.s, z15.s +#else + fmla z23.s, p1/m, z0.s, z15.s +#endif + OP_ii z22.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2w {z2.s, z3.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes *2 * 4 + + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s 
+ ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 32 + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + eor z28.d, z16.d, z16.d + eor z29.d, z16.d, z16.d + fmla z28.s, p1/m, z20.s, alphaz_R + fmls z28.s, p1/m, z21.s, alphaz_I + fmla z29.s, p1/m, z20.s, alphaz_I + fmla z29.s, p1/m, z21.s, alphaz_R + st2w {z28.s, z29.s}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #3 + + eor z30.d, z16.d, z16.d + eor z31.d, z16.d, z16.d + fmla z30.s, p1/m, z22.s, alphaz_R + fmls z30.s, p1/m, z23.s, alphaz_I + fmla z31.s, p1/m, z22.s, alphaz_I + fmla z31.s, p1/m, z23.s, alphaz_R + st2w {z30.s, z31.s}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 
+ dup z16.s, #0
+ dup z17.s, #0
+ dup z18.s, #0
+ dup z19.s, #0
+.endm
+
+.macro KERNELv1x2_SUB
+ ld2w {z0.s, z1.s}, p1/z, [pA]
+ add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4
+
+ ld1rw z8.s, p0/z, [pB]
+ ld1rw z9.s, p0/z, [pB, 4]
+ ld1rw z10.s, p0/z, [pB, 8]
+ ld1rw z11.s, p0/z, [pB, 12]
+
+ OP_rr z16.s, p1/m, z0.s, z8.s
+ OP_ir z17.s, p1/m, z1.s, z8.s
+ OP_ii z16.s, p1/m, z1.s, z9.s
+ OP_ri z17.s, p1/m, z0.s, z9.s
+
+ OP_rr z18.s, p1/m, z0.s, z10.s
+ OP_ir z19.s, p1/m, z1.s, z10.s
+ OP_ii z18.s, p1/m, z1.s, z11.s
+ OP_ri z19.s, p1/m, z0.s, z11.s
+
+ add pB, pB, 16
+.endm
+
+.macro SAVEv1x2
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+
+ eor z24.d, z16.d, z16.d
+ eor z25.d, z16.d, z16.d
+ fmla z24.s, p1/m, z16.s, alphaz_R
+ fmls z24.s, p1/m, z17.s, alphaz_I
+ fmla z25.s, p1/m, z16.s, alphaz_I
+ fmla z25.s, p1/m, z17.s, alphaz_R
+ st2w {z24.s, z25.s}, p1, [pCRow0]
+
+ add pCRow0, pCRow0, lanes, lsl #3
+
+ eor z26.d, z16.d, z16.d
+ eor z27.d, z16.d, z16.d
+ fmla z26.s, p1/m, z18.s, alphaz_R
+ fmls z26.s, p1/m, z19.s, alphaz_I
+ fmla z27.s, p1/m, z18.s, alphaz_I
+ fmla z27.s, p1/m, z19.s, alphaz_R
+ st2w {z26.s, z27.s}, p1, [pCRow1]
+
+ add pCRow1, pCRow1, lanes, lsl #3
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+
+.endm
+
+/******************************************************************************/
+
+
+.macro INITv1x1
+ dup z16.s, #0
+ dup z17.s, #0
+.endm
+
+
+.macro KERNELv1x1_SUB
+ ld2w {z0.s, z1.s}, p1/z, [pA]
+ add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4
+
+ ld1rw z8.s, p0/z, [pB]
+ ld1rw z9.s, p0/z, [pB, 4]
+
+ add pB, pB, 8
+
+ OP_rr z16.s, p1/m, z0.s, z8.s
+ OP_ir z17.s, p1/m, z1.s, z8.s
+ OP_ii z16.s, p1/m, z1.s, z9.s
+ OP_ri z17.s, p1/m, z0.s, z9.s
+.endm
+
+.macro SAVEv1x1
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+
+ eor z24.d, z16.d, z16.d
+ eor z25.d, z16.d, z16.d
+ fmla z24.s, p1/m, z16.s, alphaz_R
+ fmls z24.s, p1/m, z17.s, alphaz_I
+ fmla z25.s, p1/m, z16.s, alphaz_I
+ fmla z25.s, p1/m, z17.s, alphaz_R
+ st2w {z24.s, z25.s}, p1, [pCRow0]
+
+ add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 * 4
+
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+
+.endm
+
+/******************************************************************************/
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
+ prfm PLDL1KEEP, [origPB]
+ prfm PLDL1KEEP, [origPA]
+
+ fmov alphaR, s0
+ dup alphaz_R, alphaR
+ fmov alphaI, s1
+ dup alphaz_I, alphaI
+
+ lsl LDC, LDC, #3 // ldc = ldc * 2 * 4
+ ptrue p0.s // create true predicate
+
+#if !defined(LEFT)
+ neg tempOffset, offset
+#endif
+
+ mov pB, origPB
+
+// Loop over N
+ mov counterJ, origN
+ asr counterJ, counterJ, #2 // J = J / 4
+ cmp counterJ, #0
+ ble .Lctrmm_kernel_L2_BEGIN
+
+/******************************************************************************/
+.Lctrmm_kernel_L4_BEGIN:
+ mov pCRow0, pC
+ add pCRow1, pCRow0, LDC
+ add pCRow2, pCRow1, LDC
+ add pCRow3, pCRow2, LDC
+
+ add pC, pCRow3, LDC
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = start of A array
+
+.Lctrmm_kernel_L4_Mv1_BEGIN:
+
+/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
+ mov counterI, #0
+ whilelt p1.s, counterI, origM
+ cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension
+
+ .align 5
+.Lctrmm_kernel_L4_Mv1_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ mul temp, tempOffset, lanes
+ add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, lanes
+#else
+ add tempK, tempOffset, #4
+#endif
+ INITv1x4 // fill with zeros
+
+ asr counterL , tempK, #3
+ cmp counterL , #2
+ blt .Lctrmm_kernel_L4_Mv1_32
+
+ KERNELv1x4_I
+ KERNELv1x4_M2
+ KERNELv1x4_M1
+ KERNELv1x4_M2
+ KERNELv1x4_M1
+ KERNELv1x4_M2
+ KERNELv1x4_M1
+ KERNELv1x4_M2
+
+ subs counterL, counterL, #2 // subtract 2
+ ble .Lctrmm_kernel_L4_Mv1_22a
+
+ .align 5
+.Lctrmm_kernel_L4_Mv1_22:
+
+ KERNELv1x4_M1
+ KERNELv1x4_M2
+ KERNELv1x4_M1
+ KERNELv1x4_M2
+ KERNELv1x4_M1
+ KERNELv1x4_M2
+ KERNELv1x4_M1
+ KERNELv1x4_M2
+
+ subs counterL, counterL, #1
+ bgt .Lctrmm_kernel_L4_Mv1_22
+
+ .align 5
+.Lctrmm_kernel_L4_Mv1_22a:
+
+ KERNELv1x4_M1
+ KERNELv1x4_M2
+ KERNELv1x4_M1
+ KERNELv1x4_M2
+ KERNELv1x4_M1
+ KERNELv1x4_M2
+ KERNELv1x4_M1
+ KERNELv1x4_E
+
+ b .Lctrmm_kernel_L4_Mv1_44
+
+ .align 5
+.Lctrmm_kernel_L4_Mv1_32:
+
+ tst counterL, #1
+ ble .Lctrmm_kernel_L4_Mv1_40
+
+ KERNELv1x4_I
+ KERNELv1x4_M2
+ KERNELv1x4_M1
+ KERNELv1x4_M2
+ KERNELv1x4_M1
+ KERNELv1x4_M2
+ KERNELv1x4_M1
+ KERNELv1x4_E
+
+ b .Lctrmm_kernel_L4_Mv1_44
+
+
+.Lctrmm_kernel_L4_Mv1_40:
+
+ INITv1x4
+
+.Lctrmm_kernel_L4_Mv1_44:
+
+ ands counterL , tempK, #7
+ ble .Lctrmm_kernel_L4_Mv1_100
+
+ .align 5
+.Lctrmm_kernel_L4_Mv1_46:
+ KERNELv1x4_SUB
+
+ subs counterL, counterL, #1
+ bne .Lctrmm_kernel_L4_Mv1_46
+
+.Lctrmm_kernel_L4_Mv1_100:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, lanes
+#else
+ sub tempK, tempK, #4
+#endif
+ mul temp, tempK, lanes
+ add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, lanes
+#endif
+
+ prfm PLDL1KEEP, [pA]
+ prfm PLDL1KEEP, [pA, #64]
+ prfm PLDL1KEEP, [origPB]
+
+ SAVEv1x4
+
+.Lctrmm_kernel_L4_Mv1_END:
+
+ incw counterI
+ whilelt p1.s, counterI, origM //SVE instruction
+ cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension
+ b.any .Lctrmm_kernel_L4_Mv1_20
+
+
+
+.Lctrmm_kernel_L4_END:
+
+ lsl temp, origK, #5
+ add origPB, origPB, temp // B = B + K * 4 * 4 * 2
+
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+ subs counterJ, counterJ , #1 // j--
+ bgt .Lctrmm_kernel_L4_BEGIN
+
+
+/******************************************************************************/
+
+.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
+
+ mov counterJ , origN
+ tst counterJ , #3
+ ble .Lctrmm_kernel_L999
+
+ tst counterJ , #2
+ ble .Lctrmm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+ add pCRow1, pCRow0, LDC
+
+ add pC,pC,LDC, lsl #1
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+
+ mov pA, origPA // pA = A
+
+
+
+.Lctrmm_kernel_L2_Mv1_BEGIN:
+
+ mov counterI, #0
+ whilelt p1.s, counterI, origM //SVE instruction
+ cntp lanes, p0, p1.s
+
+
+.Lctrmm_kernel_L2_Mv1_20:
+
+ INITv1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ mul temp, tempOffset, lanes
+ add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, lanes
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble .Lctrmm_kernel_L2_Mv1_40
+ .align 5
+
+.Lctrmm_kernel_L2_Mv1_22:
+ KERNELv1x2_SUB
+ KERNELv1x2_SUB
+ KERNELv1x2_SUB
+ KERNELv1x2_SUB
+
+ KERNELv1x2_SUB
+ KERNELv1x2_SUB
+ KERNELv1x2_SUB
+ KERNELv1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt .Lctrmm_kernel_L2_Mv1_22
+
+
+.Lctrmm_kernel_L2_Mv1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble .Lctrmm_kernel_L2_Mv1_100
+
+.Lctrmm_kernel_L2_Mv1_42:
+
+ KERNELv1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt .Lctrmm_kernel_L2_Mv1_42
+
+.Lctrmm_kernel_L2_Mv1_100:
+
+ SAVEv1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, lanes
+#else
+ sub tempK, tempK, #2
+#endif
+ mul temp, tempK, lanes
+ add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, lanes
+#endif
+
+.Lctrmm_kernel_L2_Mv1_END:
+
+
+ incw counterI
+ whilelt p1.s, counterI, origM //SVE instruction
+ cntp lanes, p0, p1.s
+ b.any .Lctrmm_kernel_L2_Mv1_20
+
+
+.Lctrmm_kernel_L2_END:
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+ lsl temp, origK, #4
+ add origPB, origPB, temp // B = B + K * 2 * 4 * 2
+
+/******************************************************************************/
+
+.Lctrmm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble .Lctrmm_kernel_L999 // done
+
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC , pC , LDC // Update pC to point to next
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+
+ mov pA, origPA // pA = A
+
+.Lctrmm_kernel_L1_Mv1_BEGIN:
+
+ mov counterI, #0
+ whilelt p1.s, counterI, origM //SVE instruction
+ cntp lanes, p0, p1.s
+
+
+.Lctrmm_kernel_L1_Mv1_20:
+
+ INITv1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ mul temp, tempOffset, lanes
+ add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, lanes
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble .Lctrmm_kernel_L1_Mv1_40
+ .align 5
+
+.Lctrmm_kernel_L1_Mv1_22:
+ KERNELv1x1_SUB
+ KERNELv1x1_SUB
+ KERNELv1x1_SUB
+ KERNELv1x1_SUB
+
+ KERNELv1x1_SUB
+ KERNELv1x1_SUB
+ KERNELv1x1_SUB
+ KERNELv1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt .Lctrmm_kernel_L1_Mv1_22
+
+
+.Lctrmm_kernel_L1_Mv1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble .Lctrmm_kernel_L1_Mv1_100
+
+.Lctrmm_kernel_L1_Mv1_42:
+
+ KERNELv1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt .Lctrmm_kernel_L1_Mv1_42
+
+.Lctrmm_kernel_L1_Mv1_100:
+
+ SAVEv1x1
+
+#if 
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lctrmm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lctrmm_kernel_L1_Mv1_20 + +.Lctrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lctrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + From 39ab2197048efca92d059f919987571cd92a903c Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 09:12:22 +0100 Subject: [PATCH 622/681] sve copy functions for cgemm chemm zsymm --- kernel/arm64/cgemm_ncopy_sve_v1.c | 79 ++++++++++++++++ kernel/arm64/cgemm_tcopy_sve_v1.c | 75 +++++++++++++++ kernel/arm64/chemm_ltcopy_sve.c | 107 +++++++++++++++++++++ kernel/arm64/chemm_utcopy_sve.c | 108 +++++++++++++++++++++ kernel/arm64/zsymm_lcopy_sve.c | 150 ++++++++++++++++++++++++++++++ kernel/arm64/zsymm_ucopy_sve.c | 150 ++++++++++++++++++++++++++++++ param.h | 6 +- 7 files changed, 673 insertions(+), 2 deletions(-) create mode 100644 kernel/arm64/cgemm_ncopy_sve_v1.c create mode 100644 kernel/arm64/cgemm_tcopy_sve_v1.c create mode 100644 kernel/arm64/chemm_ltcopy_sve.c create mode 100644 kernel/arm64/chemm_utcopy_sve.c create mode 100644 kernel/arm64/zsymm_lcopy_sve.c create mode 100644 kernel/arm64/zsymm_ucopy_sve.c diff --git a/kernel/arm64/cgemm_ncopy_sve_v1.c b/kernel/arm64/cgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..6aa44a8f6 --- /dev/null +++ b/kernel/arm64/cgemm_ncopy_sve_v1.c @@ -0,0 +1,79 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <arm_sve.h>
+
+// TODO: write in assembly with proper unrolling of inner loop
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+
+    BLASLONG j;
+    IFLOAT *aoffset, *aoffset1, *boffset;
+
+    svint32_t lda_vec = svindex_s32(0, lda * 2);
+
+    aoffset = a;
+    boffset = b;
+
+    j = 0;
+    svbool_t pg = svwhilelt_b32(j, n);
+    uint32_t active = svcntp_b32(svptrue_b32(), pg);
+    do {
+
+        aoffset1 = aoffset;
+
+        uint32_t i_cnt = m;
+        while (i_cnt--) {
+            svfloat32_t a_vec_real = svld1_gather_index(pg, (float *) aoffset1, lda_vec);
+            svfloat32_t a_vec_imag = svld1_gather_index(pg, ((float *) aoffset1) + 1, lda_vec);
+            svst2_f32(pg, (float *) boffset, svcreate2(a_vec_real, a_vec_imag));
+            aoffset1 += 2;
+            boffset += active * 2;
+        }
+        aoffset += active * lda * 2;
+
+        j += svcntw();
+        pg = svwhilelt_b32(j, n);
+        active = svcntp_b32(svptrue_b32(), pg);
+
+
+    } while (svptest_any(svptrue_b32(), pg));
+
+    return 0;
+}
diff --git a/kernel/arm64/cgemm_tcopy_sve_v1.c b/kernel/arm64/cgemm_tcopy_sve_v1.c
new file mode 100644
index 000000000..748cd954e
--- /dev/null
+++ b/kernel/arm64/cgemm_tcopy_sve_v1.c
@@ -0,0 +1,75 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <arm_sve.h>
+
+// TODO: write in assembly with proper unrolling of inner loop
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+
+    BLASLONG j;
+    IFLOAT *aoffset, *aoffset1, *boffset;
+
+    aoffset = a;
+    boffset = b;
+
+    j = 0;
+    svbool_t pg = svwhilelt_b32(j, n);
+    uint32_t active = svcntp_b32(svptrue_b32(), pg);
+    do {
+
+        aoffset1 = aoffset;
+
+        uint32_t i_cnt = m;
+        while (i_cnt--) {
+            svfloat32x2_t a_vec = svld2(pg, (float *)aoffset1);
+            svst2_f32(pg, (float *) boffset, a_vec);
+            aoffset1 += lda * 2;
+            boffset += active * 2;
+        }
+        aoffset += active * 2;
+
+        j += svcntw();
+        pg = svwhilelt_b32(j, n);
+        active = svcntp_b32(svptrue_b32(), pg);
+
+    } while (svptest_any(svptrue_b32(), pg));
+
+    return 0;
+}
diff --git a/kernel/arm64/chemm_ltcopy_sve.c b/kernel/arm64/chemm_ltcopy_sve.c
new file mode 100644
index 000000000..40cf9ea31
--- /dev/null
+++ b/kernel/arm64/chemm_ltcopy_sve.c
@@ -0,0 +1,107 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <arm_sve.h>
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    int offset, i;
+
+    lda *= 2;
+
+    uint32_t sve_size = svcntw();
+    svint32_t posY_vec = svdup_s32(posY);
+    svint32_t posX_vec = svdup_s32(posX);
+    svint32_t lda_vec = svdup_s32(lda);
+    svint32_t one_vec = svdup_s32(1);
+
+    int32_t j = 0;
+    int32_t N = n;
+    svbool_t pg = svwhilelt_b32(j, N);
+    int32_t active = svcntp_b32(svptrue_b32(), pg);
+    svint32_t index_neg = svindex_s32(0, -1);
+    svint32_t index = svindex_s32(0, 1);
+
+    do {
+        offset = posX - posY;
+        svint32_t vec_off = svdup_s32(offset);
+        svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
+
+        svint32_t temp = svadd_z(pg, posX_vec, index);
+        svint32_t temp1 = svmul_z(pg, temp, 2);
+        temp1 = svmla_z(pg, temp1, posY_vec, lda_vec);
+        svint32_t temp2 = svmul_z(pg, temp, lda_vec);
+        temp2 = svmla_z(pg, temp2, posY_vec, 2);
+        svint32_t gat_ind = svsel(cmp, temp1, temp2);
+
+        i = m;
+        while (i>0) {
+            svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind);
+            svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind);
+
+            gat_ind = svadd_m(cmp, gat_ind, lda_vec);
+            gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
+            if (offset <= 0) {
+                svbool_t off_g = svwhilelt_b32(offset, 0);
+                data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
+            }
+
+            svst2(pg, b, svcreate2(data_vec_real, data_vec_imag));
+            // dealing with ZERO separately
+            if (offset > -active && offset < 1)
+                b[ -2*offset + 1 ] = ZERO;
+
+            b += active * 2;
+            offset --;
+            vec_off = svsub_z(pg, vec_off, one_vec);
+            cmp = svcmpgt(pg, vec_off, index_neg);
+
+            i--;
+        }
+
+        posX += sve_size;
+        posX_vec = svdup_s32(posX);
+        j += sve_size;
+        pg = svwhilelt_b32(j, N);
+        active = svcntp_b32(svptrue_b32(), pg);
+    } while (svptest_any(svptrue_b32(), pg));
+
+    return 0;
+}
diff --git a/kernel/arm64/chemm_utcopy_sve.c b/kernel/arm64/chemm_utcopy_sve.c
new file mode 100644
index 000000000..440acdb1b
--- /dev/null
+++ b/kernel/arm64/chemm_utcopy_sve.c
@@ -0,0 +1,108 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <arm_sve.h>
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    int offset, i;
+
+    lda *= 2;
+
+    uint32_t sve_size = svcntw();
+    svint32_t posY_vec = svdup_s32(posY);
+    svint32_t posX_vec = svdup_s32(posX);
+    svint32_t lda_vec = svdup_s32(lda);
+    svint32_t one_vec = svdup_s32(1);
+
+    int32_t j = 0;
+    int32_t N = n;
+    svbool_t pg = svwhilelt_b32(j, N);
+    int32_t active = svcntp_b32(svptrue_b32(), pg);
+    svint32_t index_neg = svindex_s32(0, -1);
+    svint32_t index = svindex_s32(0, 1);
+
+    do {
+        offset = posX - posY;
+        svint32_t vec_off = svdup_s32(offset);
+        svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
+
+        svint32_t temp = svadd_z(pg, posX_vec, index);
+        svint32_t temp1 = svmul_z(pg, temp, lda);
+        temp1 = svmla_z(pg, temp1, posY_vec, 2);
+        svint32_t temp2 = svmul_z(pg, temp, 2);
+        temp2 = svmla_z(pg, temp2, posY_vec, lda);
+        svint32_t gat_ind = svsel(cmp, temp1, temp2);
+
+        i = m;
+        while (i>0) {
+            svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind);
+            svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind);
+
+            gat_ind = svadd_m(cmp, gat_ind, 2);
+            gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
+            data_vec_imag = svneg_z(pg, data_vec_imag);
+            if (offset <= 0) {
+                svbool_t off_g = svwhilelt_b32(offset, 0);
+                data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
+            }
+
+            svst2(pg, b, svcreate2(data_vec_real, data_vec_imag));
+            // dealing with ZERO separately
+            if (offset > -active && offset < 1)
+                b[ -2*offset + 1 ] = ZERO;
+
+            b += active * 2;
+            offset --;
+            vec_off = svsub_z(pg, vec_off, one_vec);
+            cmp = svcmpgt(pg, vec_off, index_neg);
+
+            i--;
+        }
+
+        posX += sve_size;
+        posX_vec = svdup_s32(posX);
+        j += sve_size;
+        pg = svwhilelt_b32(j, N);
+        active = svcntp_b32(svptrue_b32(), pg);
+    } while (svptest_any(svptrue_b32(), pg));
+
+    return 0;
+}
diff --git a/kernel/arm64/zsymm_lcopy_sve.c b/kernel/arm64/zsymm_lcopy_sve.c
new file mode 100644
index 000000000..6f18aa956
--- /dev/null
+++ b/kernel/arm64/zsymm_lcopy_sve.c
@@ -0,0 +1,150 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <arm_sve.h>
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, offset;
+    lda *= 2;
+
+#if defined(DOUBLE)
+    uint64_t sve_size = svcntd();
+    svint64_t posY_vec = svdup_s64(posY);
+    svint64_t posX_vec = svdup_s64(posX);
+    svint64_t lda_vec = svdup_s64(lda);
+    svint64_t one_vec = svdup_s64(1LL);
+
+    int64_t j = 0;
+    svbool_t pg = svwhilelt_b64(j, n);
+    int64_t active = svcntp_b64(svptrue_b64(), pg);
+    svint64_t index_neg = svindex_s64(0LL, -1LL);
+    svint64_t index = svindex_s64(0LL, 1LL);
+    do {
+        offset = posX - posY;
+        svint64_t vec_off = svdup_s64(offset);
+        svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
+
+        svint64_t temp = svadd_z(pg, posX_vec, index);
+        svint64_t temp1 = svmul_z(pg, temp, 2);
+        temp1 = svmla_z(pg, temp1, posY_vec, lda_vec);
+        svint64_t temp2 = svmul_z(pg, temp, lda_vec);
+        temp2 = svmla_z(pg, temp2, posY_vec, 2);
+        svint64_t gat_ind = svsel(cmp, temp1, temp2);
+
+        i = m;
+        while (i>0) {
+            svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind);
+            svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind);
+
+            gat_ind = svadd_m(cmp, gat_ind, lda_vec);
+            gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
+
+            svst2(pg, b, svcreate2(data_vec_real, data_vec_imag));
+
+            b += active * 2;
+            offset --;
+            vec_off = svsub_z(pg, vec_off, one_vec);
+            cmp = svcmpgt(pg, vec_off, index_neg);
+
+            i--;
+        }
+
+        posX += sve_size;
+        posX_vec = svdup_s64(posX);
+        j += sve_size;
+        pg = svwhilelt_b64(j, n);
+        active = svcntp_b64(svptrue_b64(), pg);
+    } while (svptest_any(svptrue_b64(), pg));
+
+#else
+    uint32_t sve_size = svcntw();
+    svint32_t posY_vec = svdup_s32(posY);
+    svint32_t posX_vec = svdup_s32(posX);
+    svint32_t lda_vec = svdup_s32(lda);
+    svint32_t one_vec = svdup_s32(1);
+
+    int32_t N = n;
+    int32_t j = 0;
+    svbool_t pg = svwhilelt_b32(j, N);
+    int32_t active = svcntp_b32(svptrue_b32(), pg);
+    svint32_t index_neg = svindex_s32(0, -1);
+    svint32_t index = svindex_s32(0, 1);
+    do {
+        offset = posX - posY;
+        svint32_t vec_off = svdup_s32(offset);
+        svbool_t cmp = 
svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint32_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/zsymm_ucopy_sve.c b/kernel/arm64/zsymm_ucopy_sve.c new file mode 100644 index 000000000..6be48cdaf --- /dev/null +++ b/kernel/arm64/zsymm_ucopy_sve.c @@ -0,0 +1,150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <arm_sve.h>
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, offset;
+    lda *= 2;
+
+#if defined(DOUBLE)
+    uint64_t sve_size = svcntd();
+    svint64_t posY_vec = svdup_s64(posY);
+    svint64_t posX_vec = svdup_s64(posX);
+    svint64_t lda_vec = svdup_s64(lda);
+    svint64_t one_vec = svdup_s64(1LL);
+
+    int64_t j = 0;
+    svbool_t pg = svwhilelt_b64(j, n);
+    int64_t active = svcntp_b64(svptrue_b64(), pg);
+    svint64_t index_neg = svindex_s64(0LL, -1LL);
+    svint64_t index = svindex_s64(0LL, 1LL);
+    do {
+        offset = posX - posY;
+        svint64_t vec_off = svdup_s64(offset);
+        svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
+
+        svint64_t temp = svadd_z(pg, posX_vec, index);
+        svint64_t temp1 = svmul_z(pg, temp, lda_vec);
+        temp1 = svmla_z(pg, temp1, posY_vec, 2);
+        svint64_t temp2 = svmul_z(pg, temp, 2);
+        temp2 = svmla_z(pg, temp2, posY_vec, lda);
+        svint64_t gat_ind = svsel(cmp, temp1, temp2);
+
+        i = m;
+        while (i>0) {
+            svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind);
+            svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind);
+
+            gat_ind = svadd_m(cmp, gat_ind, 2);
+            gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
+
+            svst2(pg, b, svcreate2(data_vec_real, data_vec_imag));
+
+            b += active * 2;
+            offset --;
+            vec_off = svsub_z(pg, vec_off, one_vec);
+            cmp = svcmpgt(pg, vec_off, index_neg);
+
+            i--;
+        }
+
+        posX += sve_size;
+        posX_vec = svdup_s64(posX);
+        j += sve_size;
+        pg = svwhilelt_b64(j, n);
+        active = svcntp_b64(svptrue_b64(), pg);
+    } while (svptest_any(svptrue_b64(), pg));
+
+#else
+    uint32_t sve_size = svcntw();
+    svint32_t posY_vec = svdup_s32(posY);
+    svint32_t posX_vec = svdup_s32(posX);
+    svint32_t lda_vec = svdup_s32(lda);
+    svint32_t one_vec = svdup_s32(1);
+
+    int32_t N = n;
+    int32_t j = 0;
+    svbool_t pg = svwhilelt_b32(j, N);
+    int32_t active = svcntp_b32(svptrue_b32(), pg);
+    svint32_t index_neg = svindex_s32(0, -1);
+    svint32_t index = svindex_s32(0, 1);
+    do {
+        offset = posX - posY;
+        svint32_t vec_off = svdup_s32(offset);
+        svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
+
+        svint32_t temp = svadd_z(pg, posX_vec, index);
+        svint32_t temp1 = svmul_z(pg, temp, lda_vec);
+        temp1 = svmla_z(pg, temp1, posY_vec, 2);
+        svint32_t temp2 = svmul_z(pg, temp, 2);
+        temp2 = svmla_z(pg, temp2, posY_vec, lda);
+        svint32_t gat_ind = svsel(cmp, temp1, temp2);
+
+        i = m;
+        while (i>0) {
+            svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind);
+            svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind);
+
+            gat_ind = svadd_m(cmp, gat_ind, 2);
+            gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
+
+            svst2(pg, b, svcreate2(data_vec_real, data_vec_imag));
+
+            b += active * 2;
+            offset --;
+            vec_off = svsub_z(pg, vec_off, one_vec);
+            cmp = svcmpgt(pg, vec_off, index_neg);
+
+            i--;
+        }
+
+        posX += sve_size;
+        posX_vec = svdup_s32(posX);
+        j += sve_size;
+        pg = svwhilelt_b32(j, N);
+        active = svcntp_b32(svptrue_b32(), pg);
+    } while (svptest_any(svptrue_b32(), pg));
+
+#endif
+
+    return 0;
+}
diff --git a/param.h b/param.h
index 8dd2a7461..5d46991a2 100644
--- a/param.h
+++ b/param.h
@@ -3325,11 +3325,13 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout
 #define DGEMM_DEFAULT_UNROLL_MN 32
 
-#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_M 2
 #define CGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_MN 32
 
-#define 
ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 32 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 160 From 0c91d043ae8d2dba0c7d3eeb2f63d17d9776c7e9 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 14:36:39 +0100 Subject: [PATCH 623/681] adapt CMake for SVE --- kernel/CMakeLists.txt | 50 ++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 9849ddc93..717c1ea72 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -323,35 +323,61 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) #hemm - GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "hemm_iutcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) +if (NOT DEFINED ${float_char}HEMMUTCOPY_M) + set(HEMMUTCOPY_M "generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(HEMMLTCOPY_M "generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(HEMMUTCOPY_M "${KERNELDIR}/${${float_char}HEMMUTCOPY_M}") + set(HEMMLTCOPY_M "${KERNELDIR}/${${float_char}HEMMLTCOPY_M}") +endif() + GenerateNamedObjects(${HEMMUTCOPY_M} "" "hemm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${HEMMLTCOPY_M} "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "hemm_outcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "hemm_oltcopy" false "" "" false ${float_type}) # symm for c and z +if (NOT DEFINED ${float_char}SYMMUCOPY_M) + set(SYMMUCOPY_M "generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c") + set(SYMMLCOPY_M "generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}") + set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}") +endif() GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + - GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) +if (NOT DEFINED ${float_char}TRMMUNCOPY_M) + set(TRMMUNCOPY_M "generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLNCOPY_M "generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMUTCOPY_M "generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLTCOPY_M "generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}") + set(TRMMLNCOPY_M 
"${KERNELDIR}/${${float_char}TRMMLNCOPY_M}") + set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}") + set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}") +endif () + GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) From f33543d029199ee1bf0786e16ff0610a6711c726 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Wed, 5 Jan 2022 14:42:37 +0100 Subject: [PATCH 624/681] combine zchemm into single file --- kernel/arm64/KERNEL.A64FX | 4 +- kernel/arm64/KERNEL.ARMV8SVE | 4 +- kernel/arm64/chemm_ltcopy_sve.c | 107 ------------------------------- kernel/arm64/chemm_utcopy_sve.c | 108 -------------------------------- kernel/arm64/zhemm_ltcopy_sve.c | 66 +++++++++++++++++++ kernel/arm64/zhemm_utcopy_sve.c | 65 +++++++++++++++++++ 6 files changed, 135 insertions(+), 219 deletions(-) delete mode 100644 kernel/arm64/chemm_ltcopy_sve.c delete mode 100644 kernel/arm64/chemm_utcopy_sve.c diff --git 
a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 76dda0c65..d74f0592d 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -174,8 +174,8 @@ CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c -CHEMMLTCOPY_M = chemm_ltcopy_sve.c -CHEMMUTCOPY_M = chemm_utcopy_sve.c +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c CSYMMUCOPY_M = zsymm_ucopy_sve.c CSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 63dfde22f..66de642a5 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -174,8 +174,8 @@ CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c -CHEMMLTCOPY_M = chemm_ltcopy_sve.c -CHEMMUTCOPY_M = chemm_utcopy_sve.c +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c CSYMMUCOPY_M = zsymm_ucopy_sve.c CSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/chemm_ltcopy_sve.c b/kernel/arm64/chemm_ltcopy_sve.c deleted file mode 100644 index 40cf9ea31..000000000 --- a/kernel/arm64/chemm_ltcopy_sve.c +++ /dev/null @@ -1,107 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
 */
-/*********************************************************************/
-
-#include <stdint.h>
-#include "common.h"
-#include <arm_sve.h>
-
-int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
-
- int offset, i;
-
- lda *= 2;
-
- uint32_t sve_size = svcntw();
- svint32_t posY_vec = svdup_s32(posY);
- svint32_t posX_vec = svdup_s32(posX);
- svint32_t lda_vec = svdup_s32(lda);
- svint32_t one_vec = svdup_s32(1);
-
- int32_t j = 0;
- int32_t N = n;
- svbool_t pg = svwhilelt_b32(j, N);
- int32_t active = svcntp_b32(svptrue_b32(), pg);
- svint32_t index_neg = svindex_s32(0, -1);
- svint32_t index = svindex_s32(0, 1);
-
- do {
- offset = posX - posY;
- svint32_t vec_off = svdup_s32(offset);
- svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
-
- svint32_t temp = svadd_z(pg, posX_vec, index);
- svint32_t temp1 = svmul_z(pg, temp, 2);
- temp1 = svmla_z(pg, temp1, posY_vec, lda_vec);
- svint32_t temp2 = svmul_z(pg, temp, lda_vec);
- temp2 = svmla_z(pg, temp2, posY_vec, 2);
- svint32_t gat_ind = svsel(cmp, temp1, temp2);
-
- i = m;
- while (i>0) {
- svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind);
- svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind);
-
- gat_ind = svadd_m(cmp, gat_ind, lda_vec);
- gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
- if (offset <= 0) {
- svbool_t off_g = svwhilelt_b32(offset, 0);
- data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
- }
-
- svst2(pg, b, svcreate2(data_vec_real, data_vec_imag));
- // dealing with ZERO separately
- if (offset > -active && offset < 1)
- b[ -2*offset + 1 ] = ZERO;
-
- b += active * 2;
- offset --;
- vec_off = svsub_z(pg, vec_off, one_vec);
- cmp = svcmpgt(pg, vec_off, index_neg);
-
- i--;
- }
-
- posX += sve_size;
- posX_vec = svdup_s32(posX);
- j += sve_size;
- pg = svwhilelt_b32(j, N);
- active = svcntp_b32(svptrue_b32(), pg);
- } while (svptest_any(svptrue_b32(), pg));
-
- return 0;
-}
diff --git a/kernel/arm64/chemm_utcopy_sve.c b/kernel/arm64/chemm_utcopy_sve.c
deleted file mode 100644
index 440acdb1b..000000000
--- a/kernel/arm64/chemm_utcopy_sve.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*********************************************************************/
-/* Copyright 2009, 2010 The University of Texas at Austin. */
-/* All rights reserved. */
-/* */
-/* Redistribution and use in source and binary forms, with or */
-/* without modification, are permitted provided that the following */
-/* conditions are met: */
-/* */
-/* 1. Redistributions of source code must retain the above */
-/* copyright notice, this list of conditions and the following */
-/* disclaimer. */
-/* */
-/* 2. Redistributions in binary form must reproduce the above */
-/* copyright notice, this list of conditions and the following */
-/* disclaimer in the documentation and/or other materials */
-/* provided with the distribution. */
-/* */
-/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
-/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
-/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
-/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
-/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
-/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
-/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
-/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
-/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
-/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
-/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
-/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
-/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
-/* POSSIBILITY OF SUCH DAMAGE. */
-/* */
-/* The views and conclusions contained in the software and */
-/* documentation are those of the authors and should not be */
-/* interpreted as representing official policies, either expressed */
-/* or implied, of The University of Texas at Austin. */
-/*********************************************************************/
-
-#include <stdint.h>
-#include "common.h"
-#include <arm_sve.h>
-
-int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
-
- int offset, i;
-
- lda *= 2;
-
- uint32_t sve_size = svcntw();
- svint32_t posY_vec = svdup_s32(posY);
- svint32_t posX_vec = svdup_s32(posX);
- svint32_t lda_vec = svdup_s32(lda);
- svint32_t one_vec = svdup_s32(1);
-
- int32_t j = 0;
- int32_t N = n;
- svbool_t pg = svwhilelt_b32(j, N);
- int32_t active = svcntp_b32(svptrue_b32(), pg);
- svint32_t index_neg = svindex_s32(0, -1);
- svint32_t index = svindex_s32(0, 1);
-
- do {
- offset = posX - posY;
- svint32_t vec_off = svdup_s32(offset);
- svbool_t cmp = svcmpgt(pg, vec_off, index_neg);
-
- svint32_t temp = svadd_z(pg, posX_vec, index);
- svint32_t temp1 = svmul_z(pg, temp, lda);
- temp1 = svmla_z(pg, temp1, posY_vec, 2);
- svint32_t temp2 = svmul_z(pg, temp, 2);
- temp2 = svmla_z(pg, temp2, posY_vec, lda);
- svint32_t gat_ind = svsel(cmp, temp1, temp2);
-
- i = m;
- while (i>0) {
- svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind);
- svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind);
-
- gat_ind = svadd_m(cmp, gat_ind, 2);
- gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
- data_vec_imag = svneg_z(pg, data_vec_imag);
- if (offset <= 0) {
- svbool_t off_g = svwhilelt_b32(offset, 0);
- data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
- }
-
- svst2(pg, b, svcreate2(data_vec_real, data_vec_imag));
- // dealing with ZERO separately
- if (offset > -active && offset < 1)
- b[ -2*offset + 1 ] = ZERO;
-
- b += active * 2;
- offset --;
- vec_off = svsub_z(pg, vec_off, one_vec);
- cmp = svcmpgt(pg, vec_off, index_neg);
-
- i--;
- }
-
- posX += sve_size;
- posX_vec = svdup_s32(posX);
- j += sve_size;
- pg = svwhilelt_b32(j, N);
- active = svcntp_b32(svptrue_b32(), pg);
- } while (svptest_any(svptrue_b32(), pg));
-
- return 0;
-}
diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c
index 58e9ff589..37dbfe4e1 100644
--- a/kernel/arm64/zhemm_ltcopy_sve.c
+++ b/kernel/arm64/zhemm_ltcopy_sve.c
@@ -42,6 +42,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+#if defined(DOUBLE)
 BLASLONG offset, i;
 lda *= 2;
@@ -102,5 +103,70 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 active = svcntp_b64(svptrue_b64(), pg);
 } while (svptest_any(svptrue_b64(), pg));
 
+#else
+
+ int offset, i;
+
+ lda *= 2;
+
+ uint32_t sve_size = svcntw();
+ svint32_t posY_vec = svdup_s32(posY);
+ svint32_t posX_vec = svdup_s32(posX);
+ svint32_t lda_vec = 
svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t j = 0; + int32_t N = n; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint32_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b32(offset, 0); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + return 0; } diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c index 9ddbf6cbd..21e03b7be 100644 --- a/kernel/arm64/zhemm_utcopy_sve.c +++ b/kernel/arm64/zhemm_utcopy_sve.c @@ -42,6 +42,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ +#if defined(DOUBLE) BLASLONG offset, i; lda *= 2; @@ -102,6 +103,70 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON pg = svwhilelt_b64(j, n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); +#else + int offset, i; + + lda *= 2; + + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t j = 0; + int32_t N = n; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, lda); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint32_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + data_vec_imag = svneg_z(pg, data_vec_imag); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b32(offset, 0); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b 
+= active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif return 0; } From bb33446b409a388b05d918dd251efd4b445e6f47 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Thu, 6 Jan 2022 10:26:11 +0100 Subject: [PATCH 625/681] fix makefile.L3 --- kernel/Makefile.L3 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 1c0931d96..2a10ac980 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1712,10 +1712,10 @@ $(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) $(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ else -$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ endif @@ -1726,10 +1726,10 @@ $(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) $(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ else -$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ endif @@ -1740,10 +1740,10 @@ $(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) $(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ else -$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ endif From cbcea149f0ed0bf966dafb5bd5b6612945b54858 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Thu, 6 Jan 2022 10:29:35 +0100 Subject: [PATCH 626/681] update contributors --- CONTRIBUTORS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 39ec96246..879aaebe3 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -201,3 +201,5 @@ In chronological 
order: * Bine Brank * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM + * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions + * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions From 19c8f615dc507e20aee724aedb572bdddc2cd497 Mon Sep 17 00:00:00 2001 From: Sunita Nadampalli Date: Fri, 7 Jan 2022 00:28:17 +0000 Subject: [PATCH 627/681] OpenBLAS: aarch64: Add neoverse-v1/n2 architecture specifics --- Makefile.arm64 | 60 +++++++++++ Makefile.system | 3 + TargetList.txt | 2 + cmake/arch.cmake | 2 +- cmake/prebuild.cmake | 60 ++++++++++- cpuid_arm64.c | 44 +++++++- driver/others/dynamic_arm64.c | 2 + getarch.c | 37 ++++++- kernel/arm64/KERNEL.NEOVERSEN2 | 189 +++++++++++++++++++++++++++++++++ kernel/arm64/KERNEL.NEOVERSEV1 | 189 +++++++++++++++++++++++++++++++++ param.h | 58 ++++++++++ 11 files changed, 640 insertions(+), 6 deletions(-) create mode 100644 kernel/arm64/KERNEL.NEOVERSEN2 create mode 100644 kernel/arm64/KERNEL.NEOVERSEV1 diff --git a/Makefile.arm64 b/Makefile.arm64 index 801601030..2eade8d78 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -78,6 +78,66 @@ endif endif endif +# Use a72 tunings because Neoverse-V1 is only available +# in GCC>=9.4 +ifeq ($(CORE), NEOVERSEV1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) +ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 +endif +else +CCOMMON_OPT += -march=armv8.4-a -mtune=native +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a -mtune=native +endif +endif +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif +endif +endif + +# Use a72 tunings because Neoverse-N2 is only available +# in GCC>=9.4 +ifeq ($(CORE), NEOVERSEN2) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) +ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 +endif +else +CCOMMON_OPT += -march=armv8.5-a -mtune=native +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.5-a -mtune=native +endif +endif +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif +endif +endif + # Use a53 tunings because a55 is only available in GCC>=8.1 ifeq ($(CORE), CORTEXA55) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) diff --git a/Makefile.system b/Makefile.system index 97fdc3f91..9203f49cb 100644 --- a/Makefile.system +++ b/Makefile.system @@ -374,6 +374,7 @@ else endif GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) +GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) 
$(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) endif @@ -654,6 +655,8 @@ DYNAMIC_CORE += CORTEXA57 DYNAMIC_CORE += CORTEXA72 DYNAMIC_CORE += CORTEXA73 DYNAMIC_CORE += NEOVERSEN1 +DYNAMIC_CORE += NEOVERSEV1 +DYNAMIC_CORE += NEOVERSEN2 DYNAMIC_CORE += CORTEXA55 DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX diff --git a/TargetList.txt b/TargetList.txt index b02a011d5..97c8a8f06 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -93,6 +93,8 @@ CORTEXA57 CORTEXA72 CORTEXA73 NEOVERSEN1 +NEOVERSEV1 +NEOVERSEN2 CORTEXA55 EMAG8180 FALKOR diff --git a/cmake/arch.cmake b/cmake/arch.cmake index d468eb60b..f4a135e82 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,7 +44,7 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110) if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) endif () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 259d9c738..5f12bb145 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -237,6 +237,61 @@ endif () set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "NEOVERSEN1") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" + "#define L2_SIZE\t1048576\n\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "NEOVERSEV1") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" + "#define L2_SIZE\t1048576\n\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define HAVE_SVE\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "NEOVERSEN2") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" "#define L1_CODE_LINESIZE\t64\n" @@ -246,13 +301,14 @@ endif () "#define L1_DATA_ASSOCIATIVE\t2\n" "#define L2_SIZE\t1048576\n\n" "#define L2_LINESIZE\t64\n" - "#define L2_ASSOCIATIVE\t16\n" - "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" "#define DTB_SIZE\t4096\n" "#define HAVE_VFPV4\n" "#define HAVE_VFPV3\n" "#define HAVE_VFP\n" "#define HAVE_NEON\n" + "#define HAVE_SVE\n" "#define ARMV8\n") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 
958e94abc..cc3a82815 100644
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@@ -43,6 +43,8 @@ size_t length64=sizeof(value64);
 #define CPU_CORTEXA72 4
 #define CPU_CORTEXA73 5
 #define CPU_NEOVERSEN1 11
+#define CPU_NEOVERSEV1 16
+#define CPU_NEOVERSEN2 17
 // Qualcomm
 #define CPU_FALKOR 6
 // Cavium
@@ -71,6 +73,8 @@ static char *cpuname[] = {
 "TSV110",
 "EMAG8180",
 "NEOVERSEN1",
+ "NEOVERSEV1",
+ "NEOVERSEN2",
 "THUNDERX3T110",
 "VORTEX",
 "CORTEXA55",
@@ -90,6 +94,8 @@ static char *cpuname_lower[] = {
 "tsv110",
 "emag8180",
 "neoversen1",
+ "neoversev1",
+ "neoversen2",
 "thunderx3t110",
 "vortex",
 "cortexa55",
@@ -170,6 +176,10 @@ int detect(void)
 return CPU_CORTEXA73;
 else if (strstr(cpu_part, "0xd0c"))
 return CPU_NEOVERSEN1;
+ else if (strstr(cpu_part, "0xd40"))
+ return CPU_NEOVERSEV1;
+ else if (strstr(cpu_part, "0xd49"))
+ return CPU_NEOVERSEN2;
 else if (strstr(cpu_part, "0xd05"))
 return CPU_CORTEXA55;
 }
@@ -338,11 +348,41 @@ void get_cpuconfig(void)
 printf("#define L1_DATA_ASSOCIATIVE 4\n");
 printf("#define L2_SIZE 1048576\n");
 printf("#define L2_LINESIZE 64\n");
- printf("#define L2_ASSOCIATIVE 16\n");
- printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define L2_ASSOCIATIVE 8\n");
+ printf("#define DTB_DEFAULT_ENTRIES 48\n");
 printf("#define DTB_SIZE 4096\n");
 break;
+
+ case CPU_NEOVERSEV1:
+ printf("#define %s\n", cpuname[d]);
+ printf("#define L1_CODE_SIZE 65536\n");
+ printf("#define L1_CODE_LINESIZE 64\n");
+ printf("#define L1_CODE_ASSOCIATIVE 4\n");
+ printf("#define L1_DATA_SIZE 65536\n");
+ printf("#define L1_DATA_LINESIZE 64\n");
+ printf("#define L1_DATA_ASSOCIATIVE 4\n");
+ printf("#define L2_SIZE 1048576\n");
+ printf("#define L2_LINESIZE 64\n");
+ printf("#define L2_ASSOCIATIVE 8\n");
+ printf("#define DTB_DEFAULT_ENTRIES 48\n");
+ printf("#define DTB_SIZE 4096\n");
+ break;
+
+ case CPU_NEOVERSEN2:
+ printf("#define %s\n", cpuname[d]);
+ printf("#define L1_CODE_SIZE 65536\n");
+ printf("#define L1_CODE_LINESIZE 64\n");
+ printf("#define L1_CODE_ASSOCIATIVE 4\n");
+ printf("#define L1_DATA_SIZE 65536\n");
+ printf("#define L1_DATA_LINESIZE 64\n");
+ printf("#define L1_DATA_ASSOCIATIVE 4\n");
+ printf("#define L2_SIZE 1048576\n");
+ printf("#define L2_LINESIZE 64\n");
+ printf("#define L2_ASSOCIATIVE 8\n");
+ printf("#define DTB_DEFAULT_ENTRIES 48\n");
+ printf("#define DTB_SIZE 4096\n");
+ break;
+
 case CPU_FALKOR:
 printf("#define FALKOR\n");
 printf("#define L1_CODE_SIZE 65536\n");
diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index 04ceaaf6d..45ea9f113 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -147,6 +147,8 @@ static char *corename[] = {
 "tsv110",
 "emag8180",
 "neoversen1",
+ "neoversev1",
+ "neoversen2",
 "thunderx3t110",
 "cortexa55",
 "unknown"
diff --git a/getarch.c b/getarch.c
index 6063a2a1d..73bbf1892 100644
--- a/getarch.c
+++ b/getarch.c
@@ -1302,12 +1302,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
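/* (Annotation, a sketch of assumed behavior rather than text from this
   patch: getarch.c is compiled and run on the build host first; when a
   FORCE_<target> macro such as FORCE_NEOVERSEV1 below is defined, it
   emits the corresponding ARCHCONFIG flags into Makefile.conf and
   config.h, which is how these -D cache parameters and -march/-mtune
   options reach the actual library build.) */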
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \ - "-march=armv8.2-a -mtune=cortex-a72" + "-march=armv8.2-a -mtune=neoverse-n1" #define LIBNAME "neoversen1" #define CORENAME "NEOVERSEN1" #else #endif +#ifdef FORCE_NEOVERSEV1 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "NEOVERSEV1" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DNEOVERSEV1 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ + "-march=armv8.4-a -mtune=neoverse-v1" +#define LIBNAME "neoversev1" +#define CORENAME "NEOVERSEV1" +#else +#endif + + +#ifdef FORCE_NEOVERSEN2 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "NEOVERSEN2" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DNEOVERSEN2 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ + "-march=armv8.5-a -mtune=neoverse-n2" +#define LIBNAME "neoversen2" +#define CORENAME "NEOVERSEN2" +#else +#endif + #ifdef FORCE_CORTEXA55 #define FORCE #define ARCHITECTURE "ARM64" diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 new file mode 100644 index 000000000..ea010db42 --- /dev/null +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -0,0 +1,189 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = 
gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = 
zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 new file mode 100644 index 000000000..ea010db42 --- /dev/null +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -0,0 +1,189 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + 
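+# Illustrative expansion of the templated rules below (a sketch assuming the
+# NEOVERSEV1 defaults set earlier in this series, SGEMM_UNROLL_M=16 and
+# SGEMM_UNROLL_N=4; these comment lines are not from the original commit):
+#   SGEMMKERNEL -> sgemm_kernel_16x4.S
+#   SGEMMITCOPY -> sgemm_tcopy_16.S            (assembly, since UNROLL_M is 16)
+#   SGEMMINCOPY -> ../generic/gemm_ncopy_16.c  (generic C, since UNROLL_M is not 4)
+# i.e. hand-written kernels are picked where they exist and the generic C
+# copy routines fill in the rest.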
+SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/param.h b/param.h index 8dd2a7461..eb4dcb8f0 100644 --- a/param.h +++ b/param.h @@ -3307,6 +3307,64 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(NEOVERSEV1) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + 
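+/* Illustrative note (assumed semantics, not part of the original patch):
+   the UNROLL_M x UNROLL_N pairs are the register tile computed by the
+   inner GEMM kernel, while the P/Q/R values below block the M/K/N loop
+   dimensions so the packed panels stay cache resident; e.g. a DGEMM block
+   of P=160 x Q=128 doubles is ~160 KB, well inside the 1 MB L2 these
+   cores are configured with elsewhere in this patch. */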
+#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#elif defined(NEOVERSEN2) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #elif defined(ARMV8SVE) || defined(A64FX) /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". From 15d4b379138b9a5b84a2fbc2d37cb47b33efdeec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 8 Jan 2022 23:48:13 +0100 Subject: [PATCH 628/681] SkylakeX: match parameters to dgemm kernels for dyn/non-dyn --- param.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/param.h b/param.h index 2dffaae3c..4155131f0 100644 --- a/param.h +++ b/param.h @@ -1669,10 +1669,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define SGEMM_DEFAULT_UNROLL_M 16 -#ifndef DYNAMIC_ARCH -#define DGEMM_DEFAULT_UNROLL_M 16 -#else +#ifdef DYNAMIC_ARCH #define DGEMM_DEFAULT_UNROLL_M 4 +#else +#define DGEMM_DEFAULT_UNROLL_M 16 #endif #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 @@ -1680,10 +1680,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 -#ifndef DYNAMIC_ARCH -#define DGEMM_DEFAULT_UNROLL_N 2 -#else +#ifdef DYNAMIC_ARCH #define DGEMM_DEFAULT_UNROLL_N 8 +#else +#define DGEMM_DEFAULT_UNROLL_N 2 #endif #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 @@ -1718,17 +1718,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
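/* (Annotation, a sketch of the assumed rationale rather than text from the
   commit: under DYNAMIC_ARCH the SkylakeX build dispatches to the common
   4x8 DGEMM microkernel, while a fixed-target build uses the 16x2 kernel,
   presumably the AVX-512 one, so the unroll factors and the matching
   P/Q/R blocking below are swapped as pairs to agree with whichever
   kernel actually gets compiled in.) */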
#else #define SGEMM_DEFAULT_P 448 +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_P 192 +#else +#define DGEMM_DEFAULT_P 384 +#endif #define CGEMM_DEFAULT_P 384 #define ZGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_Q 448 +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_Q 384 +#else +#define DGEMM_DEFAULT_Q 168 +#endif #define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 128 #define SGEMM_DEFAULT_R sgemm_r +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_R 8640 +#else +#define DGEMM_DEFAULT_R 13824 +#endif #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r From f1ac59f20057cefe4dd45122954e2403f1330835 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 8 Jan 2022 23:48:58 +0100 Subject: [PATCH 629/681] Forward DYNAMIC_ARCH option to Makefile.prebuild --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 97fdc3f91..7909f677a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -277,7 +277,7 @@ HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS) # Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf From 2573ccfb2e02abec3f537479d65b58c4d6e746f3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 8 Jan 2022 23:50:34 +0100 Subject: [PATCH 630/681] make DYNAMIC_ARCH option available to getarch_2nd/param.h --- Makefile.prebuild | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.prebuild b/Makefile.prebuild index d6395da7b..399db956f 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -3,6 +3,10 @@ export BINARY export USE_OPENMP +ifdef DYNAMIC_ARCH +override HOST_CFLAGS += -DDYNAMIC_ARCH +endif + ifdef TARGET_CORE TARGET_MAKE = Makefile_kernel.conf TARGET_CONF = config_kernel.h From be7e55880c91d626a667aff699447c3ba5ab280e Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 9 Jan 2022 19:40:04 +0100 Subject: [PATCH 631/681] sve trsm_kernel_LN --- kernel/arm64/trsm_kernel_LN_sve.c | 301 ++++++++++++++++++++++++++++++ 1 file changed, 301 insertions(+) create mode 100644 kernel/arm64/trsm_kernel_LN_sve.c diff --git a/kernel/arm64/trsm_kernel_LN_sve.c b/kernel/arm64/trsm_kernel_LN_sve.c new file mode 100644 index 000000000..8ca10036b --- /dev/null +++ b/kernel/arm64/trsm_kernel_LN_sve.c @@ -0,0 +1,301 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + 
BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + int sve_size = svcntd(); + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + i = m % sve_size; + if (i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + i = sve_size; + if (i <= m) { + aa = a + ((m & ~(sve_size - 1)) - sve_size) * k * COMPSIZE; + cc = c + ((m & ~(sve_size - 1)) - sve_size) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + sve_size * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + (kk - sve_size) * sve_size * COMPSIZE, + b + (kk - sve_size) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa -= sve_size * k * COMPSIZE; + cc -= sve_size * COMPSIZE; + kk -= sve_size; + + i += sve_size; + } while (i <= m); + } + + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + i = m % sve_size; + if (i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + i = sve_size; + if (i <= m) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, j, + aa + (kk - sve_size) * sve_size * COMPSIZE, + b + (kk - sve_size) * j * COMPSIZE, + cc, ldc); + + aa -= sve_size * k * COMPSIZE; + cc -= sve_size * COMPSIZE; + kk -= sve_size; + + i += sve_size; + } while (i <= m); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} From 098672b51b0c3a903be4be951ff60741cba43664 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 9 Jan 2022 20:11:47 +0100 Subject: [PATCH 632/681] add trsm_kernel_LT_sve --- kernel/arm64/trsm_kernel_LN_sve.c | 21 ++- kernel/arm64/trsm_kernel_LT_sve.c | 290 ++++++++++++++++++++++++++++++ 2 files changed, 307 insertions(+), 4 deletions(-) create mode 100644 kernel/arm64/trsm_kernel_LT_sve.c diff --git a/kernel/arm64/trsm_kernel_LN_sve.c b/kernel/arm64/trsm_kernel_LN_sve.c index 8ca10036b..c29c3b57a 100644 --- a/kernel/arm64/trsm_kernel_LN_sve.c +++ b/kernel/arm64/trsm_kernel_LN_sve.c @@ -47,9 +47,22 @@ static FLOAT dm1 = -1.; #define GEMM_KERNEL GEMM_KERNEL_N #endif -#if GEMM_DEFAULT_UNROLL_M == 16 -#define GEMM_UNROLL_M_SHIFT 4 +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 #endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define 
GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + #if GEMM_DEFAULT_UNROLL_N == 16 #define GEMM_UNROLL_N_SHIFT 4 #endif @@ -262,8 +275,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, i = sve_size; if (i <= m) { - aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; - cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + aa = a + ((m & ~(sve_size - 1)) - sve_size) * k * COMPSIZE; + cc = c + ((m & ~(sve_size - 1)) - sve_size) * COMPSIZE; do { if (k - kk > 0) { diff --git a/kernel/arm64/trsm_kernel_LT_sve.c b/kernel/arm64/trsm_kernel_LT_sve.c new file mode 100644 index 000000000..a35696836 --- /dev/null +++ b/kernel/arm64/trsm_kernel_LT_sve.c @@ -0,0 +1,290 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + int sve_size = svcntd(); + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + + if (kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + kk * sve_size * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + kk += sve_size; + i += sve_size; + } + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += sve_size; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + if (kk > 
0) { + GEMM_KERNEL(sve_size, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(sve_size, j, + aa + kk * sve_size * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + kk += sve_size; + i += sve_size; + } + + i = sve_size % m; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} From a9e297e4764faa53b146de1b0c3ed82e2632e42c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Jan 2022 23:31:59 +0100 Subject: [PATCH 633/681] Fix handling of ifdef/ifndef --- cmake/utils.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index c5ee65384..56c1cb060 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -125,7 +125,7 @@ macro(ParseMakefileVars MAKEFILE_IN) if (NOT "${line_match}" STREQUAL "") #message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}") set (ElseSeen 0) - if (DEFINED ${CMAKE_MATCH_2}) + if (${CMAKE_MATCH_2}) if (${CMAKE_MATCH_1} STREQUAL "ifdef") #message (STATUS "condition is true") set (IfElse 1) From e8939b3d30e090b162303fcfbec2e7479a98ca6c Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 10 Jan 2022 20:42:20 +0100 Subject: [PATCH 634/681] sve trsmRN and trsmRT --- kernel/arm64/trsm_kernel_LT_sve.c | 1 + kernel/arm64/trsm_kernel_RN_sve.c | 289 +++++++++++++++++++++++++++ kernel/arm64/trsm_kernel_RT_sve.c | 313 ++++++++++++++++++++++++++++++ 3 files changed, 603 insertions(+) create mode 100644 kernel/arm64/trsm_kernel_RN_sve.c create mode 100644 kernel/arm64/trsm_kernel_RT_sve.c diff --git a/kernel/arm64/trsm_kernel_LT_sve.c b/kernel/arm64/trsm_kernel_LT_sve.c index a35696836..7f5459702 100644 --- a/kernel/arm64/trsm_kernel_LT_sve.c +++ b/kernel/arm64/trsm_kernel_LT_sve.c @@ -37,6 +37,7 @@ /*********************************************************************/ #include "common.h" +#include "arm_sve.h" static FLOAT dm1 = -1.; diff --git a/kernel/arm64/trsm_kernel_RN_sve.c b/kernel/arm64/trsm_kernel_RN_sve.c new file mode 100644 index 000000000..2f6611c1c --- /dev/null +++ b/kernel/arm64/trsm_kernel_RN_sve.c @@ -0,0 +1,289 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + int sve_size = svcntd(); + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = sve_size; + + if (i <= m) { + do { + if (kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, 
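+ /* added note (reader's commentary): dm1 = -1.0 is the real alpha, so
+    this call computes C -= A*B over the kk already-solved columns;
+    complex builds pass ZERO below as the imaginary part of alpha. */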
+#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + kk * sve_size * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += sve_size; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(sve_size, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(sve_size, j, + aa + kk * sve_size * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/arm64/trsm_kernel_RT_sve.c b/kernel/arm64/trsm_kernel_RT_sve.c new file mode 100644 index 000000000..d93ebe7ad --- /dev/null +++ b/kernel/arm64/trsm_kernel_RT_sve.c @@ -0,0 +1,313 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + int sve_size = svcntd(); + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = sve_size; + if (i <= m) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, j, + aa + (kk - j) * sve_size * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + i = m % sve_size; 
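+ /* added note (not in the upstream patch): rows left over when m is
+    not a multiple of the SVE vector length are handled below at
+    partial height i, mirroring the full sve_size-row blocks above. */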
+ if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = sve_size; + if (i <= m) { + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + i = m % sve_size; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + From f87468ac916c7a64a9d8256bb6b81a36245f3bae Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 10 Jan 2022 21:45:37 +0100 Subject: [PATCH 635/681] trsm_lncopy_sve --- kernel/arm64/trsm_lncopy_sve.c | 114 +++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 kernel/arm64/trsm_lncopy_sve.c diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c new file mode 100644 index 000000000..d96a1f383 --- /dev/null +++ b/kernel/arm64/trsm_lncopy_sve.c @@ -0,0 +1,114 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include "arm_sve.h"
+
+#ifndef UNIT
+#define INV(a) (ONE / (a))
+#else
+#define INV(a) (ONE)
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+  BLASLONG i, ii, j, jj;
+
+  FLOAT *ao;
+
+  jj = offset;
+  int js = 0;
+#ifdef DOUBLE
+  svint64_t index = svindex_s64(0LL, lda);
+  svbool_t pn = svwhilelt_b64(js, n);
+  int n_active = svcntp_b64(svptrue_b64(), pn);
+#else
+  svint32_t index = svindex_s32(0, lda);
+  svbool_t pn = svwhilelt_b32(js, n);
+  int n_active = svcntp_b32(svptrue_b32(), pn);
+#endif
+  /* pn masks the js..n-1 column range; n_active is the number of active lanes */
+  do {
+
+    ao = a;
+
+    i = 0;
+    ii = 0;
+    do {
+
+      if (ii == jj) {
+        for (int j = 0; j < n_active; j++) {
+          for (int k = 0; k < j; k++) {
+            *(b + j * n_active + k) = *(a + k * lda + j);
+          }
+          *(b + j * n_active + j) = INV(*(a + j * lda + j));
+        }
+      }
+
+      if (ii > jj) {
+        for (int j = 0; j < n_active; j++) {
+          svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
+          svst1(pn, b, aj_vec);
+          ao++;
+        }
+
+      }
+
+      b += n_active * n_active;
+
+      i += n_active;
+      ii += n_active;
+    } while (i < m);
+
+
+    a += n_active * lda;
+    jj += n_active;
+
+    js += n_active;
+#ifdef DOUBLE
+    pn = svwhilelt_b64(js, n);
+    n_active = svcntp_b64(svptrue_b64(), pn);
+  } while (svptest_any(svptrue_b64(), pn));
+#else
+    pn = svwhilelt_b32(js, n);
+    n_active = svcntp_b32(svptrue_b32(), pn);
+  } while (svptest_any(svptrue_b32(), pn));
+#endif
+
+return 0;
+}

From 8071e179f1ba0c65da0841cc533d0f8d6b15c6ef Mon Sep 17 00:00:00 2001
From: Bine Brank
Date: Tue, 11 Jan 2022 21:16:38 +0100
Subject: [PATCH 636/681] add remaining sve trsm copy kernels

---
 kernel/arm64/trsm_ltcopy_sve.c | 114 +++++++++++++++++++++++++++++
 kernel/arm64/trsm_uncopy_sve.c | 113 ++++++++++++++++++++++++++++
 kernel/arm64/trsm_utcopy_sve.c | 114 +++++++++++++++++++++++++++++
 3 files changed, 341 insertions(+)
 create mode 100644 kernel/arm64/trsm_ltcopy_sve.c
 create mode 100644 kernel/arm64/trsm_uncopy_sve.c
 create mode 100644 kernel/arm64/trsm_utcopy_sve.c

diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c
new file mode 100644
index 000000000..9012f7fe5
--- /dev/null
+++ b/kernel/arm64/trsm_ltcopy_sve.c
@@ -0,0 +1,114 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1.
Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include "arm_sve.h"
+
+#ifndef UNIT
+#define INV(a) (ONE / (a))
+#else
+#define INV(a) (ONE)
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+  BLASLONG i, ii, j, jj;
+
+  FLOAT *ao;
+
+  jj = offset;
+  int js = 0;
+#ifdef DOUBLE
+  svint64_t index = svindex_s64(0LL, lda);
+  svbool_t pn = svwhilelt_b64(js, n);
+  int n_active = svcntp_b64(svptrue_b64(), pn);
+#else
+  svint32_t index = svindex_s32(0, lda);
+  svbool_t pn = svwhilelt_b32(js, n);
+  int n_active = svcntp_b32(svptrue_b32(), pn);
+#endif
+  do {
+
+    ao = a;
+
+    i = 0;
+    ii = 0;
+    do {
+
+      if (ii == jj) {
+        for (int j = 0; j < n_active; j++) {
+          *(b + j * n_active + j) = INV(*(a + j * lda + j));
+          for (int k = j+1; k < n_active; k++) {
+            *(b + j * n_active + k) = *(a + j * lda + k);
+          }
+        }
+      }
+
+      if (ii < jj) {
+        for (int j = 0; j < n_active; j++) {
+          svfloat64_t aj_vec = svld1(pn, ao);
+          svst1(pn, b, aj_vec);
+          ao += lda;
+        }
+
+      }
+
+      b += n_active * n_active;
+
+      i += n_active;
+      ii += n_active;
+    } while (i < m);
+
+
+    a += n_active;
+    jj += n_active;
+
+    js += n_active;
+#ifdef DOUBLE
+    pn = svwhilelt_b64(js, n);
+    n_active = svcntp_b64(svptrue_b64(), pn);
+  } while (svptest_any(svptrue_b64(), pn));
+#else
+    pn = svwhilelt_b32(js, n);
+    n_active = svcntp_b32(svptrue_b32(), pn);
+  } while (svptest_any(svptrue_b32(), pn));
+#endif
+
+return 0;
+}
diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c
new file mode 100644
index 000000000..242e99f60
--- /dev/null
+++ b/kernel/arm64/trsm_uncopy_sve.c
@@ -0,0 +1,113 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1.
Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include "arm_sve.h"
+
+#ifndef UNIT
+#define INV(a) (ONE / (a))
+#else
+#define INV(a) (ONE)
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+  BLASLONG i, ii, jj;
+
+  FLOAT *ao;
+
+  jj = offset;
+  int js = 0;
+#ifdef DOUBLE
+  svint64_t index = svindex_s64(0LL, lda);
+  svbool_t pn = svwhilelt_b64(js, n);
+  int n_active = svcntp_b64(svptrue_b64(), pn);
+#else
+  svint32_t index = svindex_s32(0, lda);
+  svbool_t pn = svwhilelt_b32(js, n);
+  int n_active = svcntp_b32(svptrue_b32(), pn);
+#endif
+  do {
+
+    ao = a;
+
+    i = 0;
+    ii = 0;
+    do {
+
+      if (ii == jj) {
+        for (int j = 0; j < n_active; j++) {
+          *(b + j * n_active + j) = INV(*(a + j * lda + j));
+          for (int k = j+1; k < n_active; k++) {
+            *(b + j * n_active + k) = *(a + k * lda + j);
+          }
+        }
+      }
+
+      if (ii < jj) {
+        for (int j = 0; j < n_active; j++) {
+          svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
+          svst1(pn, b, aj_vec);
+          ao++;
+        }
+      }
+
+      b += n_active * n_active;
+
+      i += n_active;
+      ii += n_active;
+    } while (i < m);
+
+
+    a += n_active * lda;
+    jj += n_active;
+
+    js += n_active;
+#ifdef DOUBLE
+    pn = svwhilelt_b64(js, n);
+    n_active = svcntp_b64(svptrue_b64(), pn);
+  } while (svptest_any(svptrue_b64(), pn));
+#else
+    pn = svwhilelt_b32(js, n);
+    n_active = svcntp_b32(svptrue_b32(), pn);
+  } while (svptest_any(svptrue_b32(), pn));
+#endif
+
+return 0;
+}
diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c
new file mode 100644
index 000000000..9eefb8c18
--- /dev/null
+++ b/kernel/arm64/trsm_utcopy_sve.c
@@ -0,0 +1,114 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved.
*/
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include "arm_sve.h"
+
+#ifndef UNIT
+#define INV(a) (ONE / (a))
+#else
+#define INV(a) (ONE)
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+  BLASLONG i, ii, j, jj;
+
+  FLOAT *ao;
+
+  jj = offset;
+  int js = 0;
+#ifdef DOUBLE
+  svint64_t index = svindex_s64(0LL, lda);
+  svbool_t pn = svwhilelt_b64(js, n);
+  int n_active = svcntp_b64(svptrue_b64(), pn);
+#else
+  svint32_t index = svindex_s32(0, lda);
+  svbool_t pn = svwhilelt_b32(js, n);
+  int n_active = svcntp_b32(svptrue_b32(), pn);
+#endif
+  do {
+
+    ao = a;
+
+    i = 0;
+    ii = 0;
+    do {
+
+      if (ii == jj) {
+        for (int j = 0; j < n_active; j++) {
+          for (int k = 0; k < j; k++) {
+            *(b + j * n_active + k) = *(a + j * lda + k);
+          }
+          *(b + j * n_active + j) = INV(*(a + j * lda + j));
+        }
+      }
+
+      if (ii > jj) {
+        for (int j = 0; j < n_active; j++) {
+          svfloat64_t aj_vec = svld1(pn, ao);
+          svst1(pn, b, aj_vec);
+          ao += lda;
+        }
+
+      }
+
+      b += n_active * n_active;
+
+      i += n_active;
+      ii += n_active;
+    } while (i < m);
+
+
+    a += n_active;
+    jj += n_active;
+
+    js += n_active;
+#ifdef DOUBLE
+    pn = svwhilelt_b64(js, n);
+    n_active = svcntp_b64(svptrue_b64(), pn);
+  } while (svptest_any(svptrue_b64(), pn));
+#else
+    pn = svwhilelt_b32(js, n);
+    n_active = svcntp_b32(svptrue_b32(), pn);
+  } while (svptest_any(svptrue_b32(), pn));
+#endif
+
+return 0;
+}

From f1315288a8d9f4e06da7b7ccb9a37f04ded95c5f Mon Sep 17 00:00:00 2001
From: Bine Brank
Date: Sat, 15 Jan 2022 21:02:14 +0100
Subject: [PATCH 637/681] fix sve dtrsm kernels

---
 kernel/arm64/trsm_kernel_LN_sve.c | 20 ++++++++++--------
 kernel/arm64/trsm_kernel_LT_sve.c | 2 +-
 kernel/arm64/trsm_kernel_RT_sve.c | 12 +++++------
 kernel/arm64/trsm_lncopy_sve.c | 30
+++++++++++++-------------- kernel/arm64/trsm_ltcopy_sve.c | 32 ++++++++++++++--------------- kernel/arm64/trsm_uncopy_sve.c | 29 +++++++++++++------------- kernel/arm64/trsm_utcopy_sve.c | 34 +++++++++++++++---------------- 7 files changed, 79 insertions(+), 80 deletions(-) diff --git a/kernel/arm64/trsm_kernel_LN_sve.c b/kernel/arm64/trsm_kernel_LN_sve.c index c29c3b57a..57f79ac3a 100644 --- a/kernel/arm64/trsm_kernel_LN_sve.c +++ b/kernel/arm64/trsm_kernel_LN_sve.c @@ -182,8 +182,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, i = m % sve_size; if (i) { - aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; - cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; if (k - kk > 0) { GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, @@ -205,10 +205,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, } + int mod = i; i = sve_size; if (i <= m) { - aa = a + ((m & ~(sve_size - 1)) - sve_size) * k * COMPSIZE; - cc = c + ((m & ~(sve_size - 1)) - sve_size) * COMPSIZE; + aa = a + (m - mod - sve_size) * k * COMPSIZE; + cc = c + (m - mod - sve_size) * COMPSIZE; do { if (k - kk > 0) { @@ -217,7 +218,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, ZERO, #endif aa + sve_size * kk * COMPSIZE, - b + sve_size * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } @@ -251,8 +252,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, i = m % sve_size; if (i) { - aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; - cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; if (k - kk > 0) { GEMM_KERNEL(i, j, k - kk, dm1, @@ -273,10 +274,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, } + int mod = i; i = sve_size; if (i <= m) { - aa = a + ((m & ~(sve_size - 1)) - sve_size) * k * COMPSIZE; - cc = c + ((m & ~(sve_size - 1)) - sve_size) * COMPSIZE; + aa = a + (m - mod - sve_size) * k * COMPSIZE; + cc = c + (m - mod - sve_size) * COMPSIZE; do { if (k - kk > 0) { diff --git a/kernel/arm64/trsm_kernel_LT_sve.c b/kernel/arm64/trsm_kernel_LT_sve.c index 7f5459702..8c6a57a6d 100644 --- a/kernel/arm64/trsm_kernel_LT_sve.c +++ b/kernel/arm64/trsm_kernel_LT_sve.c @@ -257,7 +257,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, i += sve_size; } - i = sve_size % m; + i = m % sve_size; if (i) { if (kk > 0) { GEMM_KERNEL(i, j, kk, dm1, diff --git a/kernel/arm64/trsm_kernel_RT_sve.c b/kernel/arm64/trsm_kernel_RT_sve.c index d93ebe7ad..efafc9d11 100644 --- a/kernel/arm64/trsm_kernel_RT_sve.c +++ b/kernel/arm64/trsm_kernel_RT_sve.c @@ -258,23 +258,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, if (i <= m) { do { if (k - kk > 0) { - GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, #ifdef COMPLEX ZERO, #endif - aa + GEMM_UNROLL_M * kk * COMPSIZE, + aa + sve_size * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc); } - solve(GEMM_UNROLL_M, GEMM_UNROLL_N, - aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + solve(sve_size, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * sve_size * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); - aa += GEMM_UNROLL_M * k * COMPSIZE; - cc += GEMM_UNROLL_M * COMPSIZE; + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; i += sve_size; } while (i <= m); } diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c index d96a1f383..7f480dcad 100644 --- a/kernel/arm64/trsm_lncopy_sve.c 
+++ b/kernel/arm64/trsm_lncopy_sve.c @@ -48,17 +48,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ - BLASLONG i, ii, j, jj; + BLASLONG i, ii, jj; FLOAT *ao; jj = offset; - int js = 0; #ifdef DOUBLE + int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t js = 0; svint32_t index = svindex_s32(0, lda); svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); @@ -74,25 +75,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT if (ii == jj) { for (int j = 0; j < n_active; j++) { for (int k = 0; k < j; k++) { - *(b + j * n_active + k) = *(a + k * lda + j); + *(b + j * n_active + k) = *(ao + k * lda + j); } - *(b + j * n_active + j) = INV(*(a + j * lda + j)); + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); } - } - - if (ii > jj) { - for (int j = 0; j < n_active; j++) { + ao += n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); svst1(pn, b, aj_vec); - ao++; } - + ao++; + b += n_active; + i++; + ii++; } - - b += n_active * n_active; - - i += n_active; - ii += n_active; } while (i < m); diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c index 9012f7fe5..d7b2a4e8d 100644 --- a/kernel/arm64/trsm_ltcopy_sve.c +++ b/kernel/arm64/trsm_ltcopy_sve.c @@ -48,18 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ - BLASLONG i, ii, j, jj; + BLASLONG i, ii, jj; FLOAT *ao; jj = offset; - int js = 0; #ifdef DOUBLE - svint64_t index = svindex_s64(0LL, lda); + int64_t js = 0; svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svint32_t index = svindex_s32(0, lda); + int32_t js = 0; svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif @@ -73,26 +72,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT if (ii == jj) { for (int j = 0; j < n_active; j++) { - *(b + j * n_active + j) = INV(*(a + j * lda + j)); + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); for (int k = j+1; k < n_active; k++) { - *(b + j * n_active + k) = *(a + j * lda + k); + *(b + j * n_active + k) = *(ao + j * lda + k); } } - } - - if (ii < jj) { - for (int j = 0; j < n_active; j++) { + b += n_active * n_active; + ao += lda * n_active; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { svfloat64_t aj_vec = svld1(pn, ao); svst1(pn, b, aj_vec); - ao += lda; } - + ao += lda; + b += n_active; + i ++; + ii ++; } - - b += n_active * n_active; - - i += n_active; - ii += n_active; } while (i < m); diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c index 242e99f60..b2851452b 100644 --- a/kernel/arm64/trsm_uncopy_sve.c +++ b/kernel/arm64/trsm_uncopy_sve.c @@ -48,17 +48,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ - BLASLONG i, ii, j, jj; + BLASLONG i, ii, jj; FLOAT *ao; jj = offset; - int js = 0; #ifdef DOUBLE + int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t js = 0; svint32_t index = svindex_s32(0, lda); svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); @@ -73,25 +74,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, 
FLOAT if (ii == jj) { for (int j = 0; j < n_active; j++) { - *(b + j * n_active + j) = INV(*(a + j * lda + j)); + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); for (int k = j+1; k < n_active; k++) { - *(b + j * n_active + k) = *(a + k * lda + j); + *(b + j * n_active + k) = *(ao + k * lda + j); } } - } - - if (ii < jj) { - for (int j = 0; j < n_active; j++) { + ao += n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); svst1(pn, b, aj_vec); - ao++; } + ao++; + b += n_active; + i++; + ii++; } - - b += n_active * n_active; - - i += n_active; - ii += n_active; } while (i < m); diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c index 9eefb8c18..558955801 100644 --- a/kernel/arm64/trsm_utcopy_sve.c +++ b/kernel/arm64/trsm_utcopy_sve.c @@ -48,18 +48,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ - BLASLONG i, ii, j, jj; + BLASLONG i, ii, jj; FLOAT *ao; jj = offset; - int js = 0; #ifdef DOUBLE - svint64_t index = svindex_s64(0LL, lda); + int64_t js = 0; svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svint32_t index = svindex_s32(0, lda); + int32_t js = 0; svbool_t pn = svwhilelt_b32(js, n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif @@ -74,25 +73,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT if (ii == jj) { for (int j = 0; j < n_active; j++) { for (int k = 0; k < j; k++) { - *(b + j * n_active + k) = *(a + j * lda + k); + *(b + j * n_active + k) = *(ao + j * lda + k); } - *(b + j * n_active + j) = INV(*(a + j * lda + j)); + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); } - } - - if (ii > jj) { - for (int j = 0; j < n_active; j++) { + ao += lda * n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { svfloat64_t aj_vec = svld1(pn, ao); svst1(pn, b, aj_vec); - ao += lda; } - - } - - b += n_active * n_active; - - i += n_active; - ii += n_active; + ao += lda; + b += n_active; + i ++; + ii ++; + } } while (i < m); From f1315288a8d9f4e06da7b7ccb9a37f04ded95c5f Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sat, 15 Jan 2022 22:27:25 +0100 Subject: [PATCH 638/681] add sve ztrsm --- kernel/arm64/KERNEL.A64FX | 49 +++++++----- kernel/arm64/trsm_kernel_LN_sve.c | 4 + kernel/arm64/trsm_kernel_LT_sve.c | 4 + kernel/arm64/trsm_kernel_RN_sve.c | 4 + kernel/arm64/trsm_kernel_RT_sve.c | 4 + kernel/arm64/trsm_lncopy_sve.c | 9 ++- kernel/arm64/trsm_ltcopy_sve.c | 9 ++- kernel/arm64/trsm_uncopy_sve.c | 9 ++- kernel/arm64/trsm_utcopy_sve.c | 9 ++- kernel/arm64/ztrsm_lncopy_sve.c | 119 ++++++++++++++++++++++++++++++ kernel/arm64/ztrsm_ltcopy_sve.c | 115 +++++++++++++++++++++++++++++ kernel/arm64/ztrsm_uncopy_sve.c | 119 ++++++++++++++++++++++++++++++ kernel/arm64/ztrsm_utcopy_sve.c | 115 +++++++++++++++++++++++++++++ 13 files changed, 542 insertions(+), 27 deletions(-) create mode 100644 kernel/arm64/ztrsm_lncopy_sve.c create mode 100644 kernel/arm64/ztrsm_ltcopy_sve.c create mode 100644 kernel/arm64/ztrsm_uncopy_sve.c create mode 100644 kernel/arm64/ztrsm_utcopy_sve.c diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index d74f0592d..bd25f7cd8 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -20,25 +20,36 @@ IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = 
../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c + +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c + SAMAXKERNEL = amax.S DAMAXKERNEL = amax.S diff --git a/kernel/arm64/trsm_kernel_LN_sve.c b/kernel/arm64/trsm_kernel_LN_sve.c index 57f79ac3a..fa1c6e984 100644 --- a/kernel/arm64/trsm_kernel_LN_sve.c +++ b/kernel/arm64/trsm_kernel_LN_sve.c @@ -167,7 +167,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; +#ifdef DOUBLE int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif #if 0 fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", diff --git a/kernel/arm64/trsm_kernel_LT_sve.c b/kernel/arm64/trsm_kernel_LT_sve.c index 8c6a57a6d..2cbb2aafb 100644 --- a/kernel/arm64/trsm_kernel_LT_sve.c +++ b/kernel/arm64/trsm_kernel_LT_sve.c @@ -157,7 +157,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *aa, *cc; BLASLONG kk; BLASLONG i, j, jj; +#ifdef DOUBLE int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif #if 0 fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", diff --git a/kernel/arm64/trsm_kernel_RN_sve.c b/kernel/arm64/trsm_kernel_RN_sve.c index 2f6611c1c..5e4e8d9b1 100644 --- a/kernel/arm64/trsm_kernel_RN_sve.c +++ b/kernel/arm64/trsm_kernel_RN_sve.c @@ -157,7 +157,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *aa, *cc; BLASLONG kk; BLASLONG i, j, jj; +#ifdef DOUBLE int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif #if 0 fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", diff --git a/kernel/arm64/trsm_kernel_RT_sve.c b/kernel/arm64/trsm_kernel_RT_sve.c index efafc9d11..c376c0e33 100644 --- a/kernel/arm64/trsm_kernel_RT_sve.c +++ b/kernel/arm64/trsm_kernel_RT_sve.c @@ -169,7 +169,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, BLASLONG i, j; FLOAT *aa, *cc; BLASLONG kk; +#ifdef DOUBLE int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif #if 0 fprintf(stderr, "TRSM RT KERNEL m = 
%3ld n = %3ld k = %3ld offset = %3ld\n", diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c index 7f480dcad..5a9d4194a 100644 --- a/kernel/arm64/trsm_lncopy_sve.c +++ b/kernel/arm64/trsm_lncopy_sve.c @@ -59,9 +59,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32(js, N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -85,7 +86,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii += n_active; } else { if (ii > jj) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif svst1(pn, b, aj_vec); } ao++; @@ -105,7 +110,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32(js, N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c index d7b2a4e8d..ac4019e26 100644 --- a/kernel/arm64/trsm_ltcopy_sve.c +++ b/kernel/arm64/trsm_ltcopy_sve.c @@ -58,8 +58,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32(js, N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -83,7 +84,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii += n_active; } else { if (ii < jj) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif svst1(pn, b, aj_vec); } ao += lda; @@ -103,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32(js, N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c index b2851452b..8fdcd0f4b 100644 --- a/kernel/arm64/trsm_uncopy_sve.c +++ b/kernel/arm64/trsm_uncopy_sve.c @@ -59,9 +59,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32(js, N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -85,7 +86,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii += n_active; } else { if (ii < jj) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif svst1(pn, b, aj_vec); } ao++; @@ -105,7 +110,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = 
svwhilelt_b32(js, N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c index 558955801..0f5f0dccd 100644 --- a/kernel/arm64/trsm_utcopy_sve.c +++ b/kernel/arm64/trsm_utcopy_sve.c @@ -58,8 +58,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT svbool_t pn = svwhilelt_b64(js, n); int n_active = svcntp_b64(svptrue_b64(), pn); #else + int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32(js, N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -83,7 +84,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT ii += n_active; } else { if (ii > jj) { +#ifdef DOUBLE svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif svst1(pn, b, aj_vec); } ao += lda; @@ -103,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32(js, N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_lncopy_sve.c b/kernel/arm64/ztrsm_lncopy_sve.c new file mode 100644 index 000000000..eb7cd0294 --- /dev/null +++ b/kernel/arm64/ztrsm_lncopy_sve.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); + *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); + } + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += n_active * 2; + b += n_active * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + } + ao += 2; + b += n_active * 2; + i++; + ii++; + } + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/ztrsm_ltcopy_sve.c b/kernel/arm64/ztrsm_ltcopy_sve.c new file mode 100644 index 000000000..27cd1a941 --- /dev/null +++ b/kernel/arm64/ztrsm_ltcopy_sve.c @@ -0,0 +1,115 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
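The off-diagonal path of ztrsm_lncopy_sve.c above moves one predicate's worth of a complex column with three intrinsics: two gathers that pick up the real parts at ao and the imaginary parts at ao+1, both strided by the already-doubled lda, and one interleaving store. Isolated into a sketch (single precision; the predicate and the prior lda *= 2 scaling are taken as given, exactly as in the kernel):

    #include <arm_sve.h>

    void copy_complex_column(const float *ao, float *b, int32_t lda2, svbool_t pn)
    {
        svint32_t index = svindex_s32(0, lda2);                  /* 0, lda2, 2*lda2, ... */
        svfloat32_t re = svld1_gather_index(pn, ao, index);      /* real parts      */
        svfloat32_t im = svld1_gather_index(pn, ao + 1, index);  /* imaginary parts */
        svst2(pn, b, svcreate2(re, im));                         /* re, im, re, im, ... */
    }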
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); + *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); + } + } + b += n_active * n_active * 2; + ao += lda * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + } + ao += lda; + b += n_active * 2; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active * 2; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/ztrsm_uncopy_sve.c b/kernel/arm64/ztrsm_uncopy_sve.c new file mode 100644 index 000000000..92e086b75 --- /dev/null +++ b/kernel/arm64/ztrsm_uncopy_sve.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
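On the diagonal these kernels do not copy the element verbatim: compinv, which comes from OpenBLAS's trsm support macros rather than from this patch, stores the inverse of the complex diagonal entry so the solve kernel can multiply instead of divide; the commented-out INV() line is the real-valued analogue it replaces. What that amounts to for a value ar + i*ai, as an unscaled illustration (the actual macro may order or guard the arithmetic differently):

    static void compinv_sketch(double *b, double ar, double ai)
    {
        double d = ar * ar + ai * ai;   /* |a|^2 */
        b[0] =  ar / d;                 /* Re(1/a) */
        b[1] = -ai / d;                 /* Im(1/a) */
    }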
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); + *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); + } + } + ao += n_active * 2; + b += n_active * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + } + ao += 2; + b += n_active * 2; + i++; + ii++; + } + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/ztrsm_utcopy_sve.c b/kernel/arm64/ztrsm_utcopy_sve.c new file mode 100644 index 000000000..d82a9d0c8 --- /dev/null +++ b/kernel/arm64/ztrsm_utcopy_sve.c @@ -0,0 +1,115 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
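For the diagonal (ii == jj) block the kernels above drop back to scalar code: the n_active x n_active corner is packed densely, block element (j,k) landing at b[2*(j*n_active + k)] with its imaginary part immediately behind. Stripped of the compinv call, the indexing of the uncopy variant reduces to this (lda2 is the doubled leading dimension, as inside the kernel):

    void pack_diag_sketch(const double *ao, double *b, int lda2, int n_active)
    {
        for (int j = 0; j < n_active; j++)
            for (int k = j + 1; k < n_active; k++) {               /* strict upper part */
                b[2*(j*n_active + k)]     = ao[k*lda2 + 2*j];      /* real part */
                b[2*(j*n_active + k) + 1] = ao[k*lda2 + 2*j + 1];  /* imaginary part */
            }
    }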
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); + *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); + } + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += lda * n_active * 2; + b += n_active * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + } + ao += lda; + b += n_active * 2; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active * 2; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} From 0fb6cc07bf9fdf0cbe7a7595e82379a0040d9e9a Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 16 Jan 2022 21:39:57 +0100 Subject: [PATCH 639/681] fix ztrsm lt/ut copy --- kernel/arm64/ztrsm_ltcopy_sve.c | 2 +-
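The fix recorded in the two hunks below is pure index arithmetic: these kernels double lda once up front (lda *= 2), so stepping past an n_active-row diagonal block takes n_active * lda floats, and the extra * 2 advanced twice that far, corrupting every block after the first. With illustrative numbers:

    static long diag_block_step(long lda2, long n_active)
    {
        return lda2 * n_active;   /* corrected form; lda2 already holds the *2 */
    }
    /* diag_block_step(16, 4) == 64 floats, i.e. 4 rows of 8 complex elements;
       the old `* 2` skipped 128 floats, landing 4 rows past the block. */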
kernel/arm64/ztrsm_utcopy_sve.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/ztrsm_ltcopy_sve.c b/kernel/arm64/ztrsm_ltcopy_sve.c index 27cd1a941..34dbf8a30 100644 --- a/kernel/arm64/ztrsm_ltcopy_sve.c +++ b/kernel/arm64/ztrsm_ltcopy_sve.c @@ -77,7 +77,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT } } b += n_active * n_active * 2; - ao += lda * n_active * 2; + ao += lda * n_active; i += n_active; ii += n_active; } else { diff --git a/kernel/arm64/ztrsm_utcopy_sve.c b/kernel/arm64/ztrsm_utcopy_sve.c index d82a9d0c8..ccb942e1b 100644 --- a/kernel/arm64/ztrsm_utcopy_sve.c +++ b/kernel/arm64/ztrsm_utcopy_sve.c @@ -76,7 +76,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); } - ao += lda * n_active * 2; + ao += lda * n_active; b += n_active * n_active * 2; i += n_active; ii += n_active; From b6a445cfd88ab0bfa1687aeba7cc2d6705497f77 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 16 Jan 2022 21:40:56 +0100 Subject: [PATCH 640/681] adapt Makefile for SVE trsm --- kernel/Makefile.L3 | 128 +++++++++++++++++++++++++++++++++++++++++++++ param.h | 4 +- 2 files changed, 130 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 2a10ac980..2d5740183 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -2391,29 +2391,61 @@ $(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNR $(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ +ifdef TRSMCOPYUN_M +$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLN_M +$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYUT_M +$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLT_M +$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2439,29 +2471,61 @@ $(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N $(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef TRSMCOPYUN_M +$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLN_M +$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYUT_M +$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : 
generic/trsm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLT_M +$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2535,29 +2599,61 @@ $(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N $(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRSMCOPYUN_M +$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLN_M +$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYUT_M +$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ 
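Each ifdef pair above swaps in an architecture-specific source without touching anything else, which is safe because every triangular copy routine, generic or SVE, is compiled behind one and the same entry point; CNAME and FLOAT are resolved by the -D flags on those compile lines. The shared interface, as it appears verbatim in the kernels earlier in this series:

    int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
              BLASLONG offset, FLOAT *b);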
$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLT_M +$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2583,29 +2679,61 @@ $(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_ $(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRSMCOPYUN_M +$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLN_M +$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYUT_M +$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) 
-DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLT_M +$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ diff --git a/param.h b/param.h index 5d46991a2..ab6eab6eb 100644 --- a/param.h +++ b/param.h @@ -3327,11 +3327,11 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_MN 32 +#define CGEMM_DEFAULT_UNROLL_MN 16 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_MN 32 +#define ZGEMM_DEFAULT_UNROLL_MN 16 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 160 From 1b49ef8dcf6b01aecef30f804654d0efc97bc37a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:05:33 +0100 Subject: [PATCH 641/681] Fix pivot index for negative increments --- lapack/laswp/generic/laswp_k_1.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/laswp_k_1.c b/lapack/laswp/generic/laswp_k_1.c index 88648cf29..556889291 100644 --- a/lapack/laswp/generic/laswp_k_1.c +++ b/lapack/laswp/generic/laswp_k_1.c @@ -57,10 +57,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From 0e9e9513067665ec0a505ed935c89752a60dbb81 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:06:41 +0100 Subject: [PATCH 642/681] Fix pivot offset calculation for negative incx --- lapack/laswp/generic/laswp_k_2.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/laswp_k_2.c b/lapack/laswp/generic/laswp_k_2.c index 93b9a2c01..f76cd078f 100644 --- a/lapack/laswp/generic/laswp_k_2.c +++ b/lapack/laswp/generic/laswp_k_2.c @@ -59,10 +59,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From eca2f50b48a9941e2f3d2cd75fb699ace070f9cc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:07:33 +0100 Subject: [PATCH 643/681] Fix pivot offset calculation for negative incx --- lapack/laswp/generic/laswp_k_4.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/laswp_k_4.c b/lapack/laswp/generic/laswp_k_4.c index 191a229a9..6520ed799 100644 --- a/lapack/laswp/generic/laswp_k_4.c +++ b/lapack/laswp/generic/laswp_k_4.c @@ -65,10 +65,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT 
*a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From afa0cece5cbca7ce9c749b3101ac36b15518508e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:08:20 +0100 Subject: [PATCH 644/681] Fix pivot offset calculation for negative incx --- lapack/laswp/generic/laswp_k_8.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/laswp_k_8.c b/lapack/laswp/generic/laswp_k_8.c index 947941839..a7bf06817 100644 --- a/lapack/laswp/generic/laswp_k_8.c +++ b/lapack/laswp/generic/laswp_k_8.c @@ -78,10 +78,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From 3b6293f5a0e4371d81074ee0ebc19d173ca696ed Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:09:14 +0100 Subject: [PATCH 645/681] Fix offset calculation for negative incx --- lapack/laswp/generic/zlaswp_k_1.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/zlaswp_k_1.c b/lapack/laswp/generic/zlaswp_k_1.c index d1204778a..42aaed528 100644 --- a/lapack/laswp/generic/zlaswp_k_1.c +++ b/lapack/laswp/generic/zlaswp_k_1.c @@ -59,10 +59,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, lda *= 2; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From 57e2a72f40aaf008c48b7f0ec6e5216aee1499c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:10:21 +0100 Subject: [PATCH 646/681] Fix pivot offset calculation for negative incx --- lapack/laswp/generic/zlaswp_k_2.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/zlaswp_k_2.c b/lapack/laswp/generic/zlaswp_k_2.c index c18ab4bee..1220870f8 100644 --- a/lapack/laswp/generic/zlaswp_k_2.c +++ b/lapack/laswp/generic/zlaswp_k_2.c @@ -60,10 +60,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, lda *= 2; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From 40003f8edb9e5c529c1c12589f9e1b53f9ac8f2d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Jan 2022 00:11:18 +0100 Subject: [PATCH 647/681] Fix pivot offset calculation for negative incx --- lapack/laswp/generic/zlaswp_k_4.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lapack/laswp/generic/zlaswp_k_4.c b/lapack/laswp/generic/zlaswp_k_4.c index 45e1bf01e..cc7e296e1 100644 --- a/lapack/laswp/generic/zlaswp_k_4.c +++ b/lapack/laswp/generic/zlaswp_k_4.c @@ -69,10 +69,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, lda *= 2; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; From f158d59087c518fa924023d62a00eac176678dae Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Mon, 17 Jan 2022 22:36:48 +0100 Subject: [PATCH 648/681] adapt CMake --- kernel/CMakeLists.txt | 56 ++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 717c1ea72..8aa6728d5 100644 --- a/kernel/CMakeLists.txt +++ 
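All six laswp patches above make the same one-line change, and its effect is easiest to check as arithmetic on the starting offset into ipiv. The layout assumption in this sketch (the entry for row r sits (r-1)*|incx| into the array, mirroring the positive-stride path) is mine, not spelled out in the patches; k1 is already 0-based, matching the k1-- in the kernels:

    #include <assert.h>

    static int start_new(int k1, int k2, int incx) { return k1 - (k2 - k1 - 1) * incx; }
    static int start_old(int k1, int k2, int incx) { return    - (k2 - 1)      * incx; }

    int main(void)
    {
        assert(start_new(1, 5, -1) == start_old(1, 5, -1)); /* |incx| == 1: both agree */
        assert(start_new(0, 5, -2) == start_old(0, 5, -2)); /* k1 == 0:     both agree */
        assert(start_new(1, 5, -2) == 7);                   /* corrected offset        */
        assert(start_old(1, 5, -2) == 8);                   /* old form overshot by    */
        return 0;                                           /* k1 * (|incx| - 1)       */
    }

The two forms coincide exactly when k1 is zero or |incx| is one, which is presumably why the bug stayed hidden until a partial row range met a non-unit negative increment.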
b/kernel/CMakeLists.txt @@ -381,23 +381,35 @@ endif () GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) + +if (NOT DEFINED ZTRSMCOPYLN_M) + set(ZTRSMUNCOPY_M "generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMLNCOPY_M "generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMUTCOPY_M "generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMLTCOPY_M "generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(ZTRSMUNCOPY_M "${KERNELDIR}/${ZTRSMCOPYUN_M}") + set(ZTRSMLNCOPY_M "${KERNELDIR}/${ZTRSMCOPYLN_M}") + set(ZTRSMUTCOPY_M "${KERNELDIR}/${ZTRSMCOPYUT_M}") + set(ZTRSMLTCOPY_M "${KERNELDIR}/${ZTRSMCOPYLT_M}") +endif () + GenerateNamedObjects(${ZTRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER" "trsm_iltncopy" 
false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) @@ -491,23 +503,35 @@ endif () GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) + +if (NOT DEFINED TRSMCOPYLN_M) + set(TRSMUNCOPY_M "generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMLNCOPY_M "generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMUTCOPY_M "generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMLTCOPY_M "generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(TRSMUNCOPY_M "${KERNELDIR}/${TRSMCOPYUN_M}") + set(TRSMLNCOPY_M "${KERNELDIR}/${TRSMCOPYLN_M}") + set(TRSMUTCOPY_M "${KERNELDIR}/${TRSMCOPYUT_M}") + set(TRSMLTCOPY_M "${KERNELDIR}/${TRSMCOPYLT_M}") +endif () + GenerateNamedObjects(${TRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) - 
GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) From 19d435b1b3a5d0d5719189ba29b13e728a2bb41c Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Tue, 18 Jan 2022 08:28:31 +0100 Subject: [PATCH 649/681] update armv8sve + contributors --- CONTRIBUTORS.md | 1 + kernel/arm64/KERNEL.ARMV8SVE | 53 ++++++++++++++++++++++-------------- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 879aaebe3..5378c79bf 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -203,3 +203,4 @@ In chronological order: * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions + * [2022-01-18] SVE kernels and copy functions for TRSM diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 66de642a5..bd25f7cd8 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -20,25 +20,36 @@ IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c + +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c + SAMAXKERNEL = amax.S DAMAXKERNEL = amax.S @@ -140,8 +151,8 @@ DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S DGEMMINCOPY = dgemm_ncopy_sve_v1.c DGEMMITCOPY = dgemm_tcopy_sve_v1.c 
-DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) From 00f44bfff74e7173a881c4d6849deb75b9dfbd6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Fri, 21 Jan 2022 13:27:17 +0100 Subject: [PATCH 650/681] cmake: Check if Fortran compiler is usable before enabling it. --- cmake/f_check.cmake | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index 0f5d0e15d..14683ed21 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -20,19 +20,16 @@ # NEEDBUNDERSCORE # NEED2UNDERSCORES -if (NOT NO_LAPACK) - include(CheckLanguage) - check_language(Fortran) - if(CMAKE_Fortran_COMPILER) - enable_language(Fortran) - else() - message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") +include(CheckLanguage) +check_language(Fortran) +if(CMAKE_Fortran_COMPILER) + enable_language(Fortran) +else() + if (NOT NO_LAPACK) + message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") + endif() set (NOFORTRAN 1) set (NO_LAPACK 1) - endif() -else() - include(CMakeForceCompiler) - CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) endif() if (NOT ONLY_CBLAS) From 1937b4e435cce48dbf8d7d124800e03e1ba5d30d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 18:27:38 +0100 Subject: [PATCH 651/681] Add Elbrus e2k architecture detection --- c_check | 7 +++++++ ctest.c | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/c_check b/c_check index 030f5e632..999f5a7a7 100644 --- a/c_check +++ b/c_check @@ -84,6 +84,7 @@ $os = Haiku if ($data =~ /OS_HAIKU/); $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = e2k if ($data =~ /ARCH_E2K/); $architecture = power if ($data =~ /ARCH_POWER/); $architecture = mips if ($data =~ /ARCH_MIPS/); $architecture = mips64 if ($data =~ /ARCH_MIPS64/); @@ -124,6 +125,11 @@ if ($architecture eq "zarch") { $binary = 64; } +if ($architecture eq "e2k") { + $defined = 1; + $binary = 64; +} + if ($architecture eq "alpha") { $defined = 1; $binary = 64; @@ -223,6 +229,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = e2k if ($data =~ /ARCH_E2K/); $architecture = power if ($data =~ /ARCH_POWER/); $architecture = mips if ($data =~ /ARCH_MIPS/); $architecture = mips64 if ($data =~ /ARCH_MIPS64/); diff --git a/ctest.c b/ctest.c index 2afd93f68..fc52b43a6 100644 --- a/ctest.c +++ b/ctest.c @@ -165,3 +165,7 @@ ARCH_LOONGARCH64 HAVE_C11 #endif +#if defined(__e2k__) +ARCH_E2K +#endif + From bc93f468ef98c7bb76bdcaf779e9dbe7231303b4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 18:53:38 +0100 Subject: [PATCH 652/681] Add Elbrus E2000 architecture as generic x86_64 compatible --- Makefile.e2k | 1 + TargetList.txt | 4 ++++ common.h | 4 ++++ common_e2k.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++ common_macro.h | 2 +- getarch.c | 11 +++++++++ 6 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 Makefile.e2k create mode 100644 common_e2k.h diff --git a/Makefile.e2k b/Makefile.e2k new file mode 100644 index 000000000..a5e50b1f0 --- /dev/null +++ b/Makefile.e2k @@ -0,0 
+1 @@ +COPT = -Wall -O2 # -DGEMMTEST diff --git a/TargetList.txt b/TargetList.txt index 97c8a8f06..a5a07a661 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -115,3 +115,7 @@ C910V 11.LOONGARCH64: LOONGSON3R5 + +12. Elbrus E2000: +E2K + diff --git a/common.h b/common.h index ff5254a5c..00d1d0baf 100644 --- a/common.h +++ b/common.h @@ -474,6 +474,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_loongarch64.h" #endif +#ifdef ARCH_E2K +#include "common_e2k.h" +#endif + #ifndef ASSEMBLER #ifdef OS_WINDOWSSTORE typedef char env_var_t[MAX_PATH]; diff --git a/common_e2k.h b/common_e2k.h new file mode 100644 index 000000000..0739c9473 --- /dev/null +++ b/common_e2k.h @@ -0,0 +1,64 @@ +/***************************************************************************** +Copyright (c) 2011-2016, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
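c_check never runs anything on the target machine: it essentially preprocesses ctest.c and pattern-matches whichever ARCH_* token survives, which is why the entire detection for the new port is the three-line __e2k__ block added above. The same probe can be exercised standalone (__e2k__ is the compiler-predefined macro the patch relies on; the x86_64 branch is only for contrast):

    #include <stdio.h>

    int main(void)
    {
    #if defined(__e2k__)
        puts("ARCH_E2K");      /* lcc / gcc on Elbrus predefine __e2k__ */
    #elif defined(__x86_64__)
        puts("ARCH_X86_64");
    #else
        puts("ARCH_UNKNOWN");  /* the real ctest.c covers many more cases */
    #endif
        return 0;
    }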
+**********************************************************************************/ + +#ifndef COMMON_E2K +#define COMMON_E2K + +#ifdef ASSEMBLER +#error +#endif + +#define MB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define RMB + +#define INLINE __attribute__((__always_inline__)) inline + +static inline int blas_quickdivide(blasint x, blasint y) { + return x / y; +} + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 2 << 20) + +#ifndef BUFFERSIZE +#define BUFFER_SIZE (32 << 20) +#else +#define BUFFER_SIZE (32 << BUFFERSIZE) +#endif + +#define SEEK_ADDRESS + +#endif + diff --git a/common_macro.h b/common_macro.h index cf2a3fd88..9826f1809 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2611,7 +2611,7 @@ #ifndef ASSEMBLER #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ -|| defined(ARCH_LOONGARCH64) +|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; extern BLASLONG sbgemm_p; diff --git a/getarch.c b/getarch.c index 73bbf1892..00e544bc7 100644 --- a/getarch.c +++ b/getarch.c @@ -1536,6 +1536,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(FORCE_E2K) || defined(__e2k__) +#define FORCE +#define ARCHITECTURE "E2K" +#define ARCHCONFIG "-DGENERIC " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "generic" +#define CORENAME "generic" +#endif + #ifndef FORCE #ifdef USER_TARGET From 898cf5faf3fa3eaa6566c45276f7c6ba08082318 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 18:55:10 +0100 Subject: [PATCH 653/681] Add Elbrus e2k architecture support --- kernel/Makefile.L3 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 2d5740183..bea6cb048 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -617,6 +617,10 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ +ifeq ($(ARCH), E2K) +USE_TRMM = 1 +endif + ifeq ($(BUILD_BFLOAT16), 1) From 3492bea60225d795deb4e1b507914482133fc6a4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 18:57:28 +0100 Subject: [PATCH 654/681] Create Makefile --- kernel/e2k/Makefile | 1 + 1 file changed, 1 insertion(+) create mode 100644 kernel/e2k/Makefile diff --git a/kernel/e2k/Makefile b/kernel/e2k/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/e2k/Makefile @@ -0,0 +1 @@ +clean :: From 299d4d70a371c9fed9792daeb80329fd7961f841 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 18:59:36 +0100 Subject: [PATCH 655/681] Add default KERNEL file for Elbrus E2K arch --- kernel/e2k/KERNEL | 149 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 kernel/e2k/KERNEL diff --git a/kernel/e2k/KERNEL b/kernel/e2k/KERNEL new file mode 100644 index 000000000..afa8a0881 --- /dev/null +++ b/kernel/e2k/KERNEL @@ -0,0 +1,149 @@ +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = 
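Two choices in common_e2k.h above are worth a note: blas_quickdivide is a plain x / y, with none of the reciprocal tricks other ports use, and MB/WMB expand to an empty asm with a "memory" clobber, i.e. a pure compiler barrier. Such a barrier only forbids the compiler from reordering memory accesses across it; that this suffices on this platform is an assumption of the port, not something the sketch below can prove:

    #define MB do { __asm__ __volatile__("": : :"memory"); } while (0)

    static int payload, ready;

    void publish(int v)
    {
        payload = v;
        MB;          /* the compiler may not move the stores across this      */
        ready = 1;   /* ...the hardware, on a weaker memory model, still could */
    }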
../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +DSDOTKERNEL = ../generic/dot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = 
../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c +LSAME_KERNEL = ../generic/lsame.c + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + + From 66a15e15a87a7e89d7341006edd013f3b2843468 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 19:02:57 +0100 Subject: [PATCH 656/681] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 5378c79bf..7e23dec8b 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -204,3 +204,6 @@ In chronological order: * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions * [2022-01-18] SVE kernels and copy functions for TRSM + +* Ilya Kurdyukov + * [2021-02-21] Add basic support for the Elbrus E2000 architecture From 5d24f3d2102270e0cdc00823a06a35c2993bc361 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Jan 2022 19:09:00 +0100 Subject: [PATCH 657/681] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 7e23dec8b..92be1fe42 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -205,5 +205,5 @@ In chronological order: * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions * [2022-01-18] SVE kernels and copy functions for TRSM -* Ilya Kurdyukov +* Ilya Kurdyukov * [2021-02-21] Add basic support for the Elbrus E2000 architecture From addc2a7aaa46eb1501a7c9c153951051eb82442d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Jan 2022 19:56:32 +0100 Subject: [PATCH 658/681] Add proper defaults for IMIN/IMAX --- kernel/sparc/KERNEL | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/sparc/KERNEL b/kernel/sparc/KERNEL index 594fd05e5..a8c958bb4 100644 --- a/kernel/sparc/KERNEL +++ b/kernel/sparc/KERNEL @@ -39,11 +39,19 @@ IZAMINKERNEL = izamax.S endif ifndef ISMINKERNEL -ISMINKERNEL = iamax.S +ISMINKERNEL = imax.S endif ifndef IDMINKERNEL -IDMINKERNEL = iamax.S +IDMINKERNEL = imax.S +endif + +ifndef ISMAXKERNEL +ISMAXKERNEL = imax.S +endif + +ifndef IDMAXKERNEL +IDMAXKERNEL = imax.S endif ifndef SNRM2KERNEL From 7f0b11fbc189e95c8ee2fd249980962b9f5a1125 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Jan 2022 22:00:39 +0100 Subject: [PATCH 659/681] Exclude some complex drivers when NO_LAPACK is set --- driver/level2/Makefile | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/driver/level2/Makefile b/driver/level2/Makefile index caecf4f97..9bef6e2a5 100644 --- a/driver/level2/Makefile +++ b/driver/level2/Makefile @@ -64,9 +64,9 @@ CBLASOBJS += \ chpmv_U.$(SUFFIX) chpmv_L.$(SUFFIX) chpmv_V.$(SUFFIX) chpmv_M.$(SUFFIX) \ chpr_U.$(SUFFIX) chpr_L.$(SUFFIX) chpr_V.$(SUFFIX) chpr_M.$(SUFFIX) \ chpr2_U.$(SUFFIX) chpr2_L.$(SUFFIX) chpr2_V.$(SUFFIX) chpr2_M.$(SUFFIX) \ - csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \ - cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \ - csyr_U.$(SUFFIX) csyr_L.$(SUFFIX) csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \ + csbmv_U.$(SUFFIX) 
csbmv_L.$(SUFFIX) \ + cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \ + csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \ ctbmv_NUU.$(SUFFIX) ctbmv_NUN.$(SUFFIX) ctbmv_NLU.$(SUFFIX) ctbmv_NLN.$(SUFFIX) \ ctbmv_TUU.$(SUFFIX) ctbmv_TUN.$(SUFFIX) ctbmv_TLU.$(SUFFIX) ctbmv_TLN.$(SUFFIX) \ ctbmv_RUU.$(SUFFIX) ctbmv_RUN.$(SUFFIX) ctbmv_RLU.$(SUFFIX) ctbmv_RLN.$(SUFFIX) \ @@ -92,6 +92,13 @@ CBLASOBJS += \ ctrsv_RUU.$(SUFFIX) ctrsv_RUN.$(SUFFIX) ctrsv_RLU.$(SUFFIX) ctrsv_RLN.$(SUFFIX) \ ctrsv_CUU.$(SUFFIX) ctrsv_CUN.$(SUFFIX) ctrsv_CLU.$(SUFFIX) ctrsv_CLN.$(SUFFIX) +ifndef NO_LAPACK +CBLASOBJS += \ + cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \ + cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) \ + csyr_U.$(SUFFIX) csyr_L.$(SUFFIX) +endif + ZBLASOBJS += \ zgbmv_n.$(SUFFIX) zgbmv_t.$(SUFFIX) zgbmv_r.$(SUFFIX) zgbmv_c.$(SUFFIX) \ zgbmv_o.$(SUFFIX) zgbmv_u.$(SUFFIX) zgbmv_s.$(SUFFIX) zgbmv_d.$(SUFFIX) \ From d2b5fbf80f02539243cca20b496b0358d2829420 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Jan 2022 22:02:08 +0100 Subject: [PATCH 660/681] Exclude some complex (LAPACK) functions when NO_LAPACK is set --- interface/CMakeLists.txt | 19 +++++++++++++++---- interface/Makefile | 8 ++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index ccb5fce3f..0b2998237 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -28,14 +28,21 @@ set(BLAS1_MANGLED_SOURCES # these all have 'z' sources for complex versions set(BLAS2_SOURCES gemv.c ger.c - trsv.c trmv.c symv.c - syr.c syr2.c gbmv.c - sbmv.c spmv.c - spr.c spr2.c + trsv.c trmv.c + syr2.c gbmv.c + sbmv.c + spr2.c tbsv.c tbmv.c tpsv.c tpmv.c ) +set(BLAS2_REAL_ONLY_SOURCES + symv.c syr.c spmv.c spr.c +) +set(BLAS2_COMPLEX_LAPACK_SOURCES + symv.c syr.c spmv.c spr.c +) + set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES hemv.c hbmv.c her.c her2.c @@ -78,6 +85,10 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) GenerateNamedObjects("${BLAS1_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) GenerateNamedObjects("${BLAS1_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) GenerateNamedObjects("${BLAS2_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + GenerateNamedObjects("${BLAS2_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) + if (NOT DEFINED NO_LAPACK) + GenerateNamedObjects("${BLAS2_COMPLEX_LAPACK_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + endif () GenerateNamedObjects("${BLAS2_COMPLEX_ONLY_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 4) GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) diff --git a/interface/Makefile b/interface/Makefile index 3252601d2..f57d0bda0 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1016,11 +1016,13 @@ dsymv.$(SUFFIX) dsymv.$(PSUFFIX) : symv.c qsymv.$(SUFFIX) qsymv.$(PSUFFIX) : symv.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK csymv.$(SUFFIX) csymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) zsymv.$(SUFFIX) zsymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xsymv.$(SUFFIX) xsymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1034,11 +1036,13 @@ dsyr.$(SUFFIX) dsyr.$(PSUFFIX) : syr.c qsyr.$(SUFFIX) qsyr.$(PSUFFIX) : syr.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK csyr.$(SUFFIX) csyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o $(@F) zsyr.$(SUFFIX) zsyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xsyr.$(SUFFIX) 
xsyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1106,11 +1110,13 @@ dspmv.$(SUFFIX) dspmv.$(PSUFFIX) : spmv.c qspmv.$(SUFFIX) qspmv.$(PSUFFIX) : spmv.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK cspmv.$(SUFFIX) cspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) zspmv.$(SUFFIX) zspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xspmv.$(SUFFIX) xspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1124,11 +1130,13 @@ dspr.$(SUFFIX) dspr.$(PSUFFIX) : spr.c qspr.$(SUFFIX) qspr.$(PSUFFIX) : spr.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK cspr.$(SUFFIX) cspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) zspr.$(SUFFIX) zspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xspr.$(SUFFIX) xspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) From a3eea3e127fb9f3682e1e132c75b515d7b7d5241 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Feb 2022 11:43:17 +0100 Subject: [PATCH 661/681] Fix input argument check (LAPACK PR 646) --- lapack-netlib/SRC/cgeqrt2.f | 11 ++++------- lapack-netlib/SRC/dgeqrt2.f | 11 ++++------- lapack-netlib/SRC/sgeqrt2.f | 11 ++++------- lapack-netlib/SRC/zgeqrt2.f | 11 ++++------- 4 files changed, 16 insertions(+), 28 deletions(-) diff --git a/lapack-netlib/SRC/cgeqrt2.f b/lapack-netlib/SRC/cgeqrt2.f index 9ee3e4f79..11221636d 100644 --- a/lapack-netlib/SRC/cgeqrt2.f +++ b/lapack-netlib/SRC/cgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup complexGEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE CGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/dgeqrt2.f b/lapack-netlib/SRC/dgeqrt2.f index 138dd4d9c..00f800d43 100644 --- a/lapack-netlib/SRC/dgeqrt2.f +++ b/lapack-netlib/SRC/dgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup doubleGEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE DGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. 
INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/sgeqrt2.f b/lapack-netlib/SRC/sgeqrt2.f index 349fd4b60..f6532f812 100644 --- a/lapack-netlib/SRC/sgeqrt2.f +++ b/lapack-netlib/SRC/sgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup realGEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE SGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/zgeqrt2.f b/lapack-netlib/SRC/zgeqrt2.f index bad708498..34d9d544f 100644 --- a/lapack-netlib/SRC/zgeqrt2.f +++ b/lapack-netlib/SRC/zgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup complex16GEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE ZGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. 
INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN From aec32e5bd4cdc6d69a04000ae9530983eec0e756 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Feb 2022 22:39:03 +0100 Subject: [PATCH 662/681] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 710940924..04ed428de 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -224,7 +224,7 @@ jobs: - job: OSX_IOS_ARMV8 pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0 From f7e8f9ec57dcbe7c3a94a18575f0379dfe828dae Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 7 Feb 2022 00:00:15 +0100 Subject: [PATCH 663/681] Support AVX512-enabled AlderLake --- cpuid_x86.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 6466bd148..d7d85eb20 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1495,6 +1495,10 @@ int get_cpuname(void){ switch (model) { case 7: // Alder Lake desktop case 10: // Alder Lake mobile + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) From fa3e9f25e633d5eb735e9183dfa72b6ed09fee0e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 7 Feb 2022 00:00:56 +0100 Subject: [PATCH 664/681] Support AVX512-enabled Alder Lake --- driver/others/dynamic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index b12fb069a..52a7c6087 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -708,8 +708,11 @@ static gotoblas_t *get_coretype(void){ case 9: if (model == 7 || model == 10) { // Alder Lake + if(support_avx512_bf16()) + return &gotoblas_COOPERLAKE; + if (support_avx512()) + return &gotoblas_SKYLAKEX; if(support_avx2()){ - openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); return &gotoblas_HASWELL; } if(support_avx()) { From e2bf3f31a6e75223d864ffeb39c12bb3c68393e3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Feb 2022 22:09:25 +0100 Subject: [PATCH 665/681] Add .NOTPARALLEL: as a workaround for builds on DFS --- lapack-netlib/TESTING/MATGEN/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index e21ebd6c3..0b94e3aaa 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ b/lapack-netlib/TESTING/MATGEN/Makefile @@ -66,6 +66,7 @@ ZMATGEN = zlatms.o zlatme.o zlatmr.o zlatmt.o \ endif .PHONY: all +.NOTPARALLEL: all: $(TMGLIB) ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \ From 0e04710099df5dd9369d49d435488c6f3705691a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Feb 2022 23:03:05 +0100 Subject: [PATCH 666/681] filter out libflangmain as well --- f_check | 1 + 1 file changed, 1 insertion(+) diff --git a/f_check b/f_check index 4825fb09a..71293b53f 100644 --- a/f_check +++ b/f_check @@ -361,6 +361,7 @@ if 
($link ne "") { ($flags =~ /^\-l/) && ($flags !~ /ibrary/) && ($flags !~ /gfortranbegin/) + && ($flags !~ /flangmain/) && ($flags !~ /frtbegin/) && ($flags !~ /pathfstart/) && ($flags !~ /crt[0-9]/) From db7a03dd4c414c8053090bf5bcc18f0fc8e01095 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Feb 2022 23:04:45 +0100 Subject: [PATCH 667/681] keep flang-classic on MacOS from trying to create an executable instead of a library --- exports/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/exports/Makefile b/exports/Makefile index 903836dd6..baaa33623 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -142,10 +142,14 @@ ifneq (,$(filter 1 2,$(NOFORTRAN))) else ifeq ($(F_COMPILER), INTEL) $(FC) $(FFLAGS) $(LDFLAGS) -all-load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def +else +ifeq ($(F_COMPILER), FLANG) + $(FC) $(FFLAGS) $(LDFLAGS) -fno-fortran-main -Mnomain -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif endif +endif dllinit.$(SUFFIX) : dllinit.c $(CC) $(CFLAGS) -c -o $(@F) -s $< From c352ac0ae3593a30262b20d54f95c19f517b56a1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Feb 2022 22:16:04 +0100 Subject: [PATCH 668/681] Update with 0.3.20 changes --- Changelog.txt | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 180f7adec..97af4cbd9 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,39 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.20 + 20-Feb-2022 + +general: + - some code cleanup, with added casts etc. + - fixed obtaining the cpu count with OpenMP and OMP_PROC_BIND unset + - fixed pivot index calculation by ?LASWP for negative increments other than one + - fixed input argument check in LAPACK ? 
GEQRT2 + - improved the check for a Fortran compiler in CMAKE builds + - disabled building OpenBLAS' optimized versions of LAPACK complex SPMV,SPR,SYMV,SYR with NO_LAPACK=1 + - fixed building of LAPACK on certain distributed filesystems with parallel gmake + - fixed building the shared library on MacOS with classic flang + +x86_64: + - fixed cross-compilation with CMAKE for CORE2 target + - fixed miscompilation of AVX512 code in DYNAMIC_ARCH builds + - added support for the "incidental" AVX512 hardware in Alder Lake when enabled in BIOS + +E2K: + - add new architecture (Russian Elbrus E2000 family) + +SPARC: + - fix IMIN/IMAX + +ARMV8: + - added SVE-enabled CGEMM and ZGEMM kernels for ARMV8SVE and A64FX + - added support for Neoverse N2 and V1 cpus + +MIPS,MIPS64: + - fixed autodetection of MSA capability + +LOONGARCH64: + - added an optimized DGEMM kernel + ==================================================================== Version 0.3.19 19-Dec-2021 From dec53e0ca2a6a9ace5e716297d3679f62609fcf3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Feb 2022 22:30:50 +0100 Subject: [PATCH 669/681] Update version to 0.3.20 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c1d69da13..73498a7fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 19) +set(OpenBLAS_PATCH_VERSION 20) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") From 0b678b19dc03f2a999d6e038814c4c50b9640a4e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Feb 2022 22:35:05 +0100 Subject: [PATCH 670/681] Update version to 0.3.20 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 4b4b9bcf9..ea093bce6 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.19.dev +VERSION = 0.3.20 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 73ffabe6ba46f167f5f51596ce9f4f3da02e551d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Feb 2022 20:06:14 +0100 Subject: [PATCH 671/681] Guard uses of _mm512_reduce_add_p? --- kernel/x86_64/dgemm_small_kernel_nn_skylakex.c | 5 +++++ kernel/x86_64/dgemm_small_kernel_tn_skylakex.c | 5 +++++ kernel/x86_64/sgemm_small_kernel_nn_skylakex.c | 5 +++++ kernel/x86_64/sgemm_small_kernel_tn_skylakex.c | 5 +++++ 4 files changed, 20 insertions(+) diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c index d9b380fff..5d7b3c66b 100644 --- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -24,6 +24,7 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) #include #include "common.h" @@ -588,3 +589,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } return 0; } +#else +#include ../generic/gemm_small_matrix_kernel_nn.c +#endif + diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c index 18c797283..e63873988 100644 --- a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c @@ -24,6 +24,7 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) #include #include "common.h" @@ -320,3 +321,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } return 0; } +#else +#include ../generic/gemm_small_matrix_kernel_tn.c +#endif + diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index 9bc7a7c58..215add010 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -24,6 +24,7 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) #include #include "common.h" @@ -610,3 +611,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } return 0; } +#else +#include ../generic/gemm_small_matrix_kernel_nn.c +#endif + diff --git a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c index 5a9a4ea32..f394b5b3a 100644 --- a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c @@ -24,6 +24,7 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) #include #include "common.h" @@ -314,3 +315,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp } return 0; } +#else +#include ../generic/gemm_small_matrix_kernel_tn.c +#endif + From 80eb581c838349ac4eef08cd688b2754f47e88ae Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Feb 2022 20:10:59 +0100 Subject: [PATCH 672/681] Fix non-portable u_int64_t --- kernel/x86_64/sbgemm_ncopy_16_cooperlake.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c index 95ed82d7c..7ed03d70d 100644 --- a/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c +++ b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c @@ -135,7 +135,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, }; - u_int64_t permute_table2[] = { + uint64_t permute_table2[] = { 0x00, 0x01, 0x02, 0x03, 8|0x0, 8|0x1, 8|0x2, 8|0x3, 0x04, 0x05, 0x06, 0x07, 8|0x4, 8|0x5, 8|0x6, 8|0x7, }; From c62f8e2c01bfc2f4fad800be198108bf0f7a7e61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Feb 2022 20:12:20 +0100 Subject: [PATCH 673/681] Prevent compiler attempts to use k0 as mask register --- kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c index 7af51b6d8..b94aa3c84 100644 --- a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -114,10 +114,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. asm("vmovups %0, (%1, %2, 4)": : "v"(val1), "r"(addr), "r"(ldc)) #define _MASK_STORE_C_2nx16(addr, val0, val1) \ - asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \ - asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "k"(mmask)); \ - asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); \ - asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "k"(mmask)) + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "Yk"(mmask)); \ + asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "Yk"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "Yk"(mmask)); \ + asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "Yk"(mmask)) #define _REORDER_C_2X(result_0, result_1) { \ __m512 tmp0, tmp1; \ @@ -154,8 +154,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); #define _MASK_STORE_C_16(addr, val0) \ - asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \ - asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "Yk"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "Yk"(mmask)); #define N_STORE_4X(A, Bx, By) { \ _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ From abbc947edb830af96fc72ce7789f954737805830 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= Date: Wed, 23 Feb 2022 22:51:59 +0000 Subject: [PATCH 674/681] Fix compilation of Skylake AVX512 kernels with GCC 6 --- kernel/x86_64/dgemm_small_kernel_nn_skylakex.c | 2 +- kernel/x86_64/dgemm_small_kernel_tn_skylakex.c | 2 +- kernel/x86_64/sgemm_small_kernel_nn_skylakex.c | 2 +- kernel/x86_64/sgemm_small_kernel_tn_skylakex.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c index 5d7b3c66b..df6c65ff7 100644 --- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -590,6 +590,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp return 0; } #else -#include ../generic/gemm_small_matrix_kernel_nn.c +#include "../generic/gemm_small_matrix_kernel_nn.c" #endif diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c index e63873988..37d1ca497 100644 --- a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c @@ -322,6 +322,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp return 0; } #else -#include ../generic/gemm_small_matrix_kernel_tn.c +#include "../generic/gemm_small_matrix_kernel_tn.c" #endif diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index 215add010..cea63172b 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -612,6 +612,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp return 0; } #else -#include ../generic/gemm_small_matrix_kernel_nn.c +#include "../generic/gemm_small_matrix_kernel_nn.c" #endif diff --git a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c index f394b5b3a..308f5e35e 100644 --- a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c @@ -316,6 +316,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alp return 0; } #else -#include ../generic/gemm_small_matrix_kernel_tn.c +#include "../generic/gemm_small_matrix_kernel_tn.c" #endif From d9894f45d30e82fd1491ae38477a1fcd79faeed1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Feb 2022 10:04:00 +0100 Subject: [PATCH 675/681] Define sbgemm_r to fix DYNAMIC_ARCH builds --- kernel/setparam-ref.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index fe796be64..a81b32ddc 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1824,6 +1824,13 @@ static void init_parameter(void) { fprintf(stderr, "L2 = %8d DGEMM_P .. 
%d\n", l2, TABLE_NAME.dgemm_p); #endif +#if BUILD_BFLOAT16==1 + TABLE_NAME.sbgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.sbgemm_p * TABLE_NAME.sbgemm_q * 4 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.sbgemm_q * 4) - 15) & ~15); +#endif + #if BUILD_SINGLE==1 TABLE_NAME.sgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA From 9d7429406f0950113c989105eef9c5ee6cad01d3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Feb 2022 10:05:36 +0100 Subject: [PATCH 676/681] Declare SHUFFLE_MAGIC_NO as const to placate clang --- kernel/x86_64/sbgemm_microk_cooperlake_template.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sbgemm_microk_cooperlake_template.c b/kernel/x86_64/sbgemm_microk_cooperlake_template.c index b8ed9838e..4a4e46f44 100644 --- a/kernel/x86_64/sbgemm_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemm_microk_cooperlake_template.c @@ -356,7 +356,7 @@ void sbgemm_block_kernel_nn_32xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa bfloat16 * B_addr = B; float * C_addr = C; - int SHUFFLE_MAGIC_NO = 0x39; + const int SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); #ifndef ONE_ALPHA From 0698212c8c7318fd76cb366d27663b2c20856748 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Feb 2022 15:33:02 +0100 Subject: [PATCH 677/681] Remove stray $ --- kernel/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 8aa6728d5..98c803e71 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -678,7 +678,7 @@ endif () set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) endif () if (NOT DEFINED SBGEMM_SMALL_K_B0_TT) - set($SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) + set(SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) endif () GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16") GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16") From 9c626e466ed52edeff947607b01a580f549dc204 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Feb 2022 15:36:02 +0100 Subject: [PATCH 678/681] really fix definition of SHUFFLE_MAGIC_NO --- kernel/x86_64/sbgemm_microk_cooperlake_template.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/sbgemm_microk_cooperlake_template.c b/kernel/x86_64/sbgemm_microk_cooperlake_template.c index 4a4e46f44..bd5cbb744 100644 --- a/kernel/x86_64/sbgemm_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemm_microk_cooperlake_template.c @@ -13,6 +13,8 @@ #define ONE 1.e0f #define ZERO 0.e0f +#define SHUFFLE_MAGIC_NO (const int) 0x39 + #undef STORE16_COMPLETE_RESULT #undef STORE16_MASK_COMPLETE_RESULT #undef SBGEMM_BLOCK_KERNEL_NN_32x8xK @@ -356,7 +358,6 @@ void sbgemm_block_kernel_nn_32xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa bfloat16 * B_addr = B; float * C_addr = C; - const int SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); #ifndef ONE_ALPHA @@ -465,7 +466,6 @@ void sbgemm_block_kernel_nn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa bfloat16 * B_addr = B; float * C_addr = C; - int SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); #ifndef ONE_ALPHA @@ -1192,7 +1192,6 @@ void sbgemm_block_kernel_tn_32xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa bfloat16 * B_addr = B; float * C_addr = C; - int 
SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); #ifndef ONE_ALPHA @@ -1291,7 +1290,6 @@ void sbgemm_block_kernel_tn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, floa bfloat16 * B_addr = B; float * C_addr = C; - int SHUFFLE_MAGIC_NO = 0x39; BLASLONG tag_k_32x = k & (~31); #ifndef ONE_ALPHA From 35d5105922445adeec359d42cf5972df88e213af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Feb 2022 19:23:40 +0100 Subject: [PATCH 679/681] Enable xGEMMT functions --- relapack/config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/relapack/config.h b/relapack/config.h index e4fab0a12..9d6919463 100644 --- a/relapack/config.h +++ b/relapack/config.h @@ -115,7 +115,7 @@ #define INCLUDE_CTGSYL INCLUDE_XTGSYL #define INCLUDE_ZTGSYL INCLUDE_XTGSYL -#define INCLUDE_XGEMMT 0 +#define INCLUDE_XGEMMT 1 #define INCLUDE_SGEMMT INCLUDE_XGEMMT #define INCLUDE_DGEMMT INCLUDE_XGEMMT #define INCLUDE_CGEMMT INCLUDE_XGEMMT From 4058f324923cd293b2117eafb65bc758a9a34a19 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Feb 2022 19:24:27 +0100 Subject: [PATCH 680/681] Fix xGEMMT argument lists --- relapack/src/lapack_wrappers.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/relapack/src/lapack_wrappers.c b/relapack/src/lapack_wrappers.c index 0252f3d92..fc3dbc11e 100644 --- a/relapack/src/lapack_wrappers.c +++ b/relapack/src/lapack_wrappers.c @@ -566,7 +566,8 @@ void LAPACK(sgemmt)( const float *B, const blasint *ldB, const float *beta, float *C, const blasint *ldC ) { - RELAPACK_sgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_sgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info); } #endif @@ -578,7 +579,8 @@ void LAPACK(dgemmt)( const double *B, const blasint *ldB, const double *beta, double *C, const blasint *ldC ) { - RELAPACK_dgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_dgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info); } #endif @@ -590,7 +592,8 @@ void LAPACK(cgemmt)( const float *B, const blasint *ldB, const float *beta, float *C, const blasint *ldC ) { - RELAPACK_cgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_cgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info); } #endif @@ -602,6 +605,7 @@ void LAPACK(zgemmt)( const double *B, const blasint *ldB, const double *beta, double *C, const blasint *ldC ) { - RELAPACK_zgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_zgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info); } #endif From 1c1ffb0591186e50311670369dee2cb450980d9a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Feb 2022 19:27:34 +0100 Subject: [PATCH 681/681] Annotate LAPACKE_lsame with the const attribute for GCC and compatible compilers --- lapack-netlib/LAPACKE/include/lapacke_utils.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/include/lapacke_utils.h b/lapack-netlib/LAPACKE/include/lapacke_utils.h index a9236d23f..ec29f24fc 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_utils.h +++ b/lapack-netlib/LAPACKE/include/lapacke_utils.h @@ -67,7 +67,11 @@ extern "C" { void LAPACKE_xerbla( const char *name, lapack_int info ); /* Compare two chars (case-insensitive) */ -lapack_logical LAPACKE_lsame( char ca, char cb ); +lapack_logical LAPACKE_lsame( char ca, char cb ) +#if defined __GNUC__ + __attribute__((const)) +#endif + ; /* Functions to convert column-major to row-major 2d arrays and vice versa. 
*/ void LAPACKE_cgb_trans( int matrix_layout, lapack_int m, lapack_int n,