| @@ -89,20 +89,21 @@ task: | |||
| type: text/plain | |||
| macos_instance: | |||
| image: ghcr.io/cirruslabs/macos-monterey-xcode:latest | |||
| image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest | |||
| task: | |||
| name: AppleM1/LLVM armv7-androidndk xbuild | |||
| compile_script: | |||
| - brew install android-ndk | |||
| - brew install --cask android-ndk | |||
| - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH | |||
| - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | |||
| - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| - ls /System/Volumes/Data/opt/homebrew | |||
| - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | |||
| - export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk" | |||
| - ls /opt/homebrew | |||
| - ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk | |||
| - find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib" | |||
| - find /opt/homebrew -name "armv7a-linux-androideabi*-ranlib" | |||
| - #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| - #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
| - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26d/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang | |||
| - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/27/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang | |||
| - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" | |||
| always: | |||
| config_artifacts: | |||
| @@ -85,6 +85,8 @@ Examples: | |||
| make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 | |||
| ``` | |||
| When compiling for a more modern CPU TARGET of the same architecture (e.g. `TARGET=SKYLAKEX` on a HASWELL host), the option `CROSS=1` can be used to suppress the automatic invocation of the tests at the end of the build, since the build host may be unable to execute code generated for the newer target. | |||
| ### Debug version | |||
| A debug version can be built using `make DEBUG=1`. | |||
| @@ -1527,6 +1527,19 @@ int get_cpuname(void){ | |||
| break; | |||
| case 10: //family 6 exmodel 10 | |||
| switch (model) { | |||
| case 13: // Granite Rapids | |||
| if(support_amx_bf16()) | |||
| return CPUTYPE_SAPPHIRERAPIDS; | |||
| if(support_avx512_bf16()) | |||
| return CPUTYPE_COOPERLAKE; | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 5: // Comet Lake H and S | |||
| case 6: // Comet Lake U | |||
| case 10: // Meteor Lake | |||
| @@ -2352,8 +2365,22 @@ int get_coretype(void){ | |||
| case 10: | |||
| switch (model) { | |||
| case 13: // Granite Rapids | |||
| if(support_amx_bf16()) | |||
| return CORE_SAPPHIRERAPIDS; | |||
| if(support_avx512_bf16()) | |||
| return CORE_COOPERLAKE; | |||
| if(support_avx512()) | |||
| return CORE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| case 5: // Comet Lake H and S | |||
| case 6: // Comet Lake U | |||
| case 10: // Meteor Lake | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_HASWELL; | |||
| @@ -2362,6 +2389,7 @@ int get_coretype(void){ | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| case 0: // Meteor Lake | |||
| case 7:// Rocket Lake | |||
| #ifndef NO_AVX512 | |||
| if(support_avx512()) | |||
| @@ -1076,6 +1076,8 @@ fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3l | |||
| main_status[cpu] = MAIN_RUNNING1; | |||
| #endif | |||
| if (buffer == NULL) blas_thread_buffer[cpu] = blas_memory_alloc(2); | |||
| //For target LOONGSON3R5, applying an offset to the buffer is essential | |||
| //for minimizing cache conflicts and optimizing performance. | |||
| #if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) | |||
| @@ -880,10 +880,8 @@ lapackobjs2c="$lapackobjs2c | |||
| # clatrs3 | |||
| lapackobjs2d="$lapackobjs2d | |||
| dgelqs | |||
| dgelst | |||
| dgeqp3rk | |||
| dgeqrs | |||
| dlaqp2rk | |||
| dlaqp3rk | |||
| dlarmm | |||
| @@ -897,10 +895,8 @@ lapackobjs2d="$lapackobjs2d | |||
| # dlaqz4 | |||
| lapackobjs2z="$lapackobjs2z | |||
| zgelqs | |||
| zgelst | |||
| zgeqp3rk | |||
| zgeqrs | |||
| zlaqp2rk | |||
| zlaqp3rk | |||
| zlatrs3 | |||
| @@ -918,6 +914,7 @@ lapack_extendedprecision_objs=" | |||
| " | |||
| lapack_deprecated_objsc=" | |||
| cgelqs cgeqrs | |||
| cgegs cggsvd | |||
| cgegv cggsvp | |||
| cgelsx clahrd | |||
| @@ -926,6 +923,7 @@ lapack_deprecated_objsc=" | |||
| " | |||
| lapack_deprecated_objsd=" | |||
| dgelqs dgeqrs | |||
| dgegs dgeqpf | |||
| dgegv dggsvd | |||
| dgelsx dggsvp | |||
| @@ -933,6 +931,8 @@ lapack_deprecated_objsd=" | |||
| dlatzm dtzrqf" | |||
| lapack_deprecated_objss=" | |||
| sgelqs | |||
| sgeqrs | |||
| sgelsx | |||
| sgegs | |||
| sgegv | |||
| @@ -945,6 +945,8 @@ lapack_deprecated_objss=" | |||
| " | |||
| lapack_deprecated_objsz=" | |||
| zgelqs | |||
| zgeqrs | |||
| zgegs | |||
| zgegv | |||
| zgelsx | |||
| @@ -131,11 +131,11 @@ | |||
| sd $21, 40($sp) | |||
| sd $22, 48($sp) | |||
| ST $f24, 56($sp) | |||
| ST $f25, 64($sp) | |||
| ST $f26, 72($sp) | |||
| ST $f27, 80($sp) | |||
| ST $f28, 88($sp) | |||
| sdc1 $f24, 56($sp) | |||
| sdc1 $f25, 64($sp) | |||
| sdc1 $f26, 72($sp) | |||
| sdc1 $f27, 80($sp) | |||
| sdc1 $f28, 88($sp) | |||
| #if defined(TRMMKERNEL) | |||
| sd $23, 96($sp) | |||
| @@ -146,10 +146,10 @@ | |||
| #endif | |||
| #ifndef __64BIT__ | |||
| ST $f20,120($sp) | |||
| ST $f21,128($sp) | |||
| ST $f22,136($sp) | |||
| ST $f23,144($sp) | |||
| sdc1 $f20,120($sp) | |||
| sdc1 $f21,128($sp) | |||
| sdc1 $f22,136($sp) | |||
| sdc1 $f23,144($sp) | |||
| #endif | |||
| .align 4 | |||
| @@ -4000,11 +4000,11 @@ | |||
| ld $21, 40($sp) | |||
| ld $22, 48($sp) | |||
| LD $f24, 56($sp) | |||
| LD $f25, 64($sp) | |||
| LD $f26, 72($sp) | |||
| LD $f27, 80($sp) | |||
| LD $f28, 88($sp) | |||
| ldc1 $f24, 56($sp) | |||
| ldc1 $f25, 64($sp) | |||
| ldc1 $f26, 72($sp) | |||
| ldc1 $f27, 80($sp) | |||
| ldc1 $f28, 88($sp) | |||
| #if defined(TRMMKERNEL) | |||
| ld $23, 96($sp) | |||
| @@ -4013,10 +4013,10 @@ | |||
| #endif | |||
| #ifndef __64BIT__ | |||
| LD $f20,120($sp) | |||
| LD $f21,128($sp) | |||
| LD $f22,136($sp) | |||
| LD $f23,144($sp) | |||
| ldc1 $f20,120($sp) | |||
| ldc1 $f21,128($sp) | |||
| ldc1 $f22,136($sp) | |||
| ldc1 $f23,144($sp) | |||
| #endif | |||
| daddiu $sp,$sp,STACKSIZE | |||
| @@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "../common.h" | |||
| #define SGEMM BLASFUNC(sgemm) | |||
| #define SBGEMM BLASFUNC(sbgemm) | |||
| #define SGEMV BLASFUNC(sgemv) | |||
| #define SBGEMV BLASFUNC(sbgemv) | |||
| typedef union | |||
| { | |||
| unsigned short v; | |||
| @@ -187,7 +189,79 @@ main (int argc, char *argv[]) | |||
| free(CC); | |||
| } | |||
| if (ret != 0) | |||
| if (ret != 0) { | |||
| fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret); | |||
| return ret; | |||
| } | |||
| k = 1; | |||
| for (x = 1; x <= loop; x++) | |||
| { | |||
| float *A = (float *)malloc(x * x * sizeof(FLOAT)); | |||
| float *B = (float *)malloc(x * sizeof(FLOAT)); | |||
| float *C = (float *)malloc(x * sizeof(FLOAT)); | |||
| bfloat16_bits *AA = (bfloat16_bits *)malloc(x * x * sizeof(bfloat16_bits)); | |||
| bfloat16_bits *BB = (bfloat16_bits *)malloc(x * sizeof(bfloat16_bits)); | |||
| float *DD = (float *)malloc(x * sizeof(FLOAT)); | |||
| float *CC = (float *)malloc(x * sizeof(FLOAT)); | |||
| if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || | |||
| (DD == NULL) || (CC == NULL)) | |||
| return 1; | |||
| bfloat16 atmp, btmp; | |||
| blasint one = 1; | |||
| for (j = 0; j < x; j++) | |||
| { | |||
| for (i = 0; i < x; i++) | |||
| { | |||
| A[j * x + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; | |||
| sbstobf16_(&one, &A[j*x+i], &one, &atmp, &one); | |||
| AA[j * x + i].v = atmp; | |||
| } | |||
| B[j] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; | |||
| sbstobf16_(&one, &B[j], &one, &btmp, &one); | |||
| BB[j].v = btmp; | |||
| } | |||
| for (y = 0; y < 2; y++) | |||
| { | |||
| if (y == 0) { | |||
| transA = 'N'; | |||
| } else { | |||
| transA = 'T'; | |||
| } | |||
| memset(CC, 0, x * sizeof(FLOAT)); | |||
| memset(DD, 0, x * sizeof(FLOAT)); | |||
| memset(C, 0, x * sizeof(FLOAT)); | |||
| SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k); | |||
| SBGEMV (&transA, &x, &x, &alpha, (bfloat16*) AA, &x, (bfloat16*) BB, &k, &beta, CC, &k); | |||
| for (j = 0; j < x; j++) | |||
| for (i = 0; i < x; i++) | |||
| if (transA == 'N') { | |||
| DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j]); | |||
| } else if (transA == 'T') { | |||
| DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i]); | |||
| } | |||
| for (j = 0; j < x; j++) { | |||
| if (fabs (CC[j] - C[j]) > 1.0) | |||
| ret++; | |||
| if (fabs (CC[j] - DD[j]) > 1.0) | |||
| ret++; | |||
| } | |||
| } | |||
| free(A); | |||
| free(B); | |||
| free(C); | |||
| free(AA); | |||
| free(BB); | |||
| free(DD); | |||
| free(CC); | |||
| } | |||
| if (ret != 0) | |||
| fprintf (stderr, "FATAL ERROR SBGEMV - Return code: %d\n", ret); | |||
| return ret; | |||
| } | |||