| @@ -58,8 +58,8 @@ task: | |||||
| - export VALID_ARCHS="i386 x86_64" | - export VALID_ARCHS="i386 x86_64" | ||||
| - xcrun --sdk macosx --show-sdk-path | - xcrun --sdk macosx --show-sdk-path | ||||
| - xcodebuild -version | - xcodebuild -version | ||||
| - export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.5.sdk -arch x86_64" | |||||
| - export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX15.4.sdk -arch x86_64" | |||||
| - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" | - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" | ||||
| always: | always: | ||||
| config_artifacts: | config_artifacts: | ||||
| @@ -78,8 +78,8 @@ task: | |||||
| - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH | - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH | ||||
| - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" | ||||
| - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" | ||||
| - export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk -arch arm64 -miphoneos-version-min=10.0" | |||||
| - export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||||
| - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS18.4.sdk -arch arm64 -miphoneos-version-min=10.0" | |||||
| - xcrun --sdk iphoneos --show-sdk-path | - xcrun --sdk iphoneos --show-sdk-path | ||||
| - ls -l /Applications | - ls -l /Applications | ||||
| - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 | - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 | ||||
| @@ -7,7 +7,6 @@ | |||||
| #include <stdlib.h> | #include <stdlib.h> | ||||
| #include <inttypes.h> | #include <inttypes.h> | ||||
| #include <math.h> | #include <math.h> | ||||
| #if defined(HAVE_SME) | #if defined(HAVE_SME) | ||||
| /* Function prototypes */ | /* Function prototypes */ | ||||
| @@ -44,7 +43,17 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||||
| m_mod = ceil((double)M/(double)vl_elms) * vl_elms; | m_mod = ceil((double)M/(double)vl_elms) * vl_elms; | ||||
| float *A_mod = (float *) malloc(m_mod*K*sizeof(float)); | float *A_mod = (float *) malloc(m_mod*K*sizeof(float)); | ||||
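| /* Mark every predicate (p0-p15) and vector (z0-z31) register as | |||||
| * clobbered so the compiler does not assume their contents survive | |||||
| * this point: any value needed afterwards is re-read from memory | |||||
| * instead of being reused directly from a vector (z) register. | |||||
| */ | |||||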
| asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | |||||
| "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", | |||||
| "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | |||||
| "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | |||||
| "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | |||||
| "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); | |||||
| /* Pre-process the left matrix to make it suitable for | /* Pre-process the left matrix to make it suitable for | ||||
| matrix sum of outer-product calculation | matrix sum of outer-product calculation | ||||
| */ | */ | ||||
| @@ -52,7 +61,13 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ | |||||
| /* Calculate C = A*B */ | /* Calculate C = A*B */ | ||||
| sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R); | sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R); | ||||
| asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", | |||||
| "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", | |||||
| "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", | |||||
| "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", | |||||
| "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", | |||||
| "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); | |||||
| free(A_mod); | free(A_mod); | ||||
| } | } | ||||