diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index cf0c784c..aa0aefc9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -15,11 +15,14 @@ jobs: strategy: fail-fast: false matrix: - build: [linux-release, windows-release] + build: [linux-release, windows-release, osx-release] include: - build: linux-release os: ubuntu-latest config: release + - build: osx-release + os: macos-14 # https://github.blog/changelog/2024-01-30-github-actions-introducing-the-new-m1-macos-runner-available-to-open-source/ + config: release - build: windows-release os: windows-2019 config: release @@ -27,8 +30,7 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-dotnet@v4 with: - dotnet-version: | - 7.0.x + dotnet-version: | 8.0.x - name: Cache Packages uses: actions/cache@v4 @@ -43,7 +45,7 @@ jobs: - name: Build run: dotnet build LLamaSharp.sln -c ${{ matrix.config }} --no-restore - name: Test - run: dotnet test LLamaSharp.sln -c ${{ matrix.config }} -l "console;verbosity=detailed" --diag:logs/log.txt + run: dotnet test LLamaSharp.sln -c ${{ matrix.config }} -l "console;verbosity=detailed" --diag:logs/log.txt --filter Category!=NoCI - name: Upload artifacts if: always() uses: actions/upload-artifact@v3 diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs index d8c366bc..b72f49a0 100644 --- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs +++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs @@ -104,6 +104,6 @@ namespace LLamaSharp.KernelMemory } /// - public int CountTokens(string text) => _embedder.Context.Tokenize(text).Length; + public int CountTokens(string text) => _embedder.Context.Tokenize(text, special: true).Length; } } diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs index de6373ee..e3d18b3c 100644 --- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs +++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs @@ -1,13 +1,7 @@ using LLama; -using LLama.Abstractions; using LLama.Common; using LLama.Native; using Microsoft.KernelMemory.AI; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; namespace LLamaSharp.KernelMemory { @@ -111,6 +105,6 @@ namespace LLamaSharp.KernelMemory } /// - public int CountTokens(string text) => _context.Tokenize(text).Length; + public int CountTokens(string text) => _context.Tokenize(text, special: true).Length; } } diff --git a/LLama.Unittest/BasicTest.cs b/LLama.Unittest/BasicTest.cs index 7c897b78..1d54a7e9 100644 --- a/LLama.Unittest/BasicTest.cs +++ b/LLama.Unittest/BasicTest.cs @@ -17,7 +17,8 @@ namespace LLama.Unittest _testOutputHelper = testOutputHelper; _params = new ModelParams(Constants.GenerativeModelPath) { - ContextSize = 2048 + ContextSize = 2048, + GpuLayerCount = Constants.CIGpuLayerCount }; _model = LLamaWeights.LoadFromFile(_params); } diff --git a/LLama.Unittest/BeamTests.cs b/LLama.Unittest/BeamTests.cs index f4aa01ab..88b25672 100644 --- a/LLama.Unittest/BeamTests.cs +++ b/LLama.Unittest/BeamTests.cs @@ -17,7 +17,8 @@ public sealed class BeamTests _testOutputHelper = testOutputHelper; _params = new ModelParams(Constants.GenerativeModelPath) { - ContextSize = 2048 + ContextSize = 2048, + GpuLayerCount = Constants.CIGpuLayerCount, }; _model = LLamaWeights.LoadFromFile(_params); } @@ -27,7 +28,6 @@ public sealed class BeamTests _model.Dispose(); } - //[Fact(Skip = "Very very slow in CI")] [Fact] public void BasicBeam() { diff --git a/LLama.Unittest/Constants.cs b/LLama.Unittest/Constants.cs index 6e5e92c5..4852a335 100644 --- a/LLama.Unittest/Constants.cs +++ b/LLama.Unittest/Constants.cs @@ -1,4 +1,6 @@ -namespace LLama.Unittest +using System.Runtime.InteropServices; + +namespace LLama.Unittest { internal static class Constants { @@ -8,5 +10,25 @@ public static readonly string LLavaModelPath = "Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf"; public static readonly string LLavaMmpPath = "Models/mmproj-model-f16.gguf"; public static readonly string LLavaImage = "Models/extreme-ironing-taxi-610x427.jpg"; + + /// + /// Calculate GpuLayer Count to use in UnitTest + /// + /// Defaults to 20 in all the cases, except MacOS/OSX release (to disable METAL on github CI) + public static int CIGpuLayerCount + { + get + { + if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) + { + #if DEBUG + return 20; + #else + return 0; + #endif + } + else return 20; + } + } } } diff --git a/LLama.Unittest/GrammarTest.cs b/LLama.Unittest/GrammarTest.cs index 1ab9dea6..d4f6a95b 100644 --- a/LLama.Unittest/GrammarTest.cs +++ b/LLama.Unittest/GrammarTest.cs @@ -16,6 +16,7 @@ namespace LLama.Unittest { ContextSize = 2048, Seed = 92, + GpuLayerCount = Constants.CIGpuLayerCount, }; _model = LLamaWeights.LoadFromFile(_params); } diff --git a/LLama.Unittest/LLamaContextTests.cs b/LLama.Unittest/LLamaContextTests.cs index fe247c6e..cc53e369 100644 --- a/LLama.Unittest/LLamaContextTests.cs +++ b/LLama.Unittest/LLamaContextTests.cs @@ -14,6 +14,7 @@ namespace LLama.Unittest var @params = new ModelParams(Constants.GenerativeModelPath) { ContextSize = 768, + GpuLayerCount = Constants.CIGpuLayerCount, }; _weights = LLamaWeights.LoadFromFile(@params); _context = _weights.CreateContext(@params); diff --git a/LLama.Unittest/LLamaEmbedderTests.cs b/LLama.Unittest/LLamaEmbedderTests.cs index 379b8dc6..e9d9359f 100644 --- a/LLama.Unittest/LLamaEmbedderTests.cs +++ b/LLama.Unittest/LLamaEmbedderTests.cs @@ -1,32 +1,15 @@ -using LLama.Common; -using LLama.Native; +using LLama.Common; using Xunit.Abstractions; -using Xunit.Sdk; namespace LLama.Unittest; public sealed class LLamaEmbedderTests - : IDisposable { private readonly ITestOutputHelper _testOutputHelper; - private readonly LLamaEmbedder _embedder; public LLamaEmbedderTests(ITestOutputHelper testOutputHelper) { _testOutputHelper = testOutputHelper; - var @params = new ModelParams(Constants.EmbeddingModelPath) - { - ContextSize = 4096, - Threads = 5, - Embeddings = true, - }; - using var weights = LLamaWeights.LoadFromFile(@params); - _embedder = new(weights, @params); - } - - public void Dispose() - { - _embedder.Dispose(); } private static float Dot(float[] a, float[] b) @@ -35,17 +18,25 @@ public sealed class LLamaEmbedderTests return a.Zip(b, (x, y) => x * y).Sum(); } - - [Fact] - public async Task EmbedCompare() + private async Task CompareEmbeddings(string modelPath) { - var cat = await _embedder.GetEmbeddings("The cat is cute"); + var @params = new ModelParams(modelPath) + { + ContextSize = 8, + Threads = 4, + Embeddings = true, + GpuLayerCount = Constants.CIGpuLayerCount, + }; + using var weights = LLamaWeights.LoadFromFile(@params); + using var embedder = new LLamaEmbedder(weights, @params); + + var cat = await embedder.GetEmbeddings("The cat is cute"); Assert.DoesNotContain(float.NaN, cat); - var kitten = await _embedder.GetEmbeddings("The kitten is kawaii"); + var kitten = await embedder.GetEmbeddings("The kitten is kawaii"); Assert.DoesNotContain(float.NaN, kitten); - var spoon = await _embedder.GetEmbeddings("The spoon is not real"); + var spoon = await embedder.GetEmbeddings("The spoon is not real"); Assert.DoesNotContain(float.NaN, spoon); _testOutputHelper.WriteLine($"Cat = [{string.Join(",", cat.AsMemory().Slice(0, 7).ToArray())}...]"); @@ -61,4 +52,16 @@ public sealed class LLamaEmbedderTests Assert.True(close < far); } + + [Fact] + public async Task EmbedCompareEmbeddingModel() + { + await CompareEmbeddings(Constants.EmbeddingModelPath); + } + + [Fact] + public async Task EmbedCompareGenerateModel() + { + await CompareEmbeddings(Constants.GenerativeModelPath); + } } \ No newline at end of file diff --git a/LLama.Unittest/LLavaWeightsTests.cs b/LLama.Unittest/LLavaWeightsTests.cs index e5df3073..30d41fd9 100644 --- a/LLama.Unittest/LLavaWeightsTests.cs +++ b/LLama.Unittest/LLavaWeightsTests.cs @@ -17,7 +17,8 @@ namespace LLama.Unittest var @params = new ModelParams(Constants.GenerativeModelPath) { // Llava models requires big context - ContextSize = 4096 + ContextSize = 4096, + GpuLayerCount = Constants.CIGpuLayerCount, }; _llamaWeights = LLamaWeights.LoadFromFile(@params); _lLavaWeights = LLavaWeights.LoadFromFile(Constants.LLavaMmpPath); @@ -32,7 +33,7 @@ namespace LLama.Unittest _lLavaWeights.Dispose(); } - [Fact(Skip = "Very very slow in CI")] + [Fact,Trait("Category", "NoCI")] public void EmbedImageAsFileName() { int n_past = 0; @@ -40,7 +41,7 @@ namespace LLama.Unittest Assert.True( _lLavaWeights.EvalImageEmbed( _context, emb, ref n_past ) ); } - [Fact(Skip = "Very very slow in CI")] + [Fact,Trait("Category", "NoCI")] public void EmbedImageAsBinary() { int n_past = 0; diff --git a/LLama.Unittest/MemoryDisposalTests.cs b/LLama.Unittest/MemoryDisposalTests.cs index e29ad46d..60cd75fd 100644 --- a/LLama.Unittest/MemoryDisposalTests.cs +++ b/LLama.Unittest/MemoryDisposalTests.cs @@ -9,7 +9,8 @@ public class MemoryDisposalTests { var @params = new ModelParams(Constants.GenerativeModelPath) { - ContextSize = 2048 + ContextSize = 2048, + GpuLayerCount = 0, }; var model = LLamaWeights.LoadFromFile(@params); @@ -23,7 +24,8 @@ public class MemoryDisposalTests { var @params = new ModelParams(Constants.GenerativeModelPath) { - ContextSize = 2048 + ContextSize = 2048, + GpuLayerCount = Constants.CIGpuLayerCount, }; var model = LLamaWeights.LoadFromFile(@params); diff --git a/LLama.Unittest/StatelessExecutorTest.cs b/LLama.Unittest/StatelessExecutorTest.cs index 3ca8a76e..18f3c25d 100644 --- a/LLama.Unittest/StatelessExecutorTest.cs +++ b/LLama.Unittest/StatelessExecutorTest.cs @@ -20,6 +20,7 @@ namespace LLama.Unittest ContextSize = 60, Seed = 1754, BatchSize = 2, + GpuLayerCount = Constants.CIGpuLayerCount, }; _weights = LLamaWeights.LoadFromFile(_params); } diff --git a/LLama.Unittest/TokenTests.cs b/LLama.Unittest/TokenTests.cs index c11e3ae9..03e3927f 100644 --- a/LLama.Unittest/TokenTests.cs +++ b/LLama.Unittest/TokenTests.cs @@ -14,7 +14,8 @@ public sealed class TokenTests { _params = new ModelParams(Constants.GenerativeModelPath) { - ContextSize = 2048 + ContextSize = 2048, + GpuLayerCount = Constants.CIGpuLayerCount, }; _model = LLamaWeights.LoadFromFile(_params); } diff --git a/LLama/LLamaEmbedder.cs b/LLama/LLamaEmbedder.cs index 13a3e1c2..f60f3cd5 100644 --- a/LLama/LLamaEmbedder.cs +++ b/LLama/LLamaEmbedder.cs @@ -97,11 +97,18 @@ namespace LLama private float[] GetEmbeddingsArray() { - var embeddings = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero); - if (embeddings.Length == 0) - return Array.Empty(); + unsafe + { + var embeddings = NativeApi.llama_get_embeddings(Context.NativeHandle); + + if (embeddings == null) + embeddings = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero); - return embeddings.ToArray(); + if (embeddings == null) + return Array.Empty(); + + return new Span(embeddings, Context.EmbeddingSize).ToArray(); + } } private static void Normalize(Span embeddings) @@ -112,6 +119,7 @@ namespace LLama lengthSqr += value * value; var length = (float)Math.Sqrt(lengthSqr); + // Do not divide by length if it is zero if (length <= float.Epsilon) return; diff --git a/LLama/LLamaInstructExecutor.cs b/LLama/LLamaInstructExecutor.cs index c3a9a420..917dc5eb 100644 --- a/LLama/LLamaInstructExecutor.cs +++ b/LLama/LLamaInstructExecutor.cs @@ -38,8 +38,8 @@ namespace LLama ILogger? logger = null) : base(context, logger) { - _inp_pfx = Context.Tokenize(instructionPrefix, true); - _inp_sfx = Context.Tokenize(instructionSuffix, false); + _inp_pfx = Context.Tokenize(instructionPrefix, true, true); + _inp_sfx = Context.Tokenize(instructionSuffix, false, true); _instructionPrefix = instructionPrefix; } @@ -124,7 +124,7 @@ namespace LLama if (_is_prompt_run) { // When running the first input (prompt) in inteactive mode, we should specially process it. - _embed_inps = Context.Tokenize(text, true).ToList(); + _embed_inps = Context.Tokenize(text, true, true).ToList(); } else { @@ -135,7 +135,7 @@ namespace LLama _consumedTokensCount = _embed_inps.Count; _embed_inps.AddRange(_inp_pfx); - var line_inp = Context.Tokenize(text, false); + var line_inp = Context.Tokenize(text, false, true); _embed_inps.AddRange(line_inp); _embed_inps.AddRange(_inp_sfx); diff --git a/LLama/LLamaInteractExecutor.cs b/LLama/LLamaInteractExecutor.cs index 5acf4bd3..9aaa1ca2 100644 --- a/LLama/LLamaInteractExecutor.cs +++ b/LLama/LLamaInteractExecutor.cs @@ -119,7 +119,7 @@ namespace LLama // When running the first input (prompt) in interactive mode, we should specially process it. if (!this.IsMultiModal) { - _embed_inps = Context.Tokenize(text, true).ToList(); + _embed_inps = Context.Tokenize(text, true, true).ToList(); } else { @@ -135,7 +135,7 @@ namespace LLama if (!this.IsMultiModal) { - var line_inp = Context.Tokenize(text, false); + var line_inp = Context.Tokenize(text, false, true); _embed_inps.AddRange(line_inp); args.RemainedTokens -= line_inp.Length; } @@ -165,11 +165,11 @@ namespace LLama int imageIndex = text.IndexOf(""); // Tokenize segment 1 (before tag) string preImagePrompt = text.Substring(0, imageIndex); - var segment1 = Context.Tokenize(preImagePrompt, addBos ); + var segment1 = Context.Tokenize(preImagePrompt, addBos, true); // Remember the position to add the image embeddings _EmbedImagePosition = segment1.Length; string postImagePrompt = text.Substring(imageIndex + 7); - var segment2 = Context.Tokenize(postImagePrompt, false); + var segment2 = Context.Tokenize(postImagePrompt, false, true); _embed_inps.AddRange(segment1); _embed_inps.AddRange(segment2); usedTokens += (segment1.Length + segment2.Length); @@ -178,11 +178,11 @@ namespace LLama { if (addBos) { - _embed_inps = Context.Tokenize(text, true).ToList(); + _embed_inps = Context.Tokenize(text, true, true).ToList(); } else { - var line_inp = Context.Tokenize(text, false); + var line_inp = Context.Tokenize(text, false, true); _embed_inps.AddRange(line_inp); args.RemainedTokens -= line_inp.Length; } diff --git a/LLama/LLamaStatelessExecutor.cs b/LLama/LLamaStatelessExecutor.cs index 487fe293..a3c52a02 100644 --- a/LLama/LLamaStatelessExecutor.cs +++ b/LLama/LLamaStatelessExecutor.cs @@ -90,7 +90,7 @@ namespace LLama lastTokens.Add(0); // Tokenize the prompt - var tokens = Context.Tokenize(prompt).ToList(); + var tokens = Context.Tokenize(prompt, special: true).ToList(); lastTokens.AddRange(tokens); // Evaluate the prompt, in chunks smaller than the max batch size diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index 6f8d142e..ed456151 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -137,41 +137,17 @@ namespace LLama.Native /// Get the embeddings for the a specific sequence. /// Equivalent to: llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd /// - /// - public static Span llama_get_embeddings_seq(SafeLLamaContextHandle ctx, LLamaSeqId id) - { - unsafe - { - var ptr = llama_get_embeddings_seq_native(ctx, id); - if (ptr == null) - return Array.Empty(); - - return new Span(ptr, ctx.EmbeddingSize); - } - - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_get_embeddings_seq")] - static extern unsafe float* llama_get_embeddings_seq_native(SafeLLamaContextHandle ctx, LLamaSeqId id); - } + /// A pointer to the first float in an embedding, length = ctx.EmbeddingSize + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern unsafe float* llama_get_embeddings_seq(SafeLLamaContextHandle ctx, LLamaSeqId id); /// /// Get the embeddings for the ith sequence. /// Equivalent to: llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd /// - /// - public static Span llama_get_embeddings_ith(SafeLLamaContextHandle ctx, int i) - { - unsafe - { - var ptr = llama_get_embeddings_ith_native(ctx, i); - if (ptr == null) - return Array.Empty(); - - return new Span(ptr, ctx.EmbeddingSize); - } - - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_get_embeddings_ith")] - static extern unsafe float* llama_get_embeddings_ith_native(SafeLLamaContextHandle ctx, int i); - } + /// A pointer to the first float in an embedding, length = ctx.EmbeddingSize + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern unsafe float* llama_get_embeddings_ith(SafeLLamaContextHandle ctx, int i); /// /// Get all output token embeddings. @@ -182,20 +158,8 @@ namespace LLama.Native /// /// /// - public static Span llama_get_embeddings(SafeLLamaContextHandle ctx) - { - unsafe - { - var ptr = llama_get_embeddings_native(ctx); - if (ptr == null) - return Array.Empty(); - - return new Span(ptr, ctx.EmbeddingSize); - } - - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_get_embeddings")] - static extern unsafe float* llama_get_embeddings_native(SafeLLamaContextHandle ctx); - } + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern unsafe float* llama_get_embeddings(SafeLLamaContextHandle ctx); /// /// Apply chat template. Inspired by hf apply_chat_template() on python.