| @@ -15,11 +15,14 @@ jobs: | |||
| strategy: | |||
| fail-fast: false | |||
| matrix: | |||
| build: [linux-release, windows-release] | |||
| build: [linux-release, windows-release, osx-release] | |||
| include: | |||
| - build: linux-release | |||
| os: ubuntu-latest | |||
| config: release | |||
| - build: osx-release | |||
| os: macos-14 # https://github.blog/changelog/2024-01-30-github-actions-introducing-the-new-m1-macos-runner-available-to-open-source/ | |||
| config: release | |||
| - build: windows-release | |||
| os: windows-2019 | |||
| config: release | |||
| @@ -27,8 +30,7 @@ jobs: | |||
| - uses: actions/checkout@v4 | |||
| - uses: actions/setup-dotnet@v4 | |||
| with: | |||
| dotnet-version: | | |||
| 7.0.x | |||
| dotnet-version: | | |||
| 8.0.x | |||
| - name: Cache Packages | |||
| uses: actions/cache@v4 | |||
| @@ -43,7 +45,7 @@ jobs: | |||
| - name: Build | |||
| run: dotnet build LLamaSharp.sln -c ${{ matrix.config }} --no-restore | |||
| - name: Test | |||
| run: dotnet test LLamaSharp.sln -c ${{ matrix.config }} -l "console;verbosity=detailed" --diag:logs/log.txt | |||
| run: dotnet test LLamaSharp.sln -c ${{ matrix.config }} -l "console;verbosity=detailed" --diag:logs/log.txt --filter Category!=NoCI | |||
| - name: Upload artifacts | |||
| if: always() | |||
| uses: actions/upload-artifact@v3 | |||
| @@ -104,6 +104,6 @@ namespace LLamaSharp.KernelMemory | |||
| } | |||
| /// <inheritdoc/> | |||
| public int CountTokens(string text) => _embedder.Context.Tokenize(text).Length; | |||
| public int CountTokens(string text) => _embedder.Context.Tokenize(text, special: true).Length; | |||
| } | |||
| } | |||
| @@ -1,13 +1,7 @@ | |||
| using LLama; | |||
| using LLama.Abstractions; | |||
| using LLama.Common; | |||
| using LLama.Native; | |||
| using Microsoft.KernelMemory.AI; | |||
| using System; | |||
| using System.Collections.Generic; | |||
| using System.Linq; | |||
| using System.Text; | |||
| using System.Threading.Tasks; | |||
| namespace LLamaSharp.KernelMemory | |||
| { | |||
| @@ -111,6 +105,6 @@ namespace LLamaSharp.KernelMemory | |||
| } | |||
| /// <inheritdoc/> | |||
| public int CountTokens(string text) => _context.Tokenize(text).Length; | |||
| public int CountTokens(string text) => _context.Tokenize(text, special: true).Length; | |||
| } | |||
| } | |||
| @@ -17,7 +17,8 @@ namespace LLama.Unittest | |||
| _testOutputHelper = testOutputHelper; | |||
| _params = new ModelParams(Constants.GenerativeModelPath) | |||
| { | |||
| ContextSize = 2048 | |||
| ContextSize = 2048, | |||
| GpuLayerCount = Constants.CIGpuLayerCount | |||
| }; | |||
| _model = LLamaWeights.LoadFromFile(_params); | |||
| } | |||
| @@ -17,7 +17,8 @@ public sealed class BeamTests | |||
| _testOutputHelper = testOutputHelper; | |||
| _params = new ModelParams(Constants.GenerativeModelPath) | |||
| { | |||
| ContextSize = 2048 | |||
| ContextSize = 2048, | |||
| GpuLayerCount = Constants.CIGpuLayerCount, | |||
| }; | |||
| _model = LLamaWeights.LoadFromFile(_params); | |||
| } | |||
| @@ -27,7 +28,6 @@ public sealed class BeamTests | |||
| _model.Dispose(); | |||
| } | |||
| //[Fact(Skip = "Very very slow in CI")] | |||
| [Fact] | |||
| public void BasicBeam() | |||
| { | |||
| @@ -1,4 +1,6 @@ | |||
| namespace LLama.Unittest | |||
| using System.Runtime.InteropServices; | |||
| namespace LLama.Unittest | |||
| { | |||
| internal static class Constants | |||
| { | |||
| @@ -8,5 +10,25 @@ | |||
| public static readonly string LLavaModelPath = "Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf"; | |||
| public static readonly string LLavaMmpPath = "Models/mmproj-model-f16.gguf"; | |||
| public static readonly string LLavaImage = "Models/extreme-ironing-taxi-610x427.jpg"; | |||
| /// <summary> | |||
| /// GPU layer count used by the unit tests. | |||
| /// </summary> | |||
| /// <value> | |||
| /// 0 for a non-DEBUG build running on macOS/OSX (to disable METAL on GitHub CI); 20 in every other case. | |||
| /// </value> | |||
| public static int CIGpuLayerCount | |||
| { | |||
| get | |||
| { | |||
| #if !DEBUG | |||
| // Only release builds special-case macOS; DEBUG builds always fall through to 20. | |||
| if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) | |||
| return 0; | |||
| #endif | |||
| return 20; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -16,6 +16,7 @@ namespace LLama.Unittest | |||
| { | |||
| ContextSize = 2048, | |||
| Seed = 92, | |||
| GpuLayerCount = Constants.CIGpuLayerCount, | |||
| }; | |||
| _model = LLamaWeights.LoadFromFile(_params); | |||
| } | |||
| @@ -14,6 +14,7 @@ namespace LLama.Unittest | |||
| var @params = new ModelParams(Constants.GenerativeModelPath) | |||
| { | |||
| ContextSize = 768, | |||
| GpuLayerCount = Constants.CIGpuLayerCount, | |||
| }; | |||
| _weights = LLamaWeights.LoadFromFile(@params); | |||
| _context = _weights.CreateContext(@params); | |||
| @@ -1,32 +1,15 @@ | |||
| using LLama.Common; | |||
| using LLama.Native; | |||
| using LLama.Common; | |||
| using Xunit.Abstractions; | |||
| using Xunit.Sdk; | |||
| namespace LLama.Unittest; | |||
| public sealed class LLamaEmbedderTests | |||
| : IDisposable | |||
| { | |||
| private readonly ITestOutputHelper _testOutputHelper; | |||
| private readonly LLamaEmbedder _embedder; | |||
| public LLamaEmbedderTests(ITestOutputHelper testOutputHelper) | |||
| { | |||
| _testOutputHelper = testOutputHelper; | |||
| var @params = new ModelParams(Constants.EmbeddingModelPath) | |||
| { | |||
| ContextSize = 4096, | |||
| Threads = 5, | |||
| Embeddings = true, | |||
| }; | |||
| using var weights = LLamaWeights.LoadFromFile(@params); | |||
| _embedder = new(weights, @params); | |||
| } | |||
| public void Dispose() | |||
| { | |||
| _embedder.Dispose(); | |||
| } | |||
| private static float Dot(float[] a, float[] b) | |||
| @@ -35,17 +18,25 @@ public sealed class LLamaEmbedderTests | |||
| return a.Zip(b, (x, y) => x * y).Sum(); | |||
| } | |||
| [Fact] | |||
| public async Task EmbedCompare() | |||
| private async Task CompareEmbeddings(string modelPath) | |||
| { | |||
| var cat = await _embedder.GetEmbeddings("The cat is cute"); | |||
| var @params = new ModelParams(modelPath) | |||
| { | |||
| ContextSize = 8, | |||
| Threads = 4, | |||
| Embeddings = true, | |||
| GpuLayerCount = Constants.CIGpuLayerCount, | |||
| }; | |||
| using var weights = LLamaWeights.LoadFromFile(@params); | |||
| using var embedder = new LLamaEmbedder(weights, @params); | |||
| var cat = await embedder.GetEmbeddings("The cat is cute"); | |||
| Assert.DoesNotContain(float.NaN, cat); | |||
| var kitten = await _embedder.GetEmbeddings("The kitten is kawaii"); | |||
| var kitten = await embedder.GetEmbeddings("The kitten is kawaii"); | |||
| Assert.DoesNotContain(float.NaN, kitten); | |||
| var spoon = await _embedder.GetEmbeddings("The spoon is not real"); | |||
| var spoon = await embedder.GetEmbeddings("The spoon is not real"); | |||
| Assert.DoesNotContain(float.NaN, spoon); | |||
| _testOutputHelper.WriteLine($"Cat = [{string.Join(",", cat.AsMemory().Slice(0, 7).ToArray())}...]"); | |||
| @@ -61,4 +52,16 @@ public sealed class LLamaEmbedderTests | |||
| Assert.True(close < far); | |||
| } | |||
| /// <summary> | |||
| /// Runs the shared <c>CompareEmbeddings</c> check against the model at <c>Constants.EmbeddingModelPath</c>. | |||
| /// </summary> | |||
| [Fact] | |||
| public async Task EmbedCompareEmbeddingModel() | |||
| { | |||
| var embeddingModelPath = Constants.EmbeddingModelPath; | |||
| await CompareEmbeddings(embeddingModelPath); | |||
| } | |||
| /// <summary> | |||
| /// Runs the shared <c>CompareEmbeddings</c> check against the model at <c>Constants.GenerativeModelPath</c>. | |||
| /// </summary> | |||
| [Fact] | |||
| public async Task EmbedCompareGenerateModel() | |||
| { | |||
| var generativeModelPath = Constants.GenerativeModelPath; | |||
| await CompareEmbeddings(generativeModelPath); | |||
| } | |||
| } | |||
| @@ -17,7 +17,8 @@ namespace LLama.Unittest | |||
| var @params = new ModelParams(Constants.GenerativeModelPath) | |||
| { | |||
| // LLava models require a big context | |||
| ContextSize = 4096 | |||
| ContextSize = 4096, | |||
| GpuLayerCount = Constants.CIGpuLayerCount, | |||
| }; | |||
| _llamaWeights = LLamaWeights.LoadFromFile(@params); | |||
| _lLavaWeights = LLavaWeights.LoadFromFile(Constants.LLavaMmpPath); | |||
| @@ -32,7 +33,7 @@ namespace LLama.Unittest | |||
| _lLavaWeights.Dispose(); | |||
| } | |||
| [Fact(Skip = "Very very slow in CI")] | |||
| [Fact,Trait("Category", "NoCI")] | |||
| public void EmbedImageAsFileName() | |||
| { | |||
| int n_past = 0; | |||
| @@ -40,7 +41,7 @@ namespace LLama.Unittest | |||
| Assert.True( _lLavaWeights.EvalImageEmbed( _context, emb, ref n_past ) ); | |||
| } | |||
| [Fact(Skip = "Very very slow in CI")] | |||
| [Fact,Trait("Category", "NoCI")] | |||
| public void EmbedImageAsBinary() | |||
| { | |||
| int n_past = 0; | |||
| @@ -9,7 +9,8 @@ public class MemoryDisposalTests | |||
| { | |||
| var @params = new ModelParams(Constants.GenerativeModelPath) | |||
| { | |||
| ContextSize = 2048 | |||
| ContextSize = 2048, | |||
| GpuLayerCount = 0, | |||
| }; | |||
| var model = LLamaWeights.LoadFromFile(@params); | |||
| @@ -23,7 +24,8 @@ public class MemoryDisposalTests | |||
| { | |||
| var @params = new ModelParams(Constants.GenerativeModelPath) | |||
| { | |||
| ContextSize = 2048 | |||
| ContextSize = 2048, | |||
| GpuLayerCount = Constants.CIGpuLayerCount, | |||
| }; | |||
| var model = LLamaWeights.LoadFromFile(@params); | |||
| @@ -20,6 +20,7 @@ namespace LLama.Unittest | |||
| ContextSize = 60, | |||
| Seed = 1754, | |||
| BatchSize = 2, | |||
| GpuLayerCount = Constants.CIGpuLayerCount, | |||
| }; | |||
| _weights = LLamaWeights.LoadFromFile(_params); | |||
| } | |||
| @@ -14,7 +14,8 @@ public sealed class TokenTests | |||
| { | |||
| _params = new ModelParams(Constants.GenerativeModelPath) | |||
| { | |||
| ContextSize = 2048 | |||
| ContextSize = 2048, | |||
| GpuLayerCount = Constants.CIGpuLayerCount, | |||
| }; | |||
| _model = LLamaWeights.LoadFromFile(_params); | |||
| } | |||
| @@ -97,11 +97,18 @@ namespace LLama | |||
| private float[] GetEmbeddingsArray() | |||
| { | |||
| var embeddings = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero); | |||
| if (embeddings.Length == 0) | |||
| return Array.Empty<float>(); | |||
| unsafe | |||
| { | |||
| var embeddings = NativeApi.llama_get_embeddings(Context.NativeHandle); | |||
| if (embeddings == null) | |||
| embeddings = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero); | |||
| return embeddings.ToArray(); | |||
| if (embeddings == null) | |||
| return Array.Empty<float>(); | |||
| return new Span<float>(embeddings, Context.EmbeddingSize).ToArray(); | |||
| } | |||
| } | |||
| private static void Normalize(Span<float> embeddings) | |||
| @@ -112,6 +119,7 @@ namespace LLama | |||
| lengthSqr += value * value; | |||
| var length = (float)Math.Sqrt(lengthSqr); | |||
| // Do not divide by length if it is zero | |||
| if (length <= float.Epsilon) | |||
| return; | |||
| @@ -38,8 +38,8 @@ namespace LLama | |||
| ILogger? logger = null) | |||
| : base(context, logger) | |||
| { | |||
| _inp_pfx = Context.Tokenize(instructionPrefix, true); | |||
| _inp_sfx = Context.Tokenize(instructionSuffix, false); | |||
| _inp_pfx = Context.Tokenize(instructionPrefix, true, true); | |||
| _inp_sfx = Context.Tokenize(instructionSuffix, false, true); | |||
| _instructionPrefix = instructionPrefix; | |||
| } | |||
| @@ -124,7 +124,7 @@ namespace LLama | |||
| if (_is_prompt_run) | |||
| { | |||
| // When running the first input (prompt) in interactive mode, we should specially process it. | |||
| _embed_inps = Context.Tokenize(text, true).ToList(); | |||
| _embed_inps = Context.Tokenize(text, true, true).ToList(); | |||
| } | |||
| else | |||
| { | |||
| @@ -135,7 +135,7 @@ namespace LLama | |||
| _consumedTokensCount = _embed_inps.Count; | |||
| _embed_inps.AddRange(_inp_pfx); | |||
| var line_inp = Context.Tokenize(text, false); | |||
| var line_inp = Context.Tokenize(text, false, true); | |||
| _embed_inps.AddRange(line_inp); | |||
| _embed_inps.AddRange(_inp_sfx); | |||
| @@ -119,7 +119,7 @@ namespace LLama | |||
| // When running the first input (prompt) in interactive mode, we should specially process it. | |||
| if (!this.IsMultiModal) | |||
| { | |||
| _embed_inps = Context.Tokenize(text, true).ToList(); | |||
| _embed_inps = Context.Tokenize(text, true, true).ToList(); | |||
| } | |||
| else | |||
| { | |||
| @@ -135,7 +135,7 @@ namespace LLama | |||
| if (!this.IsMultiModal) | |||
| { | |||
| var line_inp = Context.Tokenize(text, false); | |||
| var line_inp = Context.Tokenize(text, false, true); | |||
| _embed_inps.AddRange(line_inp); | |||
| args.RemainedTokens -= line_inp.Length; | |||
| } | |||
| @@ -165,11 +165,11 @@ namespace LLama | |||
| int imageIndex = text.IndexOf("<image>"); | |||
| // Tokenize segment 1 (before <image> tag) | |||
| string preImagePrompt = text.Substring(0, imageIndex); | |||
| var segment1 = Context.Tokenize(preImagePrompt, addBos ); | |||
| var segment1 = Context.Tokenize(preImagePrompt, addBos, true); | |||
| // Remember the position to add the image embeddings | |||
| _EmbedImagePosition = segment1.Length; | |||
| string postImagePrompt = text.Substring(imageIndex + 7); | |||
| var segment2 = Context.Tokenize(postImagePrompt, false); | |||
| var segment2 = Context.Tokenize(postImagePrompt, false, true); | |||
| _embed_inps.AddRange(segment1); | |||
| _embed_inps.AddRange(segment2); | |||
| usedTokens += (segment1.Length + segment2.Length); | |||
| @@ -178,11 +178,11 @@ namespace LLama | |||
| { | |||
| if (addBos) | |||
| { | |||
| _embed_inps = Context.Tokenize(text, true).ToList(); | |||
| _embed_inps = Context.Tokenize(text, true, true).ToList(); | |||
| } | |||
| else | |||
| { | |||
| var line_inp = Context.Tokenize(text, false); | |||
| var line_inp = Context.Tokenize(text, false, true); | |||
| _embed_inps.AddRange(line_inp); | |||
| args.RemainedTokens -= line_inp.Length; | |||
| } | |||
| @@ -90,7 +90,7 @@ namespace LLama | |||
| lastTokens.Add(0); | |||
| // Tokenize the prompt | |||
| var tokens = Context.Tokenize(prompt).ToList(); | |||
| var tokens = Context.Tokenize(prompt, special: true).ToList(); | |||
| lastTokens.AddRange(tokens); | |||
| // Evaluate the prompt, in chunks smaller than the max batch size | |||
| @@ -137,41 +137,17 @@ namespace LLama.Native | |||
| /// Get the embeddings for a specific sequence. | |||
| /// Equivalent to: llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd | |||
| /// </summary> | |||
| /// <returns></returns> | |||
| public static Span<float> llama_get_embeddings_seq(SafeLLamaContextHandle ctx, LLamaSeqId id) | |||
| { | |||
| unsafe | |||
| { | |||
| var ptr = llama_get_embeddings_seq_native(ctx, id); | |||
| if (ptr == null) | |||
| return Array.Empty<float>(); | |||
| return new Span<float>(ptr, ctx.EmbeddingSize); | |||
| } | |||
| [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_get_embeddings_seq")] | |||
| static extern unsafe float* llama_get_embeddings_seq_native(SafeLLamaContextHandle ctx, LLamaSeqId id); | |||
| } | |||
| /// <returns>A pointer to the first float in an embedding, length = ctx.EmbeddingSize</returns> | |||
| [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] | |||
| public static extern unsafe float* llama_get_embeddings_seq(SafeLLamaContextHandle ctx, LLamaSeqId id); | |||
| /// <summary> | |||
| /// Get the embeddings for the ith sequence. | |||
| /// Equivalent to: llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd | |||
| /// </summary> | |||
| /// <returns></returns> | |||
| public static Span<float> llama_get_embeddings_ith(SafeLLamaContextHandle ctx, int i) | |||
| { | |||
| unsafe | |||
| { | |||
| var ptr = llama_get_embeddings_ith_native(ctx, i); | |||
| if (ptr == null) | |||
| return Array.Empty<float>(); | |||
| return new Span<float>(ptr, ctx.EmbeddingSize); | |||
| } | |||
| [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_get_embeddings_ith")] | |||
| static extern unsafe float* llama_get_embeddings_ith_native(SafeLLamaContextHandle ctx, int i); | |||
| } | |||
| /// <returns>A pointer to the first float in an embedding, length = ctx.EmbeddingSize</returns> | |||
| [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] | |||
| public static extern unsafe float* llama_get_embeddings_ith(SafeLLamaContextHandle ctx, int i); | |||
| /// <summary> | |||
| /// Get all output token embeddings. | |||
| @@ -182,20 +158,8 @@ namespace LLama.Native | |||
| /// </summary> | |||
| /// <param name="ctx"></param> | |||
| /// <returns></returns> | |||
| public static Span<float> llama_get_embeddings(SafeLLamaContextHandle ctx) | |||
| { | |||
| unsafe | |||
| { | |||
| var ptr = llama_get_embeddings_native(ctx); | |||
| if (ptr == null) | |||
| return Array.Empty<float>(); | |||
| return new Span<float>(ptr, ctx.EmbeddingSize); | |||
| } | |||
| [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_get_embeddings")] | |||
| static extern unsafe float* llama_get_embeddings_native(SafeLLamaContextHandle ctx); | |||
| } | |||
| [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] | |||
| public static extern unsafe float* llama_get_embeddings(SafeLLamaContextHandle ctx); | |||
| /// <summary> | |||
| /// Apply chat template. Inspired by hf apply_chat_template() on python. | |||