Merge branch 'SciSharp:master' into feature/interactive-sk-chatcompletion

2 years ago · 05937de5dc
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -15,11 +15,14 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
        build: [linux-release, windows-release]
        build: [linux-release, windows-release, osx-release]
        include:
          - build: linux-release
            os: ubuntu-latest
            config: release
          - build: osx-release  
            os: macos-14 # https://github.blog/changelog/2024-01-30-github-actions-introducing-the-new-m1-macos-runner-available-to-open-source/
            config: release            
          - build: windows-release
            os: windows-2019
            config: release
@@ -27,8 +30,7 @@ jobs:
    - uses: actions/checkout@v4
    - uses: actions/setup-dotnet@v4
      with:
        dotnet-version: | 
          7.0.x
        dotnet-version: |
          8.0.x
    - name: Cache Packages
      uses: actions/cache@v4
@@ -43,7 +45,7 @@ jobs:
    - name: Build
      run: dotnet build LLamaSharp.sln -c ${{ matrix.config }} --no-restore
    - name: Test
      run: dotnet test LLamaSharp.sln -c ${{ matrix.config }} -l "console;verbosity=detailed" --diag:logs/log.txt
      run: dotnet test LLamaSharp.sln -c ${{ matrix.config }} -l "console;verbosity=detailed" --diag:logs/log.txt --filter Category!=NoCI
    - name: Upload artifacts
      if: always()
      uses: actions/upload-artifact@v3
--- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
+++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -104,6 +104,6 @@ namespace LLamaSharp.KernelMemory
        }

        /// <inheritdoc/>
        public int CountTokens(string text) => _embedder.Context.Tokenize(text).Length;
        public int CountTokens(string text) => _embedder.Context.Tokenize(text, special: true).Length;
    }
 }
--- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs
+++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -1,13 +1,7 @@
 using LLama;
 using LLama.Abstractions;
 using LLama.Common;
 using LLama.Native;
 using Microsoft.KernelMemory.AI;
 using System;
 using System.Collections.Generic;
 using System.Linq;
 using System.Text;
 using System.Threading.Tasks;

 namespace LLamaSharp.KernelMemory
 {
@@ -111,6 +105,6 @@ namespace LLamaSharp.KernelMemory
        }

        /// <inheritdoc/>
        public int CountTokens(string text) => _context.Tokenize(text).Length;
        public int CountTokens(string text) => _context.Tokenize(text, special: true).Length;
    }
 }
--- a/LLama.Unittest/BasicTest.cs
+++ b/LLama.Unittest/BasicTest.cs
@@ -17,7 +17,8 @@ namespace LLama.Unittest
            _testOutputHelper = testOutputHelper;
            _params = new ModelParams(Constants.GenerativeModelPath)
            {
                ContextSize = 2048
                ContextSize = 2048,
                GpuLayerCount = Constants.CIGpuLayerCount
            };
            _model = LLamaWeights.LoadFromFile(_params);
        }
--- a/LLama.Unittest/BeamTests.cs
+++ b/LLama.Unittest/BeamTests.cs
@@ -17,7 +17,8 @@ public sealed class BeamTests
        _testOutputHelper = testOutputHelper;
        _params = new ModelParams(Constants.GenerativeModelPath)
        {
            ContextSize = 2048
            ContextSize = 2048,
            GpuLayerCount = Constants.CIGpuLayerCount,
        };
        _model = LLamaWeights.LoadFromFile(_params);
    }
@@ -27,7 +28,6 @@ public sealed class BeamTests
        _model.Dispose();
    }

    //[Fact(Skip = "Very very slow in CI")]
    [Fact]
    public void BasicBeam()
    {
--- a/LLama.Unittest/Constants.cs
+++ b/LLama.Unittest/Constants.cs
@@ -1,4 +1,6 @@
 namespace LLama.Unittest
 using System.Runtime.InteropServices;

 namespace LLama.Unittest
 {
    internal static class Constants
    {
@@ -8,5 +10,25 @@
        public static readonly string LLavaModelPath = "Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf";
        public static readonly string LLavaMmpPath = "Models/mmproj-model-f16.gguf";
        public static readonly string LLavaImage = "Models/extreme-ironing-taxi-610x427.jpg";

        /// <summary>
        /// GPU layer count used by the unit tests.
        /// </summary>
        /// <value>
        /// 0 on MacOS/OSX when compiled without the DEBUG symbol (to disable METAL on github CI);
        /// 20 in every other case.
        /// </value>
        public static int CIGpuLayerCount
        {
            get
            {
                // Every platform other than MacOS/OSX uses the default of 20.
                if (!RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
                {
                    return 20;
                }

                // From here on we are running on MacOS/OSX; the result depends on the build configuration.
                #if DEBUG
                return 20;
                #else
                return 0;
                #endif
            }
        }
    }
 }
--- a/LLama.Unittest/GrammarTest.cs
+++ b/LLama.Unittest/GrammarTest.cs
@@ -16,6 +16,7 @@ namespace LLama.Unittest
            {
                ContextSize = 2048,
                Seed = 92,
                GpuLayerCount = Constants.CIGpuLayerCount,                
            };
            _model = LLamaWeights.LoadFromFile(_params);
        }
--- a/LLama.Unittest/LLamaContextTests.cs
+++ b/LLama.Unittest/LLamaContextTests.cs
@@ -14,6 +14,7 @@ namespace LLama.Unittest
            var @params = new ModelParams(Constants.GenerativeModelPath)
            {
                ContextSize = 768,
                GpuLayerCount = Constants.CIGpuLayerCount,
            };
            _weights = LLamaWeights.LoadFromFile(@params);
            _context = _weights.CreateContext(@params);
--- a/LLama.Unittest/LLamaEmbedderTests.cs
+++ b/LLama.Unittest/LLamaEmbedderTests.cs
@@ -1,32 +1,15 @@
 using LLama.Common;
 using LLama.Native;
 using LLama.Common;
 using Xunit.Abstractions;
 using Xunit.Sdk;

 namespace LLama.Unittest;

 public sealed class LLamaEmbedderTests
    : IDisposable
 {
    private readonly ITestOutputHelper _testOutputHelper;
    private readonly LLamaEmbedder _embedder;

    public LLamaEmbedderTests(ITestOutputHelper testOutputHelper)
    {
        _testOutputHelper = testOutputHelper;
        var @params = new ModelParams(Constants.EmbeddingModelPath)
        {
            ContextSize = 4096,
            Threads = 5,
            Embeddings = true,
        };
        using var weights = LLamaWeights.LoadFromFile(@params);
        _embedder = new(weights, @params);
    }

    public void Dispose()
    {
        _embedder.Dispose();
    }

    private static float Dot(float[] a, float[] b)
@@ -35,17 +18,25 @@ public sealed class LLamaEmbedderTests
        return a.Zip(b, (x, y) => x * y).Sum();
    }


    [Fact]
    public async Task EmbedCompare()
    private async Task CompareEmbeddings(string modelPath)
    {
        var cat = await _embedder.GetEmbeddings("The cat is cute");
        var @params = new ModelParams(modelPath)
        {
            ContextSize = 8,
            Threads = 4,
            Embeddings = true,
            GpuLayerCount = Constants.CIGpuLayerCount,
        };
        using var weights = LLamaWeights.LoadFromFile(@params);
        using var embedder = new LLamaEmbedder(weights, @params);

        var cat = await embedder.GetEmbeddings("The cat is cute");
        Assert.DoesNotContain(float.NaN, cat);

        var kitten = await _embedder.GetEmbeddings("The kitten is kawaii");
        var kitten = await embedder.GetEmbeddings("The kitten is kawaii");
        Assert.DoesNotContain(float.NaN, kitten);

        var spoon = await _embedder.GetEmbeddings("The spoon is not real");
        var spoon = await embedder.GetEmbeddings("The spoon is not real");
        Assert.DoesNotContain(float.NaN, spoon);

        _testOutputHelper.WriteLine($"Cat    = [{string.Join(",", cat.AsMemory().Slice(0, 7).ToArray())}...]");
@@ -61,4 +52,16 @@ public sealed class LLamaEmbedderTests

        Assert.True(close < far);
    }

    /// <summary>
    /// Runs the shared embedding comparison using the model at <c>Constants.EmbeddingModelPath</c>.
    /// </summary>
    [Fact]
    public async Task EmbedCompareEmbeddingModel() => await CompareEmbeddings(Constants.EmbeddingModelPath);

    /// <summary>
    /// Runs the shared embedding comparison using the model at <c>Constants.GenerativeModelPath</c>.
    /// </summary>
    [Fact]
    public async Task EmbedCompareGenerateModel() => await CompareEmbeddings(Constants.GenerativeModelPath);
 }
--- a/LLama.Unittest/LLavaWeightsTests.cs
+++ b/LLama.Unittest/LLavaWeightsTests.cs
@@ -17,7 +17,8 @@ namespace LLama.Unittest
            var @params = new ModelParams(Constants.GenerativeModelPath)
            {
                // LLaVA models require a large context
                ContextSize = 4096
                ContextSize = 4096,
                GpuLayerCount = Constants.CIGpuLayerCount,                
            };
            _llamaWeights = LLamaWeights.LoadFromFile(@params);
            _lLavaWeights = LLavaWeights.LoadFromFile(Constants.LLavaMmpPath);
@@ -32,7 +33,7 @@ namespace LLama.Unittest
            _lLavaWeights.Dispose();
        }
      
        [Fact(Skip = "Very very slow in CI")]
        [Fact,Trait("Category", "NoCI")]
        public void EmbedImageAsFileName()
        {
            int n_past = 0;
@@ -40,7 +41,7 @@ namespace LLama.Unittest
            Assert.True( _lLavaWeights.EvalImageEmbed( _context, emb, ref n_past ) );
        }        
        
        [Fact(Skip = "Very very slow in CI")]
        [Fact,Trait("Category", "NoCI")]
        public void EmbedImageAsBinary()
        {
            int n_past = 0;
--- a/LLama.Unittest/MemoryDisposalTests.cs
+++ b/LLama.Unittest/MemoryDisposalTests.cs
@@ -9,7 +9,8 @@ public class MemoryDisposalTests
    {
        var @params = new ModelParams(Constants.GenerativeModelPath)
        {
            ContextSize = 2048
            ContextSize = 2048,
            GpuLayerCount = 0,
        };
        var model = LLamaWeights.LoadFromFile(@params);

@@ -23,7 +24,8 @@ public class MemoryDisposalTests
    {
        var @params = new ModelParams(Constants.GenerativeModelPath)
        {
            ContextSize = 2048
            ContextSize = 2048,
            GpuLayerCount = Constants.CIGpuLayerCount,            
        };
        var model = LLamaWeights.LoadFromFile(@params);

--- a/LLama.Unittest/StatelessExecutorTest.cs
+++ b/LLama.Unittest/StatelessExecutorTest.cs
@@ -20,6 +20,7 @@ namespace LLama.Unittest
                ContextSize = 60,
                Seed = 1754,
                BatchSize = 2,
                GpuLayerCount = Constants.CIGpuLayerCount,                
            };
            _weights = LLamaWeights.LoadFromFile(_params);
        }
--- a/LLama.Unittest/TokenTests.cs
+++ b/LLama.Unittest/TokenTests.cs
@@ -14,7 +14,8 @@ public sealed class TokenTests
    {
        _params = new ModelParams(Constants.GenerativeModelPath)
        {
            ContextSize = 2048
            ContextSize = 2048,
            GpuLayerCount = Constants.CIGpuLayerCount,
        };
        _model = LLamaWeights.LoadFromFile(_params);
    }
--- a/LLama/LLamaEmbedder.cs
+++ b/LLama/LLamaEmbedder.cs
@@ -97,11 +97,18 @@ namespace LLama

        private float[] GetEmbeddingsArray()
        {
            var embeddings = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero);
            if (embeddings.Length == 0)
                return Array.Empty<float>();
            unsafe
            {
                var embeddings = NativeApi.llama_get_embeddings(Context.NativeHandle);

                if (embeddings == null)
                    embeddings = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero);

            return embeddings.ToArray();
                if (embeddings == null)
                    return Array.Empty<float>();

                return new Span<float>(embeddings, Context.EmbeddingSize).ToArray();
            }
        }

        private static void Normalize(Span<float> embeddings)
@@ -112,6 +119,7 @@ namespace LLama
                lengthSqr += value * value;
            var length = (float)Math.Sqrt(lengthSqr);

            // Do not divide by length if it is zero
            if (length <= float.Epsilon)
                return;

--- a/LLama/LLamaInstructExecutor.cs
+++ b/LLama/LLamaInstructExecutor.cs
@@ -38,8 +38,8 @@ namespace LLama
                                ILogger? logger = null)
            : base(context, logger)
        {
            _inp_pfx = Context.Tokenize(instructionPrefix, true);
            _inp_sfx = Context.Tokenize(instructionSuffix, false);
            _inp_pfx = Context.Tokenize(instructionPrefix, true, true);
            _inp_sfx = Context.Tokenize(instructionSuffix, false, true);
            _instructionPrefix = instructionPrefix;
        }

@@ -124,7 +124,7 @@ namespace LLama
            if (_is_prompt_run)
            {
                // When running the first input (prompt) in interactive mode, we should specially process it.
                _embed_inps = Context.Tokenize(text, true).ToList();
                _embed_inps = Context.Tokenize(text, true, true).ToList();
            }
            else
            {
@@ -135,7 +135,7 @@ namespace LLama
                _consumedTokensCount = _embed_inps.Count;
                _embed_inps.AddRange(_inp_pfx);

                var line_inp = Context.Tokenize(text, false);
                var line_inp = Context.Tokenize(text, false, true);
                _embed_inps.AddRange(line_inp);

                _embed_inps.AddRange(_inp_sfx);
--- a/LLama/LLamaInteractExecutor.cs
+++ b/LLama/LLamaInteractExecutor.cs
@@ -119,7 +119,7 @@ namespace LLama
                // When running the first input (prompt) in interactive mode, we should specially process it.
                if (!this.IsMultiModal)
                {
                    _embed_inps = Context.Tokenize(text, true).ToList();
                    _embed_inps = Context.Tokenize(text, true, true).ToList();
                }
                else
                {
@@ -135,7 +135,7 @@ namespace LLama

                if (!this.IsMultiModal)
                {
                    var line_inp = Context.Tokenize(text, false);
                    var line_inp = Context.Tokenize(text, false, true);
                    _embed_inps.AddRange(line_inp);
                    args.RemainedTokens -= line_inp.Length;
                }
@@ -165,11 +165,11 @@ namespace LLama
                int imageIndex = text.IndexOf("<image>");
                // Tokenize segment 1 (before <image> tag)
                string preImagePrompt = text.Substring(0, imageIndex);
                var segment1 = Context.Tokenize(preImagePrompt, addBos );
                var segment1 = Context.Tokenize(preImagePrompt, addBos, true);
                // Remember the position to add the image embeddings
                _EmbedImagePosition = segment1.Length;
                string postImagePrompt = text.Substring(imageIndex + 7);
                var segment2 = Context.Tokenize(postImagePrompt, false);
                var segment2 = Context.Tokenize(postImagePrompt, false, true);
                _embed_inps.AddRange(segment1);
                _embed_inps.AddRange(segment2);
                usedTokens += (segment1.Length + segment2.Length);
@@ -178,11 +178,11 @@ namespace LLama
            {
                if (addBos)
                {
                    _embed_inps = Context.Tokenize(text, true).ToList();
                    _embed_inps = Context.Tokenize(text, true, true).ToList();
                }
                else
                {
                    var line_inp = Context.Tokenize(text, false);
                    var line_inp = Context.Tokenize(text, false, true);
                    _embed_inps.AddRange(line_inp);
                    args.RemainedTokens -= line_inp.Length;                    
                }
--- a/LLama/LLamaStatelessExecutor.cs
+++ b/LLama/LLamaStatelessExecutor.cs
@@ -90,7 +90,7 @@ namespace LLama
                lastTokens.Add(0);

            // Tokenize the prompt
            var tokens = Context.Tokenize(prompt).ToList();
            var tokens = Context.Tokenize(prompt, special: true).ToList();
            lastTokens.AddRange(tokens);

            // Evaluate the prompt, in chunks smaller than the max batch size
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -137,41 +137,17 @@ namespace LLama.Native
        /// Get the embeddings for a specific sequence.
        /// Equivalent to: llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
        /// </summary>
        /// <returns></returns>
        public static Span<float> llama_get_embeddings_seq(SafeLLamaContextHandle ctx, LLamaSeqId id)
        {
            unsafe
            {
                var ptr = llama_get_embeddings_seq_native(ctx, id);
                if (ptr == null)
                    return Array.Empty<float>();

                return new Span<float>(ptr, ctx.EmbeddingSize);
            }

            [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_get_embeddings_seq")]
            static extern unsafe float* llama_get_embeddings_seq_native(SafeLLamaContextHandle ctx, LLamaSeqId id);
        }
        /// <returns>A pointer to the first float in an embedding, length = ctx.EmbeddingSize</returns>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern unsafe float* llama_get_embeddings_seq(SafeLLamaContextHandle ctx, LLamaSeqId id);

        /// <summary>
        /// Get the embeddings for the ith sequence.
        /// Equivalent to: llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
        /// </summary>
        /// <returns></returns>
        public static Span<float> llama_get_embeddings_ith(SafeLLamaContextHandle ctx, int i)
        {
            unsafe
            {
                var ptr = llama_get_embeddings_ith_native(ctx, i);
                if (ptr == null)
                    return Array.Empty<float>();

                return new Span<float>(ptr, ctx.EmbeddingSize);
            }

            [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_get_embeddings_ith")]
            static extern unsafe float* llama_get_embeddings_ith_native(SafeLLamaContextHandle ctx, int i);
        }
        /// <returns>A pointer to the first float in an embedding, length = ctx.EmbeddingSize</returns>
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern unsafe float* llama_get_embeddings_ith(SafeLLamaContextHandle ctx, int i);

        /// <summary>
        /// Get all output token embeddings.
@@ -182,20 +158,8 @@ namespace LLama.Native
        /// </summary>
        /// <param name="ctx"></param>
        /// <returns></returns>
        public static Span<float> llama_get_embeddings(SafeLLamaContextHandle ctx)
        {
            unsafe
            {
                var ptr = llama_get_embeddings_native(ctx);
                if (ptr == null)
                    return Array.Empty<float>();

                return new Span<float>(ptr, ctx.EmbeddingSize);
            }

            [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_get_embeddings")]
            static extern unsafe float* llama_get_embeddings_native(SafeLLamaContextHandle ctx);
        }
        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
        public static extern unsafe float* llama_get_embeddings(SafeLLamaContextHandle ctx);

        /// <summary>
        /// Apply chat template. Inspired by hf apply_chat_template() on python.