From 44c393db1ee7388d9812c768737e09e673b0f6ac Mon Sep 17 00:00:00 2001 From: SignalRT Date: Wed, 17 Apr 2024 21:44:30 +0200 Subject: [PATCH 01/15] Reintroduce MacOS on test builds Using the new M1 macos runner: https://github.blog/changelog/2024-01-30-github-actions-introducing-the-new-m1-macos-runner-available-to-open-source/ --- .github/workflows/main.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index cf0c784c..c584729e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -15,11 +15,14 @@ jobs: strategy: fail-fast: false matrix: - build: [linux-release, windows-release] + build: [linux-release, windows-release, osx-release] include: - build: linux-release os: ubuntu-latest config: release + - build: osx-release + os: macos-14 # https://github.blog/changelog/2024-01-30-github-actions-introducing-the-new-m1-macos-runner-available-to-open-source/ + config: release - build: windows-release os: windows-2019 config: release From 330e38553e2b85107be9b8b5ad2c7495282fe84e Mon Sep 17 00:00:00 2001 From: SignalRT Date: Wed, 17 Apr 2024 22:11:52 +0200 Subject: [PATCH 02/15] .NET 6 --- .github/workflows/main.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c584729e..3d942fa1 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -30,7 +30,8 @@ jobs: - uses: actions/checkout@v4 - uses: actions/setup-dotnet@v4 with: - dotnet-version: | + dotnet-version: | + 6.0.x 7.0.x 8.0.x - name: Cache Packages From 74bde89a6187963577d4563200339b717da6918f Mon Sep 17 00:00:00 2001 From: SignalRT Date: Wed, 17 Apr 2024 22:24:29 +0200 Subject: [PATCH 03/15] Test to disable metal on test --- LLama.Unittest/BasicTest.cs | 3 ++- LLama.Unittest/BeamTests.cs | 3 ++- LLama.Unittest/LLamaContextTests.cs | 1 + LLama.Unittest/LLamaEmbedderTests.cs | 1 + LLama.Unittest/LLavaWeightsTests.cs | 3 
++- LLama.Unittest/StatelessExecutorTest.cs | 1 + LLama.Unittest/TokenTests.cs | 3 ++- 7 files changed, 11 insertions(+), 4 deletions(-) diff --git a/LLama.Unittest/BasicTest.cs b/LLama.Unittest/BasicTest.cs index 7c897b78..64fa4d0f 100644 --- a/LLama.Unittest/BasicTest.cs +++ b/LLama.Unittest/BasicTest.cs @@ -17,7 +17,8 @@ namespace LLama.Unittest _testOutputHelper = testOutputHelper; _params = new ModelParams(Constants.GenerativeModelPath) { - ContextSize = 2048 + ContextSize = 2048, + GpuLayerCount = 0 }; _model = LLamaWeights.LoadFromFile(_params); } diff --git a/LLama.Unittest/BeamTests.cs b/LLama.Unittest/BeamTests.cs index f4aa01ab..86399c42 100644 --- a/LLama.Unittest/BeamTests.cs +++ b/LLama.Unittest/BeamTests.cs @@ -17,7 +17,8 @@ public sealed class BeamTests _testOutputHelper = testOutputHelper; _params = new ModelParams(Constants.GenerativeModelPath) { - ContextSize = 2048 + ContextSize = 2048, + GpuLayerCount = 0, }; _model = LLamaWeights.LoadFromFile(_params); } diff --git a/LLama.Unittest/LLamaContextTests.cs b/LLama.Unittest/LLamaContextTests.cs index fe247c6e..bd02ef7b 100644 --- a/LLama.Unittest/LLamaContextTests.cs +++ b/LLama.Unittest/LLamaContextTests.cs @@ -14,6 +14,7 @@ namespace LLama.Unittest var @params = new ModelParams(Constants.GenerativeModelPath) { ContextSize = 768, + GpuLayerCount = 0 }; _weights = LLamaWeights.LoadFromFile(@params); _context = _weights.CreateContext(@params); diff --git a/LLama.Unittest/LLamaEmbedderTests.cs b/LLama.Unittest/LLamaEmbedderTests.cs index 379b8dc6..604e2bd1 100644 --- a/LLama.Unittest/LLamaEmbedderTests.cs +++ b/LLama.Unittest/LLamaEmbedderTests.cs @@ -19,6 +19,7 @@ public sealed class LLamaEmbedderTests ContextSize = 4096, Threads = 5, Embeddings = true, + GpuLayerCount = 0 }; using var weights = LLamaWeights.LoadFromFile(@params); _embedder = new(weights, @params); diff --git a/LLama.Unittest/LLavaWeightsTests.cs b/LLama.Unittest/LLavaWeightsTests.cs index e5df3073..8eaf81bc 100644 --- 
a/LLama.Unittest/LLavaWeightsTests.cs +++ b/LLama.Unittest/LLavaWeightsTests.cs @@ -17,7 +17,8 @@ namespace LLama.Unittest var @params = new ModelParams(Constants.GenerativeModelPath) { // Llava models requires big context - ContextSize = 4096 + ContextSize = 4096, + GpuLayerCount = 0 }; _llamaWeights = LLamaWeights.LoadFromFile(@params); _lLavaWeights = LLavaWeights.LoadFromFile(Constants.LLavaMmpPath); diff --git a/LLama.Unittest/StatelessExecutorTest.cs b/LLama.Unittest/StatelessExecutorTest.cs index 3ca8a76e..ab979471 100644 --- a/LLama.Unittest/StatelessExecutorTest.cs +++ b/LLama.Unittest/StatelessExecutorTest.cs @@ -20,6 +20,7 @@ namespace LLama.Unittest ContextSize = 60, Seed = 1754, BatchSize = 2, + GpuLayerCount = 0, }; _weights = LLamaWeights.LoadFromFile(_params); } diff --git a/LLama.Unittest/TokenTests.cs b/LLama.Unittest/TokenTests.cs index c11e3ae9..19118bf8 100644 --- a/LLama.Unittest/TokenTests.cs +++ b/LLama.Unittest/TokenTests.cs @@ -14,7 +14,8 @@ public sealed class TokenTests { _params = new ModelParams(Constants.GenerativeModelPath) { - ContextSize = 2048 + ContextSize = 2048, + GpuLayerCount = 0, }; _model = LLamaWeights.LoadFromFile(_params); } From cbe0c0ef3eacf8387bacf92aee82476ae74759e3 Mon Sep 17 00:00:00 2001 From: SignalRT Date: Wed, 17 Apr 2024 22:37:08 +0200 Subject: [PATCH 04/15] Disable metal --- LLama.Unittest/GrammarTest.cs | 1 + LLama.Unittest/MemoryDisposalTests.cs | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/LLama.Unittest/GrammarTest.cs b/LLama.Unittest/GrammarTest.cs index 1ab9dea6..db4372b0 100644 --- a/LLama.Unittest/GrammarTest.cs +++ b/LLama.Unittest/GrammarTest.cs @@ -16,6 +16,7 @@ namespace LLama.Unittest { ContextSize = 2048, Seed = 92, + GpuLayerCount = 0, }; _model = LLamaWeights.LoadFromFile(_params); } diff --git a/LLama.Unittest/MemoryDisposalTests.cs b/LLama.Unittest/MemoryDisposalTests.cs index e29ad46d..1bcc7edc 100644 --- a/LLama.Unittest/MemoryDisposalTests.cs +++ 
b/LLama.Unittest/MemoryDisposalTests.cs @@ -9,7 +9,8 @@ public class MemoryDisposalTests { var @params = new ModelParams(Constants.GenerativeModelPath) { - ContextSize = 2048 + ContextSize = 2048, + GpuLayerCount = 0, }; var model = LLamaWeights.LoadFromFile(@params); @@ -23,7 +24,8 @@ public class MemoryDisposalTests { var @params = new ModelParams(Constants.GenerativeModelPath) { - ContextSize = 2048 + ContextSize = 2048, + GpuLayerCount = 0 }; var model = LLamaWeights.LoadFromFile(@params); From bb5d7e189d15d77b0538db45698f532b1ae26067 Mon Sep 17 00:00:00 2001 From: SignalRT Date: Wed, 17 Apr 2024 23:03:31 +0200 Subject: [PATCH 05/15] Create a specific Fact attribute to disable some tests on CI and execute those tests on Debug --- LLama.Unittest/BeamTests.cs | 1 - LLama.Unittest/IgnoreOnCIFact.cs | 22 ++++++++++++++++++++++ LLama.Unittest/LLavaWeightsTests.cs | 4 ++-- 3 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 LLama.Unittest/IgnoreOnCIFact.cs diff --git a/LLama.Unittest/BeamTests.cs b/LLama.Unittest/BeamTests.cs index 86399c42..6ae0adbf 100644 --- a/LLama.Unittest/BeamTests.cs +++ b/LLama.Unittest/BeamTests.cs @@ -28,7 +28,6 @@ public sealed class BeamTests _model.Dispose(); } - //[Fact(Skip = "Very very slow in CI")] [Fact] public void BasicBeam() { diff --git a/LLama.Unittest/IgnoreOnCIFact.cs b/LLama.Unittest/IgnoreOnCIFact.cs new file mode 100644 index 00000000..c26bfd6c --- /dev/null +++ b/LLama.Unittest/IgnoreOnCIFact.cs @@ -0,0 +1,22 @@ +namespace LLama.Unittest; + +/// +/// Extend Fact attributes to know if we are running on release or debug. 
The assumption is that on CI we run on Release +/// +public class IgnoreOnCiFact : FactAttribute +{ + public IgnoreOnCiFact() { + if( IsRelease()) { + Skip = "Ignore on CI"; + } + } + + private static bool IsRelease() + { + #if DEBUG + return false; + #else + return true; + #endif + } +} \ No newline at end of file diff --git a/LLama.Unittest/LLavaWeightsTests.cs b/LLama.Unittest/LLavaWeightsTests.cs index 8eaf81bc..bf75f022 100644 --- a/LLama.Unittest/LLavaWeightsTests.cs +++ b/LLama.Unittest/LLavaWeightsTests.cs @@ -33,7 +33,7 @@ namespace LLama.Unittest _lLavaWeights.Dispose(); } - [Fact(Skip = "Very very slow in CI")] + [IgnoreOnCiFact] public void EmbedImageAsFileName() { int n_past = 0; @@ -41,7 +41,7 @@ namespace LLama.Unittest Assert.True( _lLavaWeights.EvalImageEmbed( _context, emb, ref n_past ) ); } - [Fact(Skip = "Very very slow in CI")] + [IgnoreOnCiFact] public void EmbedImageAsBinary() { int n_past = 0; From 89fbbc0f51ab63dc0c177c0c55e025565fc2172d Mon Sep 17 00:00:00 2001 From: SignalRT Date: Thu, 18 Apr 2024 06:13:10 +0200 Subject: [PATCH 06/15] Restore previous dotnet-versions --- .github/workflows/main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3d942fa1..c51db230 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -31,7 +31,6 @@ jobs: - uses: actions/setup-dotnet@v4 with: dotnet-version: | - 6.0.x 7.0.x 8.0.x - name: Cache Packages From 75cad1f3a59eb608300ebb82392fc840a1d742ab Mon Sep 17 00:00:00 2001 From: SignalRT Date: Thu, 18 Apr 2024 06:36:09 +0200 Subject: [PATCH 07/15] Remove .NET7 on test --- .github/workflows/main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c51db230..59757066 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -31,7 +31,6 @@ jobs: - uses: actions/setup-dotnet@v4 with: dotnet-version: | - 7.0.x 8.0.x - name: Cache Packages uses: 
actions/cache@v4 From e6b7141188530fcf52992c06f03e00be8811a16d Mon Sep 17 00:00:00 2001 From: SignalRT Date: Thu, 18 Apr 2024 21:29:37 +0200 Subject: [PATCH 08/15] Change attribute to a Filter to disable test on CI --- .github/workflows/main.yml | 2 +- LLama.Unittest/IgnoreOnCIFact.cs | 22 ---------------------- LLama.Unittest/LLavaWeightsTests.cs | 4 ++-- 3 files changed, 3 insertions(+), 25 deletions(-) delete mode 100644 LLama.Unittest/IgnoreOnCIFact.cs diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 59757066..aa0aefc9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -45,7 +45,7 @@ jobs: - name: Build run: dotnet build LLamaSharp.sln -c ${{ matrix.config }} --no-restore - name: Test - run: dotnet test LLamaSharp.sln -c ${{ matrix.config }} -l "console;verbosity=detailed" --diag:logs/log.txt + run: dotnet test LLamaSharp.sln -c ${{ matrix.config }} -l "console;verbosity=detailed" --diag:logs/log.txt --filter Category!=NoCI - name: Upload artifacts if: always() uses: actions/upload-artifact@v3 diff --git a/LLama.Unittest/IgnoreOnCIFact.cs b/LLama.Unittest/IgnoreOnCIFact.cs deleted file mode 100644 index c26bfd6c..00000000 --- a/LLama.Unittest/IgnoreOnCIFact.cs +++ /dev/null @@ -1,22 +0,0 @@ -namespace LLama.Unittest; - -/// -/// Extend Fact attributes to know if we are running on release or debug. 
The assumption is that on CI we run on Release -/// -public class IgnoreOnCiFact : FactAttribute -{ - public IgnoreOnCiFact() { - if( IsRelease()) { - Skip = "Ignore on CI"; - } - } - - private static bool IsRelease() - { - #if DEBUG - return false; - #else - return true; - #endif - } -} \ No newline at end of file diff --git a/LLama.Unittest/LLavaWeightsTests.cs b/LLama.Unittest/LLavaWeightsTests.cs index bf75f022..2c21a7a1 100644 --- a/LLama.Unittest/LLavaWeightsTests.cs +++ b/LLama.Unittest/LLavaWeightsTests.cs @@ -33,7 +33,7 @@ namespace LLama.Unittest _lLavaWeights.Dispose(); } - [IgnoreOnCiFact] + [Fact,Trait("Category", "NoCI")] public void EmbedImageAsFileName() { int n_past = 0; @@ -41,7 +41,7 @@ namespace LLama.Unittest Assert.True( _lLavaWeights.EvalImageEmbed( _context, emb, ref n_past ) ); } - [IgnoreOnCiFact] + [Fact,Trait("Category", "NoCI")] public void EmbedImageAsBinary() { int n_past = 0; From 53ae90487517552a4e1e832cf0e6e2286f1e4789 Mon Sep 17 00:00:00 2001 From: SignalRT Date: Thu, 18 Apr 2024 22:03:47 +0200 Subject: [PATCH 09/15] Set GPULayerCount to execute the Test Set GPULayerCount to default value (20) to execute UnitTest. In the case of Release Execution on MacOS set the value to ZERO to disable METAL on MacOS and be able to execute it in CI. 
--- LLama.Unittest/BasicTest.cs | 2 +- LLama.Unittest/BeamTests.cs | 2 +- LLama.Unittest/Constants.cs | 24 +++++++++++++++++++++++- LLama.Unittest/GrammarTest.cs | 2 +- LLama.Unittest/LLamaContextTests.cs | 2 +- LLama.Unittest/LLamaEmbedderTests.cs | 2 +- LLama.Unittest/LLavaWeightsTests.cs | 2 +- LLama.Unittest/MemoryDisposalTests.cs | 2 +- LLama.Unittest/StatelessExecutorTest.cs | 2 +- LLama.Unittest/TokenTests.cs | 2 +- 10 files changed, 32 insertions(+), 10 deletions(-) diff --git a/LLama.Unittest/BasicTest.cs b/LLama.Unittest/BasicTest.cs index 64fa4d0f..1d54a7e9 100644 --- a/LLama.Unittest/BasicTest.cs +++ b/LLama.Unittest/BasicTest.cs @@ -18,7 +18,7 @@ namespace LLama.Unittest _params = new ModelParams(Constants.GenerativeModelPath) { ContextSize = 2048, - GpuLayerCount = 0 + GpuLayerCount = Constants.CIGpuLayerCount }; _model = LLamaWeights.LoadFromFile(_params); } diff --git a/LLama.Unittest/BeamTests.cs b/LLama.Unittest/BeamTests.cs index 6ae0adbf..88b25672 100644 --- a/LLama.Unittest/BeamTests.cs +++ b/LLama.Unittest/BeamTests.cs @@ -18,7 +18,7 @@ public sealed class BeamTests _params = new ModelParams(Constants.GenerativeModelPath) { ContextSize = 2048, - GpuLayerCount = 0, + GpuLayerCount = Constants.CIGpuLayerCount, }; _model = LLamaWeights.LoadFromFile(_params); } diff --git a/LLama.Unittest/Constants.cs b/LLama.Unittest/Constants.cs index 6e5e92c5..8a496f6e 100644 --- a/LLama.Unittest/Constants.cs +++ b/LLama.Unittest/Constants.cs @@ -1,4 +1,6 @@ -namespace LLama.Unittest +using System.Runtime.InteropServices; + +namespace LLama.Unittest { internal static class Constants { @@ -8,5 +10,25 @@ public static readonly string LLavaModelPath = "Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf"; public static readonly string LLavaMmpPath = "Models/mmproj-model-f16.gguf"; public static readonly string LLavaImage = "Models/extreme-ironing-taxi-610x427.jpg"; + + /// + /// Calculate GpuLayer Count to use in UnitTest + /// + /// Defaults to 20 in all the cases, except 
IOS release (to disable METAL on github CI) + public static int CIGpuLayerCount + { + get + { + if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) + { + #if DEBUG + return 20; + #else + return 0; + #endif + } + else return 20; + } + } } } diff --git a/LLama.Unittest/GrammarTest.cs b/LLama.Unittest/GrammarTest.cs index db4372b0..d4f6a95b 100644 --- a/LLama.Unittest/GrammarTest.cs +++ b/LLama.Unittest/GrammarTest.cs @@ -16,7 +16,7 @@ namespace LLama.Unittest { ContextSize = 2048, Seed = 92, - GpuLayerCount = 0, + GpuLayerCount = Constants.CIGpuLayerCount, }; _model = LLamaWeights.LoadFromFile(_params); } diff --git a/LLama.Unittest/LLamaContextTests.cs b/LLama.Unittest/LLamaContextTests.cs index bd02ef7b..cc53e369 100644 --- a/LLama.Unittest/LLamaContextTests.cs +++ b/LLama.Unittest/LLamaContextTests.cs @@ -14,7 +14,7 @@ namespace LLama.Unittest var @params = new ModelParams(Constants.GenerativeModelPath) { ContextSize = 768, - GpuLayerCount = 0 + GpuLayerCount = Constants.CIGpuLayerCount, }; _weights = LLamaWeights.LoadFromFile(@params); _context = _weights.CreateContext(@params); diff --git a/LLama.Unittest/LLamaEmbedderTests.cs b/LLama.Unittest/LLamaEmbedderTests.cs index 604e2bd1..570c6c12 100644 --- a/LLama.Unittest/LLamaEmbedderTests.cs +++ b/LLama.Unittest/LLamaEmbedderTests.cs @@ -19,7 +19,7 @@ public sealed class LLamaEmbedderTests ContextSize = 4096, Threads = 5, Embeddings = true, - GpuLayerCount = 0 + GpuLayerCount = Constants.CIGpuLayerCount, }; using var weights = LLamaWeights.LoadFromFile(@params); _embedder = new(weights, @params); diff --git a/LLama.Unittest/LLavaWeightsTests.cs b/LLama.Unittest/LLavaWeightsTests.cs index 2c21a7a1..30d41fd9 100644 --- a/LLama.Unittest/LLavaWeightsTests.cs +++ b/LLama.Unittest/LLavaWeightsTests.cs @@ -18,7 +18,7 @@ namespace LLama.Unittest { // Llava models requires big context ContextSize = 4096, - GpuLayerCount = 0 + GpuLayerCount = Constants.CIGpuLayerCount, }; _llamaWeights = LLamaWeights.LoadFromFile(@params); 
_lLavaWeights = LLavaWeights.LoadFromFile(Constants.LLavaMmpPath); diff --git a/LLama.Unittest/MemoryDisposalTests.cs b/LLama.Unittest/MemoryDisposalTests.cs index 1bcc7edc..60cd75fd 100644 --- a/LLama.Unittest/MemoryDisposalTests.cs +++ b/LLama.Unittest/MemoryDisposalTests.cs @@ -25,7 +25,7 @@ public class MemoryDisposalTests var @params = new ModelParams(Constants.GenerativeModelPath) { ContextSize = 2048, - GpuLayerCount = 0 + GpuLayerCount = Constants.CIGpuLayerCount, }; var model = LLamaWeights.LoadFromFile(@params); diff --git a/LLama.Unittest/StatelessExecutorTest.cs b/LLama.Unittest/StatelessExecutorTest.cs index ab979471..18f3c25d 100644 --- a/LLama.Unittest/StatelessExecutorTest.cs +++ b/LLama.Unittest/StatelessExecutorTest.cs @@ -20,7 +20,7 @@ namespace LLama.Unittest ContextSize = 60, Seed = 1754, BatchSize = 2, - GpuLayerCount = 0, + GpuLayerCount = Constants.CIGpuLayerCount, }; _weights = LLamaWeights.LoadFromFile(_params); } diff --git a/LLama.Unittest/TokenTests.cs b/LLama.Unittest/TokenTests.cs index 19118bf8..03e3927f 100644 --- a/LLama.Unittest/TokenTests.cs +++ b/LLama.Unittest/TokenTests.cs @@ -15,7 +15,7 @@ public sealed class TokenTests _params = new ModelParams(Constants.GenerativeModelPath) { ContextSize = 2048, - GpuLayerCount = 0, + GpuLayerCount = Constants.CIGpuLayerCount, }; _model = LLamaWeights.LoadFromFile(_params); } From 49f437f3ec6c9df926061b220cca3a8bffef858f Mon Sep 17 00:00:00 2001 From: SignalRT Date: Fri, 19 Apr 2024 06:52:58 +0200 Subject: [PATCH 10/15] Typo on comment. 
Disable Metal on MacOS / OSX --- LLama.Unittest/Constants.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LLama.Unittest/Constants.cs b/LLama.Unittest/Constants.cs index 8a496f6e..4852a335 100644 --- a/LLama.Unittest/Constants.cs +++ b/LLama.Unittest/Constants.cs @@ -14,7 +14,7 @@ namespace LLama.Unittest /// /// Calculate GpuLayer Count to use in UnitTest /// - /// Defaults to 20 in all the cases, except IOS release (to disable METAL on github CI) + /// Defaults to 20 in all the cases, except MacOS/OSX release (to disable METAL on github CI) public static int CIGpuLayerCount { get From 89217f73caa16786fedacb33a0d441ed905d87ff Mon Sep 17 00:00:00 2001 From: Zoli Somogyi Date: Fri, 19 Apr 2024 17:23:44 +0200 Subject: [PATCH 11/15] Embeddings correction (#674) * Embeddings correction --- LLama/LLamaEmbedder.cs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/LLama/LLamaEmbedder.cs b/LLama/LLamaEmbedder.cs index 13a3e1c2..c29b6b25 100644 --- a/LLama/LLamaEmbedder.cs +++ b/LLama/LLamaEmbedder.cs @@ -97,9 +97,13 @@ namespace LLama private float[] GetEmbeddingsArray() { - var embeddings = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero); - if (embeddings.Length == 0) - return Array.Empty(); + var embeddings = NativeApi.llama_get_embeddings(Context.NativeHandle); + if (embeddings == null || embeddings.Length == 0) + { + embeddings = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero); + if (embeddings == null || embeddings.Length == 0) + return Array.Empty(); + } return embeddings.ToArray(); } From 3c764409579a417a85048984fb492b9530ef97d3 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Fri, 19 Apr 2024 16:30:32 +0100 Subject: [PATCH 12/15] - Added tests for generating embeddings with generative model and embedding model - Rewritten native API methods for embeddings to return pointers - null is a valid value for these methods to return so `Span` is not appropriate --- 
LLama.Unittest/LLamaEmbedderTests.cs | 49 ++++++++++++++------------ LLama/LLamaEmbedder.cs | 16 +++++---- LLama/Native/NativeApi.cs | 52 +++++----------------------- 3 files changed, 44 insertions(+), 73 deletions(-) diff --git a/LLama.Unittest/LLamaEmbedderTests.cs b/LLama.Unittest/LLamaEmbedderTests.cs index 379b8dc6..31d6199b 100644 --- a/LLama.Unittest/LLamaEmbedderTests.cs +++ b/LLama.Unittest/LLamaEmbedderTests.cs @@ -1,32 +1,16 @@ using LLama.Common; -using LLama.Native; using Xunit.Abstractions; -using Xunit.Sdk; namespace LLama.Unittest; public sealed class LLamaEmbedderTests - : IDisposable { private readonly ITestOutputHelper _testOutputHelper; - private readonly LLamaEmbedder _embedder; public LLamaEmbedderTests(ITestOutputHelper testOutputHelper) { _testOutputHelper = testOutputHelper; - var @params = new ModelParams(Constants.EmbeddingModelPath) - { - ContextSize = 4096, - Threads = 5, - Embeddings = true, - }; - using var weights = LLamaWeights.LoadFromFile(@params); - _embedder = new(weights, @params); - } - - public void Dispose() - { - _embedder.Dispose(); + } private static float Dot(float[] a, float[] b) @@ -35,17 +19,24 @@ public sealed class LLamaEmbedderTests return a.Zip(b, (x, y) => x * y).Sum(); } - - [Fact] - public async Task EmbedCompare() + private async Task CompareEmbeddings(string modelPath) { - var cat = await _embedder.GetEmbeddings("The cat is cute"); + var @params = new ModelParams(modelPath) + { + ContextSize = 8, + Threads = 4, + Embeddings = true, + }; + using var weights = LLamaWeights.LoadFromFile(@params); + using var embedder = new LLamaEmbedder(weights, @params); + + var cat = await embedder.GetEmbeddings("The cat is cute"); Assert.DoesNotContain(float.NaN, cat); - var kitten = await _embedder.GetEmbeddings("The kitten is kawaii"); + var kitten = await embedder.GetEmbeddings("The kitten is kawaii"); Assert.DoesNotContain(float.NaN, kitten); - var spoon = await _embedder.GetEmbeddings("The spoon is not real"); + var 
spoon = await embedder.GetEmbeddings("The spoon is not real"); Assert.DoesNotContain(float.NaN, spoon); _testOutputHelper.WriteLine($"Cat = [{string.Join(",", cat.AsMemory().Slice(0, 7).ToArray())}...]"); @@ -61,4 +52,16 @@ public sealed class LLamaEmbedderTests Assert.True(close < far); } + + [Fact] + public async Task EmbedCompareEmbeddingModel() + { + await CompareEmbeddings(Constants.EmbeddingModelPath); + } + + [Fact] + public async Task EmbedCompareGenerateModel() + { + await CompareEmbeddings(Constants.GenerativeModelPath); + } } \ No newline at end of file diff --git a/LLama/LLamaEmbedder.cs b/LLama/LLamaEmbedder.cs index c29b6b25..f60f3cd5 100644 --- a/LLama/LLamaEmbedder.cs +++ b/LLama/LLamaEmbedder.cs @@ -97,15 +97,18 @@ namespace LLama private float[] GetEmbeddingsArray() { - var embeddings = NativeApi.llama_get_embeddings(Context.NativeHandle); - if (embeddings == null || embeddings.Length == 0) + unsafe { - embeddings = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero); - if (embeddings == null || embeddings.Length == 0) + var embeddings = NativeApi.llama_get_embeddings(Context.NativeHandle); + + if (embeddings == null) + embeddings = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero); + + if (embeddings == null) return Array.Empty(); - } - return embeddings.ToArray(); + return new Span(embeddings, Context.EmbeddingSize).ToArray(); + } } private static void Normalize(Span embeddings) @@ -116,6 +119,7 @@ namespace LLama lengthSqr += value * value; var length = (float)Math.Sqrt(lengthSqr); + // Do not divide by length if it is zero if (length <= float.Epsilon) return; diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index 6f8d142e..ed456151 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -137,41 +137,17 @@ namespace LLama.Native /// Get the embeddings for the a specific sequence. 
/// Equivalent to: llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd /// - /// - public static Span llama_get_embeddings_seq(SafeLLamaContextHandle ctx, LLamaSeqId id) - { - unsafe - { - var ptr = llama_get_embeddings_seq_native(ctx, id); - if (ptr == null) - return Array.Empty(); - - return new Span(ptr, ctx.EmbeddingSize); - } - - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_get_embeddings_seq")] - static extern unsafe float* llama_get_embeddings_seq_native(SafeLLamaContextHandle ctx, LLamaSeqId id); - } + /// A pointer to the first float in an embedding, length = ctx.EmbeddingSize + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern unsafe float* llama_get_embeddings_seq(SafeLLamaContextHandle ctx, LLamaSeqId id); /// /// Get the embeddings for the ith sequence. /// Equivalent to: llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd /// - /// - public static Span llama_get_embeddings_ith(SafeLLamaContextHandle ctx, int i) - { - unsafe - { - var ptr = llama_get_embeddings_ith_native(ctx, i); - if (ptr == null) - return Array.Empty(); - - return new Span(ptr, ctx.EmbeddingSize); - } - - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_get_embeddings_ith")] - static extern unsafe float* llama_get_embeddings_ith_native(SafeLLamaContextHandle ctx, int i); - } + /// A pointer to the first float in an embedding, length = ctx.EmbeddingSize + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern unsafe float* llama_get_embeddings_ith(SafeLLamaContextHandle ctx, int i); /// /// Get all output token embeddings. 
@@ -182,20 +158,8 @@ namespace LLama.Native /// /// /// - public static Span llama_get_embeddings(SafeLLamaContextHandle ctx) - { - unsafe - { - var ptr = llama_get_embeddings_native(ctx); - if (ptr == null) - return Array.Empty(); - - return new Span(ptr, ctx.EmbeddingSize); - } - - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_get_embeddings")] - static extern unsafe float* llama_get_embeddings_native(SafeLLamaContextHandle ctx); - } + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern unsafe float* llama_get_embeddings(SafeLLamaContextHandle ctx); /// /// Apply chat template. Inspired by hf apply_chat_template() on python. From 550f2f7684f5b2b171b90f9f442a3c36fb79e6ea Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Fri, 19 Apr 2024 18:31:14 +0100 Subject: [PATCH 13/15] Fixed build due to changes in unit tests --- LLama.Unittest/LLamaEmbedderTests.cs | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/LLama.Unittest/LLamaEmbedderTests.cs b/LLama.Unittest/LLamaEmbedderTests.cs index 4c2bc8ab..43de46f9 100644 --- a/LLama.Unittest/LLamaEmbedderTests.cs +++ b/LLama.Unittest/LLamaEmbedderTests.cs @@ -10,21 +10,6 @@ public sealed class LLamaEmbedderTests public LLamaEmbedderTests(ITestOutputHelper testOutputHelper) { _testOutputHelper = testOutputHelper; - - var @params = new ModelParams(Constants.EmbeddingModelPath) - { - ContextSize = 4096, - Threads = 5, - Embeddings = true, - GpuLayerCount = Constants.CIGpuLayerCount, - }; - using var weights = LLamaWeights.LoadFromFile(@params); - _embedder = new(weights, @params); - } - - public void Dispose() - { - _embedder.Dispose(); } private static float Dot(float[] a, float[] b) From b416966ec4e9c2abd10e2735efadebf44b7cb544 Mon Sep 17 00:00:00 2001 From: SignalRT Date: Fri, 19 Apr 2024 19:38:58 +0200 Subject: [PATCH 14/15] Disable Metal on CI UnitTest --- LLama.Unittest/LLamaEmbedderTests.cs | 1 + 1 file changed, 1 
insertion(+) diff --git a/LLama.Unittest/LLamaEmbedderTests.cs b/LLama.Unittest/LLamaEmbedderTests.cs index 43de46f9..e9d9359f 100644 --- a/LLama.Unittest/LLamaEmbedderTests.cs +++ b/LLama.Unittest/LLamaEmbedderTests.cs @@ -25,6 +25,7 @@ public sealed class LLamaEmbedderTests ContextSize = 8, Threads = 4, Embeddings = true, + GpuLayerCount = Constants.CIGpuLayerCount, }; using var weights = LLamaWeights.LoadFromFile(@params); using var embedder = new LLamaEmbedder(weights, @params); From f01c13ee54c5829180d546dd407fddf25e86cbe0 Mon Sep 17 00:00:00 2001 From: Lyrcaxis <32474602+Lyrcaxis@users.noreply.github.com> Date: Sat, 20 Apr 2024 17:23:55 +0300 Subject: [PATCH 15/15] Made special tokens included in prompts tokenize as intended (#677) --- .../LLamaSharpTextEmbeddingGenerator.cs | 2 +- LLama.KernelMemory/LlamaSharpTextGenerator.cs | 8 +------- LLama/LLamaInstructExecutor.cs | 8 ++++---- LLama/LLamaInteractExecutor.cs | 12 ++++++------ LLama/LLamaStatelessExecutor.cs | 2 +- 5 files changed, 13 insertions(+), 19 deletions(-) diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs index d8c366bc..b72f49a0 100644 --- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs +++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs @@ -104,6 +104,6 @@ namespace LLamaSharp.KernelMemory } /// - public int CountTokens(string text) => _embedder.Context.Tokenize(text).Length; + public int CountTokens(string text) => _embedder.Context.Tokenize(text, special: true).Length; } } diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs index de6373ee..e3d18b3c 100644 --- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs +++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs @@ -1,13 +1,7 @@ using LLama; -using LLama.Abstractions; using LLama.Common; using LLama.Native; using Microsoft.KernelMemory.AI; -using System; -using System.Collections.Generic; -using 
System.Linq; -using System.Text; -using System.Threading.Tasks; namespace LLamaSharp.KernelMemory { @@ -111,6 +105,6 @@ namespace LLamaSharp.KernelMemory } /// - public int CountTokens(string text) => _context.Tokenize(text).Length; + public int CountTokens(string text) => _context.Tokenize(text, special: true).Length; } } diff --git a/LLama/LLamaInstructExecutor.cs b/LLama/LLamaInstructExecutor.cs index c3a9a420..917dc5eb 100644 --- a/LLama/LLamaInstructExecutor.cs +++ b/LLama/LLamaInstructExecutor.cs @@ -38,8 +38,8 @@ namespace LLama ILogger? logger = null) : base(context, logger) { - _inp_pfx = Context.Tokenize(instructionPrefix, true); - _inp_sfx = Context.Tokenize(instructionSuffix, false); + _inp_pfx = Context.Tokenize(instructionPrefix, true, true); + _inp_sfx = Context.Tokenize(instructionSuffix, false, true); _instructionPrefix = instructionPrefix; } @@ -124,7 +124,7 @@ namespace LLama if (_is_prompt_run) { // When running the first input (prompt) in inteactive mode, we should specially process it. - _embed_inps = Context.Tokenize(text, true).ToList(); + _embed_inps = Context.Tokenize(text, true, true).ToList(); } else { @@ -135,7 +135,7 @@ namespace LLama _consumedTokensCount = _embed_inps.Count; _embed_inps.AddRange(_inp_pfx); - var line_inp = Context.Tokenize(text, false); + var line_inp = Context.Tokenize(text, false, true); _embed_inps.AddRange(line_inp); _embed_inps.AddRange(_inp_sfx); diff --git a/LLama/LLamaInteractExecutor.cs b/LLama/LLamaInteractExecutor.cs index 5acf4bd3..9aaa1ca2 100644 --- a/LLama/LLamaInteractExecutor.cs +++ b/LLama/LLamaInteractExecutor.cs @@ -119,7 +119,7 @@ namespace LLama // When running the first input (prompt) in interactive mode, we should specially process it. 
if (!this.IsMultiModal) { - _embed_inps = Context.Tokenize(text, true).ToList(); + _embed_inps = Context.Tokenize(text, true, true).ToList(); } else { @@ -135,7 +135,7 @@ namespace LLama if (!this.IsMultiModal) { - var line_inp = Context.Tokenize(text, false); + var line_inp = Context.Tokenize(text, false, true); _embed_inps.AddRange(line_inp); args.RemainedTokens -= line_inp.Length; } @@ -165,11 +165,11 @@ namespace LLama int imageIndex = text.IndexOf(""); // Tokenize segment 1 (before tag) string preImagePrompt = text.Substring(0, imageIndex); - var segment1 = Context.Tokenize(preImagePrompt, addBos ); + var segment1 = Context.Tokenize(preImagePrompt, addBos, true); // Remember the position to add the image embeddings _EmbedImagePosition = segment1.Length; string postImagePrompt = text.Substring(imageIndex + 7); - var segment2 = Context.Tokenize(postImagePrompt, false); + var segment2 = Context.Tokenize(postImagePrompt, false, true); _embed_inps.AddRange(segment1); _embed_inps.AddRange(segment2); usedTokens += (segment1.Length + segment2.Length); @@ -178,11 +178,11 @@ namespace LLama { if (addBos) { - _embed_inps = Context.Tokenize(text, true).ToList(); + _embed_inps = Context.Tokenize(text, true, true).ToList(); } else { - var line_inp = Context.Tokenize(text, false); + var line_inp = Context.Tokenize(text, false, true); _embed_inps.AddRange(line_inp); args.RemainedTokens -= line_inp.Length; } diff --git a/LLama/LLamaStatelessExecutor.cs b/LLama/LLamaStatelessExecutor.cs index 487fe293..a3c52a02 100644 --- a/LLama/LLamaStatelessExecutor.cs +++ b/LLama/LLamaStatelessExecutor.cs @@ -90,7 +90,7 @@ namespace LLama lastTokens.Add(0); // Tokenize the prompt - var tokens = Context.Tokenize(prompt).ToList(); + var tokens = Context.Tokenize(prompt, special: true).ToList(); lastTokens.AddRange(tokens); // Evaluate the prompt, in chunks smaller than the max batch size