diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index cf0c784c..aa0aefc9 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -15,11 +15,14 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        build: [linux-release, windows-release]
+        build: [linux-release, windows-release, osx-release]
         include:
           - build: linux-release
             os: ubuntu-latest
             config: release
+          - build: osx-release
+            os: macos-14  # https://github.blog/changelog/2024-01-30-github-actions-introducing-the-new-m1-macos-runner-available-to-open-source/
+            config: release
           - build: windows-release
             os: windows-2019
             config: release
@@ -27,8 +30,7 @@ jobs:
     - uses: actions/checkout@v4
     - uses: actions/setup-dotnet@v4
       with:
-        dotnet-version: | 
-          7.0.x
+        dotnet-version: |
           8.0.x
     - name: Cache Packages
       uses: actions/cache@v4
@@ -43,7 +45,7 @@ jobs:
     - name: Build
       run: dotnet build LLamaSharp.sln -c ${{ matrix.config }} --no-restore
     - name: Test
-      run: dotnet test LLamaSharp.sln -c ${{ matrix.config }} -l "console;verbosity=detailed" --diag:logs/log.txt
+      run: dotnet test LLamaSharp.sln -c ${{ matrix.config }} -l "console;verbosity=detailed" --diag:logs/log.txt --filter Category!=NoCI
     - name: Upload artifacts
       if: always()
       uses: actions/upload-artifact@v3
diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
index d8c366bc..b72f49a0 100644
--- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
+++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -104,6 +104,6 @@ namespace LLamaSharp.KernelMemory
         }
 
         /// <inheritdoc/>
-        public int CountTokens(string text) => _embedder.Context.Tokenize(text).Length;
+        public int CountTokens(string text) => _embedder.Context.Tokenize(text, special: true).Length;
     }
 }
diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
index de6373ee..e3d18b3c 100644
--- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs
+++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -1,13 +1,7 @@
 ﻿using LLama;
-using LLama.Abstractions;
 using LLama.Common;
 using LLama.Native;
 using Microsoft.KernelMemory.AI;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
 
 namespace LLamaSharp.KernelMemory
 {
@@ -111,6 +105,6 @@ namespace LLamaSharp.KernelMemory
         }
 
         /// <inheritdoc/>
-        public int CountTokens(string text) => _context.Tokenize(text).Length;
+        public int CountTokens(string text) => _context.Tokenize(text, special: true).Length;
     }
 }
diff --git a/LLama.Unittest/BasicTest.cs b/LLama.Unittest/BasicTest.cs
index 7c897b78..1d54a7e9 100644
--- a/LLama.Unittest/BasicTest.cs
+++ b/LLama.Unittest/BasicTest.cs
@@ -17,7 +17,8 @@ namespace LLama.Unittest
             _testOutputHelper = testOutputHelper;
             _params = new ModelParams(Constants.GenerativeModelPath)
             {
-                ContextSize = 2048
+                ContextSize = 2048,
+                GpuLayerCount = Constants.CIGpuLayerCount
             };
             _model = LLamaWeights.LoadFromFile(_params);
         }
diff --git a/LLama.Unittest/BeamTests.cs b/LLama.Unittest/BeamTests.cs
index f4aa01ab..88b25672 100644
--- a/LLama.Unittest/BeamTests.cs
+++ b/LLama.Unittest/BeamTests.cs
@@ -17,7 +17,8 @@ public sealed class BeamTests
         _testOutputHelper = testOutputHelper;
         _params = new ModelParams(Constants.GenerativeModelPath)
         {
-            ContextSize = 2048
+            ContextSize = 2048,
+            GpuLayerCount = Constants.CIGpuLayerCount,
         };
         _model = LLamaWeights.LoadFromFile(_params);
     }
@@ -27,7 +28,6 @@ public sealed class BeamTests
         _model.Dispose();
     }
 
-    //[Fact(Skip = "Very very slow in CI")]
     [Fact]
     public void BasicBeam()
     {
diff --git a/LLama.Unittest/Constants.cs b/LLama.Unittest/Constants.cs
index 6e5e92c5..4852a335 100644
--- a/LLama.Unittest/Constants.cs
+++ b/LLama.Unittest/Constants.cs
@@ -1,4 +1,6 @@
-﻿namespace LLama.Unittest
+﻿using System.Runtime.InteropServices;
+
+namespace LLama.Unittest
 {
     internal static class Constants
     {
@@ -8,5 +10,25 @@
         public static readonly string LLavaModelPath = "Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf";
         public static readonly string LLavaMmpPath = "Models/mmproj-model-f16.gguf";
         public static readonly string LLavaImage = "Models/extreme-ironing-taxi-610x427.jpg";
+
+        /// <summary>
+        /// GPU layer count the unit tests use when building their <c>ModelParams</c>.
+        /// </summary>
+        /// <value>20 in all cases, except macOS/OSX non-DEBUG (release) builds, which use 0 (to disable Metal on GitHub CI).</value>
+        public static int CIGpuLayerCount
+        {
+            get
+            {
+                // Every platform other than macOS always uses 20.
+                if (!RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
+                    return 20;
+                #if DEBUG
+                return 20;
+                #else
+                // macOS non-DEBUG builds: 0, so CI release runs use no GPU layers.
+                return 0;
+                #endif
+            }
+        }
     }
 }
diff --git a/LLama.Unittest/GrammarTest.cs b/LLama.Unittest/GrammarTest.cs
index 1ab9dea6..d4f6a95b 100644
--- a/LLama.Unittest/GrammarTest.cs
+++ b/LLama.Unittest/GrammarTest.cs
@@ -16,6 +16,7 @@ namespace LLama.Unittest
             {
                 ContextSize = 2048,
                 Seed = 92,
+                GpuLayerCount = Constants.CIGpuLayerCount,                
             };
             _model = LLamaWeights.LoadFromFile(_params);
         }
diff --git a/LLama.Unittest/LLamaContextTests.cs b/LLama.Unittest/LLamaContextTests.cs
index fe247c6e..cc53e369 100644
--- a/LLama.Unittest/LLamaContextTests.cs
+++ b/LLama.Unittest/LLamaContextTests.cs
@@ -14,6 +14,7 @@ namespace LLama.Unittest
             var @params = new ModelParams(Constants.GenerativeModelPath)
             {
                 ContextSize = 768,
+                GpuLayerCount = Constants.CIGpuLayerCount,
             };
             _weights = LLamaWeights.LoadFromFile(@params);
             _context = _weights.CreateContext(@params);
diff --git a/LLama.Unittest/LLamaEmbedderTests.cs b/LLama.Unittest/LLamaEmbedderTests.cs
index 379b8dc6..e9d9359f 100644
--- a/LLama.Unittest/LLamaEmbedderTests.cs
+++ b/LLama.Unittest/LLamaEmbedderTests.cs
@@ -1,32 +1,15 @@
-﻿using LLama.Common;
-using LLama.Native;
+using LLama.Common;
 using Xunit.Abstractions;
-using Xunit.Sdk;
 
 namespace LLama.Unittest;
 
 public sealed class LLamaEmbedderTests
-    : IDisposable
 {
     private readonly ITestOutputHelper _testOutputHelper;
-    private readonly LLamaEmbedder _embedder;
 
     public LLamaEmbedderTests(ITestOutputHelper testOutputHelper)
     {
         _testOutputHelper = testOutputHelper;
-        var @params = new ModelParams(Constants.EmbeddingModelPath)
-        {
-            ContextSize = 4096,
-            Threads = 5,
-            Embeddings = true,
-        };
-        using var weights = LLamaWeights.LoadFromFile(@params);
-        _embedder = new(weights, @params);
-    }
-
-    public void Dispose()
-    {
-        _embedder.Dispose();
     }
 
     private static float Dot(float[] a, float[] b)
@@ -35,17 +18,25 @@ public sealed class LLamaEmbedderTests
         return a.Zip(b, (x, y) => x * y).Sum();
     }
 
-
-    [Fact]
-    public async Task EmbedCompare()
+    private async Task CompareEmbeddings(string modelPath)
     {
-        var cat = await _embedder.GetEmbeddings("The cat is cute");
+        var @params = new ModelParams(modelPath)
+        {
+            ContextSize = 8,
+            Threads = 4,
+            Embeddings = true,
+            GpuLayerCount = Constants.CIGpuLayerCount,
+        };
+        using var weights = LLamaWeights.LoadFromFile(@params);
+        using var embedder = new LLamaEmbedder(weights, @params);
+
+        var cat = await embedder.GetEmbeddings("The cat is cute");
         Assert.DoesNotContain(float.NaN, cat);
 
-        var kitten = await _embedder.GetEmbeddings("The kitten is kawaii");
+        var kitten = await embedder.GetEmbeddings("The kitten is kawaii");
         Assert.DoesNotContain(float.NaN, kitten);
 
-        var spoon = await _embedder.GetEmbeddings("The spoon is not real");
+        var spoon = await embedder.GetEmbeddings("The spoon is not real");
         Assert.DoesNotContain(float.NaN, spoon);
 
         _testOutputHelper.WriteLine($"Cat    = [{string.Join(",", cat.AsMemory().Slice(0, 7).ToArray())}...]");
@@ -61,4 +52,16 @@ public sealed class LLamaEmbedderTests
 
         Assert.True(close < far);
     }
+
+    /// <summary>Runs the shared <c>CompareEmbeddings</c> check
+    /// against <c>Constants.EmbeddingModelPath</c>.</summary>
+    [Fact]
+    public async Task EmbedCompareEmbeddingModel()
+        => await CompareEmbeddings(Constants.EmbeddingModelPath);
+
+    /// <summary>Runs the shared <c>CompareEmbeddings</c> check
+    /// against <c>Constants.GenerativeModelPath</c>.</summary>
+    [Fact]
+    public async Task EmbedCompareGenerateModel()
+        => await CompareEmbeddings(Constants.GenerativeModelPath);
 }
\ No newline at end of file
diff --git a/LLama.Unittest/LLavaWeightsTests.cs b/LLama.Unittest/LLavaWeightsTests.cs
index e5df3073..30d41fd9 100644
--- a/LLama.Unittest/LLavaWeightsTests.cs
+++ b/LLama.Unittest/LLavaWeightsTests.cs
@@ -17,7 +17,8 @@ namespace LLama.Unittest
             var @params = new ModelParams(Constants.GenerativeModelPath)
             {
                 // Llava models requires big context
-                ContextSize = 4096
+                ContextSize = 4096,
+                GpuLayerCount = Constants.CIGpuLayerCount,                
             };
             _llamaWeights = LLamaWeights.LoadFromFile(@params);
             _lLavaWeights = LLavaWeights.LoadFromFile(Constants.LLavaMmpPath);
@@ -32,7 +33,7 @@ namespace LLama.Unittest
             _lLavaWeights.Dispose();
         }
       
-        [Fact(Skip = "Very very slow in CI")]
+        [Fact,Trait("Category", "NoCI")]
         public void EmbedImageAsFileName()
         {
             int n_past = 0;
@@ -40,7 +41,7 @@ namespace LLama.Unittest
             Assert.True( _lLavaWeights.EvalImageEmbed( _context, emb, ref n_past ) );
         }        
         
-        [Fact(Skip = "Very very slow in CI")]
+        [Fact,Trait("Category", "NoCI")]
         public void EmbedImageAsBinary()
         {
             int n_past = 0;
diff --git a/LLama.Unittest/MemoryDisposalTests.cs b/LLama.Unittest/MemoryDisposalTests.cs
index e29ad46d..60cd75fd 100644
--- a/LLama.Unittest/MemoryDisposalTests.cs
+++ b/LLama.Unittest/MemoryDisposalTests.cs
@@ -9,7 +9,8 @@ public class MemoryDisposalTests
     {
         var @params = new ModelParams(Constants.GenerativeModelPath)
         {
-            ContextSize = 2048
+            ContextSize = 2048,
+            GpuLayerCount = 0,
         };
         var model = LLamaWeights.LoadFromFile(@params);
 
@@ -23,7 +24,8 @@ public class MemoryDisposalTests
     {
         var @params = new ModelParams(Constants.GenerativeModelPath)
         {
-            ContextSize = 2048
+            ContextSize = 2048,
+            GpuLayerCount = Constants.CIGpuLayerCount,            
         };
         var model = LLamaWeights.LoadFromFile(@params);
 
diff --git a/LLama.Unittest/StatelessExecutorTest.cs b/LLama.Unittest/StatelessExecutorTest.cs
index 3ca8a76e..18f3c25d 100644
--- a/LLama.Unittest/StatelessExecutorTest.cs
+++ b/LLama.Unittest/StatelessExecutorTest.cs
@@ -20,6 +20,7 @@ namespace LLama.Unittest
                 ContextSize = 60,
                 Seed = 1754,
                 BatchSize = 2,
+                GpuLayerCount = Constants.CIGpuLayerCount,                
             };
             _weights = LLamaWeights.LoadFromFile(_params);
         }
diff --git a/LLama.Unittest/TokenTests.cs b/LLama.Unittest/TokenTests.cs
index c11e3ae9..03e3927f 100644
--- a/LLama.Unittest/TokenTests.cs
+++ b/LLama.Unittest/TokenTests.cs
@@ -14,7 +14,8 @@ public sealed class TokenTests
     {
         _params = new ModelParams(Constants.GenerativeModelPath)
         {
-            ContextSize = 2048
+            ContextSize = 2048,
+            GpuLayerCount = Constants.CIGpuLayerCount,
         };
         _model = LLamaWeights.LoadFromFile(_params);
     }
diff --git a/LLama/LLamaEmbedder.cs b/LLama/LLamaEmbedder.cs
index 13a3e1c2..f60f3cd5 100644
--- a/LLama/LLamaEmbedder.cs
+++ b/LLama/LLamaEmbedder.cs
@@ -97,11 +97,18 @@ namespace LLama
 
         private float[] GetEmbeddingsArray()
         {
-            var embeddings = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero);
-            if (embeddings.Length == 0)
-                return Array.Empty<float>();
+            unsafe
+            {
+                var embeddings = NativeApi.llama_get_embeddings(Context.NativeHandle);
+
+                if (embeddings == null)
+                    embeddings = NativeApi.llama_get_embeddings_seq(Context.NativeHandle, LLamaSeqId.Zero);
 
-            return embeddings.ToArray();
+                if (embeddings == null)
+                    return Array.Empty<float>();
+
+                return new Span<float>(embeddings, Context.EmbeddingSize).ToArray();
+            }
         }
 
         private static void Normalize(Span<float> embeddings)
@@ -112,6 +119,7 @@ namespace LLama
                 lengthSqr += value * value;
             var length = (float)Math.Sqrt(lengthSqr);
 
+            // Do not divide by length if it is zero
             if (length <= float.Epsilon)
                 return;
 
diff --git a/LLama/LLamaInstructExecutor.cs b/LLama/LLamaInstructExecutor.cs
index c3a9a420..917dc5eb 100644
--- a/LLama/LLamaInstructExecutor.cs
+++ b/LLama/LLamaInstructExecutor.cs
@@ -38,8 +38,8 @@ namespace LLama
                                 ILogger? logger = null)
             : base(context, logger)
         {
-            _inp_pfx = Context.Tokenize(instructionPrefix, true);
-            _inp_sfx = Context.Tokenize(instructionSuffix, false);
+            _inp_pfx = Context.Tokenize(instructionPrefix, true, true);
+            _inp_sfx = Context.Tokenize(instructionSuffix, false, true);
             _instructionPrefix = instructionPrefix;
         }
 
@@ -124,7 +124,7 @@ namespace LLama
             if (_is_prompt_run)
             {
                 // When running the first input (prompt) in inteactive mode, we should specially process it.
-                _embed_inps = Context.Tokenize(text, true).ToList();
+                _embed_inps = Context.Tokenize(text, true, true).ToList();
             }
             else
             {
@@ -135,7 +135,7 @@ namespace LLama
                 _consumedTokensCount = _embed_inps.Count;
                 _embed_inps.AddRange(_inp_pfx);
 
-                var line_inp = Context.Tokenize(text, false);
+                var line_inp = Context.Tokenize(text, false, true);
                 _embed_inps.AddRange(line_inp);
 
                 _embed_inps.AddRange(_inp_sfx);
diff --git a/LLama/LLamaInteractExecutor.cs b/LLama/LLamaInteractExecutor.cs
index 5acf4bd3..9aaa1ca2 100644
--- a/LLama/LLamaInteractExecutor.cs
+++ b/LLama/LLamaInteractExecutor.cs
@@ -119,7 +119,7 @@ namespace LLama
                 // When running the first input (prompt) in interactive mode, we should specially process it.
                 if (!this.IsMultiModal)
                 {
-                    _embed_inps = Context.Tokenize(text, true).ToList();
+                    _embed_inps = Context.Tokenize(text, true, true).ToList();
                 }
                 else
                 {
@@ -135,7 +135,7 @@ namespace LLama
 
                 if (!this.IsMultiModal)
                 {
-                    var line_inp = Context.Tokenize(text, false);
+                    var line_inp = Context.Tokenize(text, false, true);
                     _embed_inps.AddRange(line_inp);
                     args.RemainedTokens -= line_inp.Length;
                 }
@@ -165,11 +165,11 @@ namespace LLama
                 int imageIndex = text.IndexOf("<image>");
                 // Tokenize segment 1 (before <image> tag)
                 string preImagePrompt = text.Substring(0, imageIndex);
-                var segment1 = Context.Tokenize(preImagePrompt, addBos );
+                var segment1 = Context.Tokenize(preImagePrompt, addBos, true);
                 // Remember the position to add the image embeddings
                 _EmbedImagePosition = segment1.Length;
                 string postImagePrompt = text.Substring(imageIndex + 7);
-                var segment2 = Context.Tokenize(postImagePrompt, false);
+                var segment2 = Context.Tokenize(postImagePrompt, false, true);
                 _embed_inps.AddRange(segment1);
                 _embed_inps.AddRange(segment2);
                 usedTokens += (segment1.Length + segment2.Length);
@@ -178,11 +178,11 @@ namespace LLama
             {
                 if (addBos)
                 {
-                    _embed_inps = Context.Tokenize(text, true).ToList();
+                    _embed_inps = Context.Tokenize(text, true, true).ToList();
                 }
                 else
                 {
-                    var line_inp = Context.Tokenize(text, false);
+                    var line_inp = Context.Tokenize(text, false, true);
                     _embed_inps.AddRange(line_inp);
                     args.RemainedTokens -= line_inp.Length;                    
                 }
diff --git a/LLama/LLamaStatelessExecutor.cs b/LLama/LLamaStatelessExecutor.cs
index 487fe293..a3c52a02 100644
--- a/LLama/LLamaStatelessExecutor.cs
+++ b/LLama/LLamaStatelessExecutor.cs
@@ -90,7 +90,7 @@ namespace LLama
                 lastTokens.Add(0);
 
             // Tokenize the prompt
-            var tokens = Context.Tokenize(prompt).ToList();
+            var tokens = Context.Tokenize(prompt, special: true).ToList();
             lastTokens.AddRange(tokens);
 
             // Evaluate the prompt, in chunks smaller than the max batch size
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index 6f8d142e..ed456151 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -137,41 +137,17 @@ namespace LLama.Native
         /// Get the embeddings for the a specific sequence.
         /// Equivalent to: llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
         /// </summary>
-        /// <returns></returns>
-        public static Span<float> llama_get_embeddings_seq(SafeLLamaContextHandle ctx, LLamaSeqId id)
-        {
-            unsafe
-            {
-                var ptr = llama_get_embeddings_seq_native(ctx, id);
-                if (ptr == null)
-                    return Array.Empty<float>();
-
-                return new Span<float>(ptr, ctx.EmbeddingSize);
-            }
-
-            [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_get_embeddings_seq")]
-            static extern unsafe float* llama_get_embeddings_seq_native(SafeLLamaContextHandle ctx, LLamaSeqId id);
-        }
+        /// <returns>A pointer to the first float in an embedding, length = ctx.EmbeddingSize</returns>
+        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern unsafe float* llama_get_embeddings_seq(SafeLLamaContextHandle ctx, LLamaSeqId id);
 
         /// <summary>
         /// Get the embeddings for the ith sequence.
         /// Equivalent to: llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
         /// </summary>
-        /// <returns></returns>
-        public static Span<float> llama_get_embeddings_ith(SafeLLamaContextHandle ctx, int i)
-        {
-            unsafe
-            {
-                var ptr = llama_get_embeddings_ith_native(ctx, i);
-                if (ptr == null)
-                    return Array.Empty<float>();
-
-                return new Span<float>(ptr, ctx.EmbeddingSize);
-            }
-
-            [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_get_embeddings_ith")]
-            static extern unsafe float* llama_get_embeddings_ith_native(SafeLLamaContextHandle ctx, int i);
-        }
+        /// <returns>A pointer to the first float in an embedding, length = ctx.EmbeddingSize</returns>
+        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern unsafe float* llama_get_embeddings_ith(SafeLLamaContextHandle ctx, int i);
 
         /// <summary>
         /// Get all output token embeddings.
@@ -182,20 +158,8 @@ namespace LLama.Native
         /// </summary>
         /// <param name="ctx"></param>
         /// <returns></returns>
-        public static Span<float> llama_get_embeddings(SafeLLamaContextHandle ctx)
-        {
-            unsafe
-            {
-                var ptr = llama_get_embeddings_native(ctx);
-                if (ptr == null)
-                    return Array.Empty<float>();
-
-                return new Span<float>(ptr, ctx.EmbeddingSize);
-            }
-
-            [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_get_embeddings")]
-            static extern unsafe float* llama_get_embeddings_native(SafeLLamaContextHandle ctx);
-        }
+        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern unsafe float* llama_get_embeddings(SafeLLamaContextHandle ctx);
 
         /// <summary>
         /// Apply chat template. Inspired by hf apply_chat_template() on python.