From f16aa58e12dfede92611c4c1dd93a8f8826c13b1 Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Tue, 25 Jul 2023 01:18:12 +0100
Subject: [PATCH 01/11] Updated to use the new loading system in llama
 (llama_state). This new system has split model weights and contexts into two
 separate things, allowing one set of weights to be shared between many
 contexts.

This change _only_ implements the low-level API and makes no effort to update the higher-level LLamaSharp abstraction.

It is built upon llama.cpp commit `b3f138d`; the necessary DLLs are **not** included in this commit.
---
 LLama/Native/LLamaContextParams.cs     | 36 ++++++++++++++++++
 LLama/Native/NativeApi.cs              | 21 ++++++++---
 LLama/Native/SafeLLamaContextHandle.cs | 51 ++++++++++++++++++++++----
 LLama/Native/SafeLlamaModelHandle.cs   | 31 ++++++++++++++++
 LLama/OldVersion/Utils.cs              | 12 ++----
 LLama/Utils.cs                         | 21 +++++------
 6 files changed, 138 insertions(+), 34 deletions(-)
 create mode 100644 LLama/Native/SafeLlamaModelHandle.cs
diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs
index 3d0e2cab..6412409e 100644
--- a/LLama/Native/LLamaContextParams.cs
+++ b/LLama/Native/LLamaContextParams.cs
@@ -13,65 +13,101 @@ namespace LLama.Native
         /// RNG seed, -1 for random
         /// </summary>
         public int seed;
+
         /// <summary>
         /// text context
         /// </summary>
         public int n_ctx;
+
         /// <summary>
         /// prompt processing batch size
         /// </summary>
         public int n_batch;
+
+        /// <summary>
+        /// grouped-query attention (TEMP - will be moved to model hparams)
+        /// </summary>
+        public int n_gqa;
+
+        /// <summary>
+        /// rms norm epsilon (TEMP - will be moved to model hparams)
+        /// </summary>
+        float rms_norm_eps;
+
         /// <summary>
         /// number of layers to store in VRAM
         /// </summary>
         public int n_gpu_layers;
+
         /// <summary>
         /// the GPU that is used for scratch and small tensors
         /// </summary>
         public int main_gpu;
+
         /// <summary>
         /// how to split layers across multiple GPUs
         /// </summary>
         public TensorSplits tensor_split;
+
+        /// <summary>
+        /// ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        /// RoPE base frequency
+        /// </summary>
+        float rope_freq_base;
+
+        /// <summary>
+        /// ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        /// RoPE frequency scaling factor
+        /// </summary>
+        float rope_freq_scale; 
+
         /// <summary>
         /// called with a progress value between 0 and 1, pass NULL to disable
         /// </summary>
         public IntPtr progress_callback;
+
         /// <summary>
         /// context pointer passed to the progress callback
         /// </summary>
         public IntPtr progress_callback_user_data;
 
+
         /// <summary>
         /// if true, reduce VRAM usage at the cost of performance
         /// </summary>
         [MarshalAs(UnmanagedType.I1)]
         public bool low_vram;
+
         /// <summary>
         /// use fp16 for KV cache
         /// </summary>
         [MarshalAs(UnmanagedType.I1)]
         public bool f16_kv;
+
         /// <summary>
         /// the llama_eval() call computes all logits, not just the last one
         /// </summary>
         [MarshalAs(UnmanagedType.I1)]
         public bool logits_all;
+
         /// <summary>
         /// only load the vocabulary, no weights
         /// </summary>
         [MarshalAs(UnmanagedType.I1)] 
         public bool vocab_only;
+
         /// <summary>
         /// use mmap if possible
         /// </summary>
         [MarshalAs(UnmanagedType.I1)] 
         public bool use_mmap;
+
         /// <summary>
         /// force system to keep model in RAM
         /// </summary>
         [MarshalAs(UnmanagedType.I1)] 
         public bool use_mlock;
+
         /// <summary>
         /// embedding mode only
         /// </summary>
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index b4d23007..629fe3f6 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -1,6 +1,4 @@
 ﻿using System;
-using System.Collections.Generic;
-using System.IO;
 using System.Runtime.InteropServices;
 using System.Text;
 using LLama.Exceptions;
@@ -29,7 +27,7 @@ namespace LLama.Native
         }
         private const string libraryName = "libllama";
 
-        [DllImport("libllama", EntryPoint = "llama_mmap_supported", CallingConvention = CallingConvention.Cdecl)]
+        [DllImport(libraryName, EntryPoint = "llama_mmap_supported", CallingConvention = CallingConvention.Cdecl)]
         public static extern bool llama_empty_call();
 
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
@@ -56,7 +54,10 @@ namespace LLama.Native
         /// <param name="params_"></param>
         /// <returns></returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern IntPtr llama_init_from_file(string path_model, LLamaContextParams params_);
+        public static extern IntPtr llama_load_model_from_file(string path_model, LLamaContextParams params_);
+
+        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern IntPtr llama_new_context_with_model(SafeLlamaModelHandle model, LLamaContextParams params_);
 
         /// <summary>
         /// not great API - very likely to change. 
@@ -65,6 +66,7 @@ namespace LLama.Native
         /// </summary>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern void llama_backend_init(bool numa);
+
         /// <summary>
         /// Frees all allocated memory
         /// </summary>
@@ -72,6 +74,13 @@ namespace LLama.Native
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern void llama_free(IntPtr ctx);
 
+        /// <summary>
+        /// Frees all allocated memory associated with a model
+        /// </summary>
+        /// <param name="model"></param>
+        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern void llama_free_model(IntPtr model);
+        
         /// <summary>
         /// Apply a LoRA adapter to a loaded model
         /// path_base_model is the path to a higher quality model to use as a base for
@@ -79,13 +88,13 @@ namespace LLama.Native
         /// The model needs to be reloaded before applying a new adapter, otherwise the adapter
         /// will be applied on top of the previous one
         /// </summary>
-        /// <param name="ctx"></param>
+        /// <param name="model_ptr"></param>
         /// <param name="path_lora"></param>
         /// <param name="path_base_model"></param>
         /// <param name="n_threads"></param>
         /// <returns>Returns 0 on success</returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern int llama_apply_lora_from_file(SafeLLamaContextHandle ctx, string path_lora, string path_base_model, int n_threads);
+        public static extern int llama_model_apply_lora_from_file(SafeLlamaModelHandle model_ptr, string path_lora, string? path_base_model, int n_threads);
 
         /// <summary>
         /// Returns the number of tokens in the KV cache
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index 5c26cb13..ab102228 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -1,26 +1,61 @@
 ﻿using System;
-using System.Collections.Generic;
-using System.Runtime.InteropServices;
-using System.Text;
+using LLama.Exceptions;
 
 namespace LLama.Native
 {
-    public class SafeLLamaContextHandle: SafeLLamaHandleBase
+    /// <summary>
+    /// A safe wrapper around a llama_context
+    /// </summary>
+    public class SafeLLamaContextHandle
+        : SafeLLamaHandleBase
     {
-        protected SafeLLamaContextHandle()
-        {
-        }
+        /// <summary>
+        /// This field guarantees that a reference to the model is held for as long as this handle is held
+        /// </summary>
+        private SafeLlamaModelHandle? _model;
 
-        public SafeLLamaContextHandle(IntPtr handle)
+        /// <summary>
+        /// Create a new SafeLLamaContextHandle
+        /// </summary>
+        /// <param name="handle">pointer to an allocated llama_context</param>
+        /// <param name="model">the model which this context was created from</param>
+        public SafeLLamaContextHandle(IntPtr handle, SafeLlamaModelHandle model)
             : base(handle)
         {
+            // Increment the model reference count while this context exists
+            _model = model;
+            var success = false;
+            _model.DangerousAddRef(ref success);
+            if (!success)
+                throw new RuntimeError("Failed to increment model refcount");
         }
 
+        /// <inheritdoc />
         protected override bool ReleaseHandle()
         {
+            // Decrement refcount on model
+            _model?.DangerousRelease();
+            _model = null;
+
             NativeApi.llama_free(handle);
             SetHandle(IntPtr.Zero);
             return true;
         }
+
+        /// <summary>
+        /// Create a new llama_context for the given model
+        /// </summary>
+        /// <param name="model"></param>
+        /// <param name="lparams"></param>
+        /// <returns></returns>
+        /// <exception cref="RuntimeError"></exception>
+        public static SafeLLamaContextHandle Create(SafeLlamaModelHandle model, LLamaContextParams lparams)
+        {
+            var ctx_ptr = NativeApi.llama_new_context_with_model(model, lparams);
+            if (ctx_ptr == IntPtr.Zero)
+                throw new RuntimeError("Failed to create context from model");
+
+            return new(ctx_ptr, model);
+        }
     }
 }
diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
new file mode 100644
index 00000000..7448efa1
--- /dev/null
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -0,0 +1,31 @@
+﻿using System;
+using LLama.Exceptions;
+
+namespace LLama.Native
+{
+    public class SafeLlamaModelHandle
+        : SafeLLamaHandleBase
+    {
+        public SafeLlamaModelHandle(IntPtr handle)
+            : base(handle)
+        {
+        }
+
+        /// <inheritdoc />
+        protected override bool ReleaseHandle()
+        {
+            NativeApi.llama_free_model(handle);
+            SetHandle(IntPtr.Zero);
+            return true;
+        }
+
+        public static SafeLlamaModelHandle LoadFromFile(string modelPath, LLamaContextParams lparams)
+        {
+            var model_ptr = NativeApi.llama_load_model_from_file(modelPath, lparams);
+            if (model_ptr == null)
+                throw new RuntimeError($"Failed to load model {modelPath}.");
+
+            return new SafeLlamaModelHandle(model_ptr);
+        }
+    }
+}
diff --git a/LLama/OldVersion/Utils.cs b/LLama/OldVersion/Utils.cs
index 4916a20d..646ce365 100644
--- a/LLama/OldVersion/Utils.cs
+++ b/LLama/OldVersion/Utils.cs
@@ -31,18 +31,12 @@ namespace LLama.OldVersion
                 throw new FileNotFoundException($"The model file does not exist: {@params.model}");
             }
 
-            var ctx_ptr = NativeApi.llama_init_from_file(@params.model, lparams);
-
-            if (ctx_ptr == IntPtr.Zero)
-            {
-                throw new RuntimeError($"Failed to load model {@params.model}.");
-            }
-
-            SafeLLamaContextHandle ctx = new(ctx_ptr);
+            var model = SafeLlamaModelHandle.LoadFromFile(@params.model, lparams);
+            var ctx = SafeLLamaContextHandle.Create(model, lparams);
 
             if (!string.IsNullOrEmpty(@params.lora_adapter))
             {
-                int err = NativeApi.llama_apply_lora_from_file(ctx, @params.lora_adapter,
+                int err = NativeApi.llama_model_apply_lora_from_file(model, @params.lora_adapter,
                     string.IsNullOrEmpty(@params.lora_base) ? null : @params.lora_base, @params.n_threads);
                 if (err != 0)
                 {
diff --git a/LLama/Utils.cs b/LLama/Utils.cs
index b6f1b7b4..8ee084ec 100644
--- a/LLama/Utils.cs
+++ b/LLama/Utils.cs
@@ -44,19 +44,18 @@ namespace LLama
                 throw new FileNotFoundException($"The model file does not exist: {@params.ModelPath}");
             }
 
-            var ctx_ptr = NativeApi.llama_init_from_file(@params.ModelPath, lparams);
-
-            if (ctx_ptr == IntPtr.Zero)
-            {
-                throw new RuntimeError($"Failed to load model {@params.ModelPath}.");
-            }
-
-            SafeLLamaContextHandle ctx = new(ctx_ptr);
+            var model = SafeLlamaModelHandle.LoadFromFile(@params.ModelPath, lparams);
+            var ctx = SafeLLamaContextHandle.Create(model, lparams);
 
             if (!string.IsNullOrEmpty(@params.LoraAdapter))
             {
-                int err = NativeApi.llama_apply_lora_from_file(ctx, @params.LoraAdapter,
-                    string.IsNullOrEmpty(@params.LoraBase) ? null : @params.LoraBase, @params.Threads);
+                var err = NativeApi.llama_model_apply_lora_from_file(
+                    model,
+                    @params.LoraAdapter,
+                    string.IsNullOrEmpty(@params.LoraBase) ? null : @params.LoraBase,
+                    @params.Threads
+                );
+
                 if (err != 0)
                 {
                     throw new RuntimeError("Failed to apply lora adapter.");
@@ -78,7 +77,7 @@ namespace LLama
             return res.Take(n);
         }
 
-        public unsafe static Span<float> GetLogits(SafeLLamaContextHandle ctx, int length)
+        public static unsafe Span<float> GetLogits(SafeLLamaContextHandle ctx, int length)
         {
             var logits = NativeApi.llama_get_logits(ctx);
             return new Span<float>(logits, length);

From c95b14d8b3f0cc182c210f72e355223499e3f325 Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Tue, 25 Jul 2023 16:23:25 +0100
Subject: [PATCH 02/11] - Fixed null check  - Additional comments

---
 LLama/Native/SafeLLamaHandleBase.cs  | 10 +++++++---
 LLama/Native/SafeLlamaModelHandle.cs | 14 ++++++++++++--
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/LLama/Native/SafeLLamaHandleBase.cs b/LLama/Native/SafeLLamaHandleBase.cs
index 023f8cdd..6371b327 100644
--- a/LLama/Native/SafeLLamaHandleBase.cs
+++ b/LLama/Native/SafeLLamaHandleBase.cs
@@ -1,11 +1,13 @@
 ﻿using System;
-using System.Collections.Generic;
 using System.Runtime.InteropServices;
-using System.Text;
 
 namespace LLama.Native
 {
-    public abstract class SafeLLamaHandleBase: SafeHandle
+    /// <summary>
+    /// Base class for all llama handles to native resources
+    /// </summary>
+    public abstract class SafeLLamaHandleBase
+        : SafeHandle
     {
         private protected SafeLLamaHandleBase()
             : base(IntPtr.Zero, ownsHandle: true)
@@ -24,8 +26,10 @@ namespace LLama.Native
             SetHandle(handle);
         }
 
+        /// <inheritdoc />
         public override bool IsInvalid => handle == IntPtr.Zero;
 
+        /// <inheritdoc />
         public override string ToString()
             => $"0x{handle.ToString("x16")}";
     }
diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
index 7448efa1..5607ccee 100644
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -3,10 +3,13 @@ using LLama.Exceptions;
 
 namespace LLama.Native
 {
+    /// <summary>
+    /// A reference to a set of llama model weights
+    /// </summary>
     public class SafeLlamaModelHandle
         : SafeLLamaHandleBase
     {
-        public SafeLlamaModelHandle(IntPtr handle)
+        internal SafeLlamaModelHandle(IntPtr handle)
             : base(handle)
         {
         }
@@ -19,10 +22,17 @@ namespace LLama.Native
             return true;
         }
 
+        /// <summary>
+        /// Load a model from the given file path into memory
+        /// </summary>
+        /// <param name="modelPath"></param>
+        /// <param name="lparams"></param>
+        /// <returns></returns>
+        /// <exception cref="RuntimeError"></exception>
         public static SafeLlamaModelHandle LoadFromFile(string modelPath, LLamaContextParams lparams)
         {
             var model_ptr = NativeApi.llama_load_model_from_file(modelPath, lparams);
-            if (model_ptr == null)
+            if (model_ptr == IntPtr.Zero)
                 throw new RuntimeError($"Failed to load model {modelPath}.");
 
             return new SafeLlamaModelHandle(model_ptr);

From 44b1e93609b6bc21a97e5fffa947c64cc690a3e2 Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Tue, 25 Jul 2023 16:35:24 +0100
Subject: [PATCH 03/11] Moved LoRA loading into `SafeLlamaModelHandle`

---
 LLama/Native/SafeLlamaModelHandle.cs | 21 +++++++++++++++++++++
 LLama/OldVersion/Utils.cs            | 10 ++--------
 LLama/Utils.cs                       | 13 +------------
 3 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
index 5607ccee..4e3b8044 100644
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -37,5 +37,26 @@ namespace LLama.Native
 
             return new SafeLlamaModelHandle(model_ptr);
         }
+
+        /// <summary>
+        /// Apply a LoRA adapter to a loaded model
+        /// </summary>
+        /// <param name="lora"></param>
+        /// <param name="modelBase">A path to a higher quality model to use as a base for the layers modified by the
+        /// adapter. Can be NULL to use the current loaded model.</param>
+        /// <param name="threads"></param>
+        /// <exception cref="RuntimeError"></exception>
+        public void ApplyLoraFromFile(string lora, string? modelBase = null, int threads = -1)
+        {
+            var err = NativeApi.llama_model_apply_lora_from_file(
+                this,
+                lora,
+                string.IsNullOrEmpty(modelBase) ? null : modelBase,
+                threads
+            );
+
+            if (err != 0)
+                throw new RuntimeError("Failed to apply lora adapter.");
+        }
     }
 }
diff --git a/LLama/OldVersion/Utils.cs b/LLama/OldVersion/Utils.cs
index 646ce365..df8adddd 100644
--- a/LLama/OldVersion/Utils.cs
+++ b/LLama/OldVersion/Utils.cs
@@ -35,14 +35,8 @@ namespace LLama.OldVersion
             var ctx = SafeLLamaContextHandle.Create(model, lparams);
 
             if (!string.IsNullOrEmpty(@params.lora_adapter))
-            {
-                int err = NativeApi.llama_model_apply_lora_from_file(model, @params.lora_adapter,
-                    string.IsNullOrEmpty(@params.lora_base) ? null : @params.lora_base, @params.n_threads);
-                if (err != 0)
-                {
-                    throw new RuntimeError("Failed to apply lora adapter.");
-                }
-            }
+                model.ApplyLoraFromFile(@params.lora_adapter, @params.lora_base, @params.n_threads);
+
             return ctx;
         }
 
diff --git a/LLama/Utils.cs b/LLama/Utils.cs
index 8ee084ec..c2dbf7aa 100644
--- a/LLama/Utils.cs
+++ b/LLama/Utils.cs
@@ -48,19 +48,8 @@ namespace LLama
             var ctx = SafeLLamaContextHandle.Create(model, lparams);
 
             if (!string.IsNullOrEmpty(@params.LoraAdapter))
-            {
-                var err = NativeApi.llama_model_apply_lora_from_file(
-                    model,
-                    @params.LoraAdapter,
-                    string.IsNullOrEmpty(@params.LoraBase) ? null : @params.LoraBase,
-                    @params.Threads
-                );
+                model.ApplyLoraFromFile(@params.LoraAdapter, @params.LoraBase, @params.Threads);
 
-                if (err != 0)
-                {
-                    throw new RuntimeError("Failed to apply lora adapter.");
-                }
-            }
             return ctx;
         }
 

From b721072aa5318a81f2ddf1f924760d92f650fe55 Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Tue, 25 Jul 2023 16:41:17 +0100
Subject: [PATCH 04/11] Exposed some extra model properties on safe handle

---
 LLama/Native/NativeApi.cs            |  9 +++++++++
 LLama/Native/SafeLlamaModelHandle.cs | 12 ++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index 629fe3f6..5218d55c 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -303,5 +303,14 @@ namespace LLama.Native
         /// <returns></returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern IntPtr llama_print_system_info();
+
+        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern int llama_n_vocab_from_model(SafeLlamaModelHandle model);
+
+        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern int llama_n_ctx_from_model(SafeLlamaModelHandle model);
+
+        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern int llama_n_embd_from_model(SafeLlamaModelHandle model);
     }
 }
diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
index 4e3b8044..e047c8fe 100644
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -9,9 +9,21 @@ namespace LLama.Native
     public class SafeLlamaModelHandle
         : SafeLLamaHandleBase
     {
+        /// <summary>
+        /// Total number of tokens in vocabulary of this model
+        /// </summary>
+        public int VocabCount { get; set; }
+
+        public int ContextSize { get; set; }
+
+        public int EmbeddingCount { get; set; }
+
         internal SafeLlamaModelHandle(IntPtr handle)
             : base(handle)
         {
+            VocabCount = NativeApi.llama_n_vocab_from_model(this);
+            ContextSize = NativeApi.llama_n_ctx_from_model(this);
+            EmbeddingCount = NativeApi.llama_n_embd_from_model(this);
         }
 
         /// <inheritdoc />

From 369c915afe5fa3ae048023cbc0f36a37c945dfe2 Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Tue, 25 Jul 2023 16:55:04 +0100
Subject: [PATCH 05/11] Added TokenToString conversion on model handle

---
 LLama/Native/NativeApi.cs            |  3 +++
 LLama/Native/SafeLlamaModelHandle.cs | 39 ++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index 5218d55c..ed6b0e5a 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -312,5 +312,8 @@ namespace LLama.Native
 
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern int llama_n_embd_from_model(SafeLlamaModelHandle model);
+
+        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern byte* llama_token_to_str_with_model(SafeLlamaModelHandle safeLlamaModelHandle, int llamaToken);
     }
 }
diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
index e047c8fe..939fc57d 100644
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -1,4 +1,6 @@
 ﻿using System;
+using System.Drawing;
+using System.Text;
 using LLama.Exceptions;
 
 namespace LLama.Native
@@ -70,5 +72,42 @@ namespace LLama.Native
             if (err != 0)
                 throw new RuntimeError("Failed to apply lora adapter.");
         }
+
+        /// <summary>
+        /// Convert a single llama token into string bytes
+        /// </summary>
+        /// <param name="llama_token"></param>
+        /// <returns></returns>
+        public ReadOnlySpan<byte> TokenToSpan(int llama_token)
+        {
+            unsafe
+            {
+                var bytes = new ReadOnlySpan<byte>(NativeApi.llama_token_to_str_with_model(this, llama_token), int.MaxValue);
+                var terminator = bytes.IndexOf((byte)0);
+                return bytes.Slice(0, terminator);
+            }
+        }
+
+        /// <summary>
+        /// Convert a single llama token into a string
+        /// </summary>
+        /// <param name="llama_token"></param>
+        /// <param name="encoding">Encoding to use to decode the bytes into a string</param>
+        /// <returns></returns>
+        public string TokenToString(int llama_token, Encoding encoding)
+        {
+            var span = TokenToSpan(llama_token);
+
+            if (span.Length == 0)
+                return "";
+
+            unsafe
+            {
+                fixed (byte* ptr = &span[0])
+                {
+                    return encoding.GetString(ptr, span.Length);
+                }
+            }
+        }
     }
 }

From afb9d24f3abee38932afdf2609625cad8812077f Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Tue, 25 Jul 2023 20:29:35 +0100
Subject: [PATCH 06/11] Added model `Tokenize` method

---
 LLama/Native/NativeApi.cs            |  5 +++-
 LLama/Native/SafeLlamaModelHandle.cs | 43 ++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index ed6b0e5a..527bea52 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -314,6 +314,9 @@ namespace LLama.Native
         public static extern int llama_n_embd_from_model(SafeLlamaModelHandle model);
 
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern byte* llama_token_to_str_with_model(SafeLlamaModelHandle safeLlamaModelHandle, int llamaToken);
+        public static extern byte* llama_token_to_str_with_model(SafeLlamaModelHandle model, int llamaToken);
+
+        [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern int llama_tokenize_with_model(SafeLlamaModelHandle model, byte* text, int* tokens, int n_max_tokens, bool add_bos);
     }
 }
diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
index 939fc57d..d1e6c230 100644
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -52,6 +52,7 @@ namespace LLama.Native
             return new SafeLlamaModelHandle(model_ptr);
         }
 
+        #region LoRA
         /// <summary>
         /// Apply a LoRA adapter to a loaded model
         /// </summary>
@@ -72,7 +73,9 @@ namespace LLama.Native
             if (err != 0)
                 throw new RuntimeError("Failed to apply lora adapter.");
         }
+        #endregion
 
+        #region tokenize
         /// <summary>
         /// Convert a single llama token into string bytes
         /// </summary>
@@ -109,5 +112,45 @@ namespace LLama.Native
                 }
             }
         }
+
+        /// <summary>
+        /// Convert a string of text into tokens
+        /// </summary>
+        /// <param name="text">Text to tokenize</param>
+        /// <param name="add_bos">Passed through to llama_tokenize_with_model as its add_bos argument</param>
+        /// <param name="encoding">Encoding used to convert <paramref name="text"/> into the bytes handed to the native tokenizer</param>
+        /// <returns>The tokens produced for the text; an empty array when the tokenizer reports no tokens</returns>
+        public int[] Tokenize(string text, bool add_bos, Encoding encoding)
+        {
+            // Convert string to bytes, adding one extra byte to the end (null terminator)
+            var bytesCount = encoding.GetByteCount(text);
+            var bytes = new byte[bytesCount + 1];
+            unsafe
+            {
+                fixed (char* charPtr = text)
+                fixed (byte* bytePtr = &bytes[0])
+                {
+                    encoding.GetBytes(charPtr, text.Length, bytePtr, bytes.Length);
+                }
+            }
+
+            unsafe
+            {
+                fixed (byte* bytesPtr = &bytes[0])
+                {
+                    // Tokenize once with no output, to get the token count. Output will be negative (indicating that there was insufficient space)
+                    var count = -NativeApi.llama_tokenize_with_model(this, bytesPtr, (int*)IntPtr.Zero, 0, add_bos);
+
+                    // Tokenize again, this time outputting into an array of exactly the right size
+                    var tokens = new int[count];
+                    fixed (int* tokensPtr = tokens) // pin the array itself: unlike &tokens[0] this does not throw when count is 0
+                    {
+                        count = NativeApi.llama_tokenize_with_model(this, bytesPtr, tokensPtr, count, add_bos);
+                        return tokens;
+                    }
+                }
+            }
+        }
+        #endregion
     }
 }

From c974c8429e3bf71deeaf1446fbd7e6737af1952a Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Tue, 25 Jul 2023 20:30:10 +0100
Subject: [PATCH 07/11] Removed leftover `using`

---
 LLama/Native/SafeLlamaModelHandle.cs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
index d1e6c230..64ba9073 100644
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -1,5 +1,4 @@
 ﻿using System;
-using System.Drawing;
 using System.Text;
 using LLama.Exceptions;
 

From 6985d3ab60264c2af14258b60db350f82028e94b Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Thu, 27 Jul 2023 18:58:29 +0100
Subject: [PATCH 08/11] Added comments on two properties

---
 LLama/Native/SafeLlamaModelHandle.cs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
index 64ba9073..79714fea 100644
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -15,8 +15,14 @@ namespace LLama.Native
         /// </summary>
         public int VocabCount { get; set; }
 
+        /// <summary>
+        /// Total number of tokens in the context
+        /// </summary>
         public int ContextSize { get; set; }
 
+        /// <summary>
+        /// Dimension of embedding vectors
+        /// </summary>
         public int EmbeddingCount { get; set; }
 
         internal SafeLlamaModelHandle(IntPtr handle)

From 3e252c81f66ca326574d5a795301732ef6bba97e Mon Sep 17 00:00:00 2001
From: sa_ddam213 <sa_ddam213@live.com>
Date: Fri, 28 Jul 2023 19:15:19 +1200
Subject: [PATCH 09/11] LLamaContextParams epsilon and tensor split changes

---
 LLama/Native/LLamaContextParams.cs | 8 ++++----
 LLama/Utils.cs                     | 9 +++------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs
index 6412409e..42f2be3f 100644
--- a/LLama/Native/LLamaContextParams.cs
+++ b/LLama/Native/LLamaContextParams.cs
@@ -32,7 +32,7 @@ namespace LLama.Native
         /// <summary>
         /// rms norm epsilon (TEMP - will be moved to model hparams)
         /// </summary>
-        float rms_norm_eps;
+       public float rms_norm_eps;
 
         /// <summary>
         /// number of layers to store in VRAM
@@ -47,19 +47,19 @@ namespace LLama.Native
         /// <summary>
         /// how to split layers across multiple GPUs
         /// </summary>
-        public TensorSplits tensor_split;
+        public float[] tensor_split;
 
         /// <summary>
         /// ref: https://github.com/ggerganov/llama.cpp/pull/2054
         /// RoPE base frequency
         /// </summary>
-        float rope_freq_base;
+        public float rope_freq_base;
 
         /// <summary>
         /// ref: https://github.com/ggerganov/llama.cpp/pull/2054
         /// RoPE frequency scaling factor
         /// </summary>
-        float rope_freq_scale; 
+        public float rope_freq_scale; 
 
         /// <summary>
         /// called with a progress value between 0 and 1, pass NULL to disable
diff --git a/LLama/Utils.cs b/LLama/Utils.cs
index c2dbf7aa..c08912cf 100644
--- a/LLama/Utils.cs
+++ b/LLama/Utils.cs
@@ -28,16 +28,13 @@ namespace LLama
             lparams.logits_all = @params.Perplexity;
             lparams.embedding = @params.EmbeddingMode;
             lparams.low_vram = @params.LowVram;
-
-            if(@params.TensorSplits.Length != 1)
+       
+            if (@params.TensorSplits.Length != 1)
             {
                 throw new ArgumentException("Currently multi-gpu support is not supported by " +
                     "both llama.cpp and LLamaSharp.");
             }
-            lparams.tensor_split = new TensorSplits()
-            {
-                Item1 = @params.TensorSplits[0]
-            };
+            lparams.tensor_split = @params.TensorSplits;
 
             if (!File.Exists(@params.ModelPath))
             {

From 2245b8490661156a02790f93f5b971a0565db7ce Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Wed, 2 Aug 2023 23:13:07 +0100
Subject: [PATCH 10/11] Update LLamaContextParams.cs

---
 LLama/Native/LLamaContextParams.cs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs
index 42f2be3f..58233ba5 100644
--- a/LLama/Native/LLamaContextParams.cs
+++ b/LLama/Native/LLamaContextParams.cs
@@ -47,6 +47,7 @@ namespace LLama.Native
         /// <summary>
         /// how to split layers across multiple GPUs
         /// </summary>
+        [MarshalAs(UnmanagedType.LPArray)]
         public float[] tensor_split;
 
         /// <summary>

From add3d5528b7bdda66617af6d6545f9e5272c6615 Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Thu, 3 Aug 2023 14:16:41 +0100
Subject: [PATCH 11/11] Removed `MarshalAs` on array

---
 LLama/Native/LLamaContextParams.cs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs
index 58233ba5..42f2be3f 100644
--- a/LLama/Native/LLamaContextParams.cs
+++ b/LLama/Native/LLamaContextParams.cs
@@ -47,7 +47,6 @@ namespace LLama.Native
         /// <summary>
         /// how to split layers across multiple GPUs
         /// </summary>
-        [MarshalAs(UnmanagedType.LPArray)]
         public float[] tensor_split;
 
         /// <summary>