From f16aa58e12dfede92611c4c1dd93a8f8826c13b1 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Tue, 25 Jul 2023 01:18:12 +0100 Subject: [PATCH 01/11] Updated to use the new loading system in llama (llama_state). This new system has split model weights and contexts into two separate things, allowing one set of weights to be shared between many contexts. This change _only_ implements the low level API and makes no effort to update the LlamaSharp higher level abstraction. It is built upon llama `b3f138d`, necessary DLLs are **not** included in this commit. --- LLama/Native/LLamaContextParams.cs | 36 ++++++++++++++++++ LLama/Native/NativeApi.cs | 21 ++++++++--- LLama/Native/SafeLLamaContextHandle.cs | 51 ++++++++++++++++++++++---- LLama/Native/SafeLlamaModelHandle.cs | 31 ++++++++++++++++ LLama/OldVersion/Utils.cs | 12 ++---- LLama/Utils.cs | 21 +++++------ 6 files changed, 138 insertions(+), 34 deletions(-) create mode 100644 LLama/Native/SafeLlamaModelHandle.cs diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs index 3d0e2cab..6412409e 100644 --- a/LLama/Native/LLamaContextParams.cs +++ b/LLama/Native/LLamaContextParams.cs @@ -13,65 +13,101 @@ namespace LLama.Native /// RNG seed, -1 for random /// public int seed; + /// /// text context /// public int n_ctx; + /// /// prompt processing batch size /// public int n_batch; + + /// + /// grouped-query attention (TEMP - will be moved to model hparams) + /// + public int n_gqa; + + /// + /// rms norm epsilon (TEMP - will be moved to model hparams) + /// + float rms_norm_eps; + /// /// number of layers to store in VRAM /// public int n_gpu_layers; + /// /// the GPU that is used for scratch and small tensors /// public int main_gpu; + /// /// how to split layers across multiple GPUs /// public TensorSplits tensor_split; + + /// + /// ref: https://github.com/ggerganov/llama.cpp/pull/2054 + /// RoPE base frequency + /// + float rope_freq_base; + + /// + /// ref: https://github.com/ggerganov/llama.cpp/pull/2054 + /// RoPE frequency scaling factor + /// + float rope_freq_scale; + /// /// called with a progress value between 0 and 1, pass NULL to disable /// public IntPtr progress_callback; + /// /// context pointer passed to the progress callback /// public IntPtr progress_callback_user_data; + /// /// if true, reduce VRAM usage at the cost of performance /// [MarshalAs(UnmanagedType.I1)] public bool low_vram; + /// /// use fp16 for KV cache /// [MarshalAs(UnmanagedType.I1)] public bool f16_kv; + /// /// the llama_eval() call computes all logits, not just the last one /// [MarshalAs(UnmanagedType.I1)] public bool logits_all; + /// /// only load the vocabulary, no weights /// [MarshalAs(UnmanagedType.I1)] public bool vocab_only; + /// /// use mmap if possible /// [MarshalAs(UnmanagedType.I1)] public bool use_mmap; + /// /// force system to keep model in RAM /// [MarshalAs(UnmanagedType.I1)] public bool use_mlock; + /// /// embedding mode only /// diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index b4d23007..629fe3f6 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -1,6 +1,4 @@ using System; -using System.Collections.Generic; -using System.IO; using System.Runtime.InteropServices; using System.Text; using LLama.Exceptions; @@ -29,7 +27,7 @@ namespace LLama.Native } private const string libraryName = "libllama"; - [DllImport("libllama", EntryPoint = "llama_mmap_supported", CallingConvention = CallingConvention.Cdecl)] + [DllImport(libraryName, EntryPoint = "llama_mmap_supported", CallingConvention = CallingConvention.Cdecl)] public static extern bool llama_empty_call(); [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] @@ -56,7 +54,10 @@ namespace LLama.Native /// /// [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern IntPtr llama_init_from_file(string path_model, LLamaContextParams params_); + public static extern IntPtr llama_load_model_from_file(string path_model, LLamaContextParams params_); + + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern IntPtr llama_new_context_with_model(SafeLlamaModelHandle model, LLamaContextParams params_); /// /// not great API - very likely to change. @@ -65,6 +66,7 @@ namespace LLama.Native /// [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern void llama_backend_init(bool numa); + /// /// Frees all allocated memory /// @@ -72,6 +74,13 @@ namespace LLama.Native [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern void llama_free(IntPtr ctx); + /// + /// Frees all allocated memory associated with a model + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern void llama_free_model(IntPtr model); + /// /// Apply a LoRA adapter to a loaded model /// path_base_model is the path to a higher quality model to use as a base for @@ -79,13 +88,13 @@ namespace LLama.Native /// The model needs to be reloaded before applying a new adapter, otherwise the adapter /// will be applied on top of the previous one /// - /// + /// /// /// /// /// Returns 0 on success [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern int llama_apply_lora_from_file(SafeLLamaContextHandle ctx, string path_lora, string path_base_model, int n_threads); + public static extern int llama_model_apply_lora_from_file(SafeLlamaModelHandle model_ptr, string path_lora, string? path_base_model, int n_threads); /// /// Returns the number of tokens in the KV cache diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs index 5c26cb13..ab102228 100644 --- a/LLama/Native/SafeLLamaContextHandle.cs +++ b/LLama/Native/SafeLLamaContextHandle.cs @@ -1,26 +1,61 @@ using System; -using System.Collections.Generic; -using System.Runtime.InteropServices; -using System.Text; +using LLama.Exceptions; namespace LLama.Native { - public class SafeLLamaContextHandle: SafeLLamaHandleBase + /// + /// A safe wrapper around a llama_context + /// + public class SafeLLamaContextHandle + : SafeLLamaHandleBase { - protected SafeLLamaContextHandle() - { - } + /// + /// This field guarantees that a reference to the model is held for as long as this handle is held + /// + private SafeLlamaModelHandle? _model; - public SafeLLamaContextHandle(IntPtr handle) + /// + /// Create a new SafeLLamaContextHandle + /// + /// pointer to an allocated llama_context + /// the model which this context was created from + public SafeLLamaContextHandle(IntPtr handle, SafeLlamaModelHandle model) : base(handle) { + // Increment the model reference count while this context exists + _model = model; + var success = false; + _model.DangerousAddRef(ref success); + if (!success) + throw new RuntimeError("Failed to increment model refcount"); } + /// protected override bool ReleaseHandle() { + // Decrement refcount on model + _model?.DangerousRelease(); + _model = null; + NativeApi.llama_free(handle); SetHandle(IntPtr.Zero); return true; } + + /// + /// Create a new llama_state for the given model + /// + /// + /// + /// + /// + public static SafeLLamaContextHandle Create(SafeLlamaModelHandle model, LLamaContextParams lparams) + { + var ctx_ptr = NativeApi.llama_new_context_with_model(model, lparams); + if (ctx_ptr == IntPtr.Zero) + throw new RuntimeError("Failed to create context from model"); + + return new(ctx_ptr, model); + } } } diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs new file mode 100644 index 00000000..7448efa1 --- /dev/null +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -0,0 +1,31 @@ +using System; +using LLama.Exceptions; + +namespace LLama.Native +{ + public class SafeLlamaModelHandle + : SafeLLamaHandleBase + { + public SafeLlamaModelHandle(IntPtr handle) + : base(handle) + { + } + + /// + protected override bool ReleaseHandle() + { + NativeApi.llama_free_model(handle); + SetHandle(IntPtr.Zero); + return true; + } + + public static SafeLlamaModelHandle LoadFromFile(string modelPath, LLamaContextParams lparams) + { + var model_ptr = NativeApi.llama_load_model_from_file(modelPath, lparams); + if (model_ptr == null) + throw new RuntimeError($"Failed to load model {modelPath}."); + + return new SafeLlamaModelHandle(model_ptr); + } + } +} diff --git a/LLama/OldVersion/Utils.cs b/LLama/OldVersion/Utils.cs index 4916a20d..646ce365 100644 --- a/LLama/OldVersion/Utils.cs +++ b/LLama/OldVersion/Utils.cs @@ -31,18 +31,12 @@ namespace LLama.OldVersion throw new FileNotFoundException($"The model file does not exist: {@params.model}"); } - var ctx_ptr = NativeApi.llama_init_from_file(@params.model, lparams); - - if (ctx_ptr == IntPtr.Zero) - { - throw new RuntimeError($"Failed to load model {@params.model}."); - } - - SafeLLamaContextHandle ctx = new(ctx_ptr); + var model = SafeLlamaModelHandle.LoadFromFile(@params.model, lparams); + var ctx = SafeLLamaContextHandle.Create(model, lparams); if (!string.IsNullOrEmpty(@params.lora_adapter)) { - int err = NativeApi.llama_apply_lora_from_file(ctx, @params.lora_adapter, + int err = NativeApi.llama_model_apply_lora_from_file(model, @params.lora_adapter, string.IsNullOrEmpty(@params.lora_base) ? null : @params.lora_base, @params.n_threads); if (err != 0) { diff --git a/LLama/Utils.cs b/LLama/Utils.cs index b6f1b7b4..8ee084ec 100644 --- a/LLama/Utils.cs +++ b/LLama/Utils.cs @@ -44,19 +44,18 @@ namespace LLama throw new FileNotFoundException($"The model file does not exist: {@params.ModelPath}"); } - var ctx_ptr = NativeApi.llama_init_from_file(@params.ModelPath, lparams); - - if (ctx_ptr == IntPtr.Zero) - { - throw new RuntimeError($"Failed to load model {@params.ModelPath}."); - } - - SafeLLamaContextHandle ctx = new(ctx_ptr); + var model = SafeLlamaModelHandle.LoadFromFile(@params.ModelPath, lparams); + var ctx = SafeLLamaContextHandle.Create(model, lparams); if (!string.IsNullOrEmpty(@params.LoraAdapter)) { - int err = NativeApi.llama_apply_lora_from_file(ctx, @params.LoraAdapter, - string.IsNullOrEmpty(@params.LoraBase) ? null : @params.LoraBase, @params.Threads); + var err = NativeApi.llama_model_apply_lora_from_file( + model, + @params.LoraAdapter, + string.IsNullOrEmpty(@params.LoraBase) ? null : @params.LoraBase, + @params.Threads + ); + if (err != 0) { throw new RuntimeError("Failed to apply lora adapter."); @@ -78,7 +77,7 @@ namespace LLama return res.Take(n); } - public unsafe static Span GetLogits(SafeLLamaContextHandle ctx, int length) + public static unsafe Span GetLogits(SafeLLamaContextHandle ctx, int length) { var logits = NativeApi.llama_get_logits(ctx); return new Span(logits, length); From c95b14d8b3f0cc182c210f72e355223499e3f325 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Tue, 25 Jul 2023 16:23:25 +0100 Subject: [PATCH 02/11] - Fixed null check - Additional comments --- LLama/Native/SafeLLamaHandleBase.cs | 10 +++++++--- LLama/Native/SafeLlamaModelHandle.cs | 14 ++++++++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/LLama/Native/SafeLLamaHandleBase.cs b/LLama/Native/SafeLLamaHandleBase.cs index 023f8cdd..6371b327 100644 --- a/LLama/Native/SafeLLamaHandleBase.cs +++ b/LLama/Native/SafeLLamaHandleBase.cs @@ -1,11 +1,13 @@ using System; -using System.Collections.Generic; using System.Runtime.InteropServices; -using System.Text; namespace LLama.Native { - public abstract class SafeLLamaHandleBase: SafeHandle + /// + /// Base class for all llama handles to native resources + /// + public abstract class SafeLLamaHandleBase + : SafeHandle { private protected SafeLLamaHandleBase() : base(IntPtr.Zero, ownsHandle: true) @@ -24,8 +26,10 @@ namespace LLama.Native SetHandle(handle); } + /// public override bool IsInvalid => handle == IntPtr.Zero; + /// public override string ToString() => $"0x{handle.ToString("x16")}"; } diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index 7448efa1..5607ccee 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -3,10 +3,13 @@ using LLama.Exceptions; namespace LLama.Native { + /// + /// A reference to a set of llama model weights + /// public class SafeLlamaModelHandle : SafeLLamaHandleBase { - public SafeLlamaModelHandle(IntPtr handle) + internal SafeLlamaModelHandle(IntPtr handle) : base(handle) { } @@ -19,10 +22,17 @@ namespace LLama.Native return true; } + /// + /// Load a model from the given file path into memory + /// + /// + /// + /// + /// public static SafeLlamaModelHandle LoadFromFile(string modelPath, LLamaContextParams lparams) { var model_ptr = NativeApi.llama_load_model_from_file(modelPath, lparams); - if (model_ptr == null) + if (model_ptr == IntPtr.Zero) throw new RuntimeError($"Failed to load model {modelPath}."); return new SafeLlamaModelHandle(model_ptr); From 44b1e93609b6bc21a97e5fffa947c64cc690a3e2 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Tue, 25 Jul 2023 16:35:24 +0100 Subject: [PATCH 03/11] Moved LoRA loading into `SafeLlamaModelHandle` --- LLama/Native/SafeLlamaModelHandle.cs | 21 +++++++++++++++++++++ LLama/OldVersion/Utils.cs | 10 ++-------- LLama/Utils.cs | 13 +------------ 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index 5607ccee..4e3b8044 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -37,5 +37,26 @@ namespace LLama.Native return new SafeLlamaModelHandle(model_ptr); } + + /// + /// Apply a LoRA adapter to a loaded model + /// + /// + /// A path to a higher quality model to use as a base for the layers modified by the + /// adapter. Can be NULL to use the current loaded model. + /// + /// + public void ApplyLoraFromFile(string lora, string? modelBase = null, int threads = -1) + { + var err = NativeApi.llama_model_apply_lora_from_file( + this, + lora, + string.IsNullOrEmpty(modelBase) ? null : modelBase, + threads + ); + + if (err != 0) + throw new RuntimeError("Failed to apply lora adapter."); + } } } diff --git a/LLama/OldVersion/Utils.cs b/LLama/OldVersion/Utils.cs index 646ce365..df8adddd 100644 --- a/LLama/OldVersion/Utils.cs +++ b/LLama/OldVersion/Utils.cs @@ -35,14 +35,8 @@ namespace LLama.OldVersion var ctx = SafeLLamaContextHandle.Create(model, lparams); if (!string.IsNullOrEmpty(@params.lora_adapter)) - { - int err = NativeApi.llama_model_apply_lora_from_file(model, @params.lora_adapter, - string.IsNullOrEmpty(@params.lora_base) ? null : @params.lora_base, @params.n_threads); - if (err != 0) - { - throw new RuntimeError("Failed to apply lora adapter."); - } - } + model.ApplyLoraFromFile(@params.lora_adapter, @params.lora_base, @params.n_threads); + return ctx; } diff --git a/LLama/Utils.cs b/LLama/Utils.cs index 8ee084ec..c2dbf7aa 100644 --- a/LLama/Utils.cs +++ b/LLama/Utils.cs @@ -48,19 +48,8 @@ namespace LLama var ctx = SafeLLamaContextHandle.Create(model, lparams); if (!string.IsNullOrEmpty(@params.LoraAdapter)) - { - var err = NativeApi.llama_model_apply_lora_from_file( - model, - @params.LoraAdapter, - string.IsNullOrEmpty(@params.LoraBase) ? null : @params.LoraBase, - @params.Threads - ); + model.ApplyLoraFromFile(@params.LoraAdapter, @params.LoraBase, @params.Threads); - if (err != 0) - { - throw new RuntimeError("Failed to apply lora adapter."); - } - } return ctx; } From b721072aa5318a81f2ddf1f924760d92f650fe55 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Tue, 25 Jul 2023 16:41:17 +0100 Subject: [PATCH 04/11] Exposed some extra model properties on safe handle --- LLama/Native/NativeApi.cs | 9 +++++++++ LLama/Native/SafeLlamaModelHandle.cs | 12 ++++++++++++ 2 files changed, 21 insertions(+) diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index 629fe3f6..5218d55c 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -303,5 +303,14 @@ namespace LLama.Native /// [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern IntPtr llama_print_system_info(); + + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern int llama_n_vocab_from_model(SafeLlamaModelHandle model); + + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern int llama_n_ctx_from_model(SafeLlamaModelHandle model); + + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern int llama_n_embd_from_model(SafeLlamaModelHandle model); } } diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index 4e3b8044..e047c8fe 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -9,9 +9,21 @@ namespace LLama.Native public class SafeLlamaModelHandle : SafeLLamaHandleBase { + /// + /// Total number of tokens in vocabulary of this model + /// + public int VocabCount { get; set; } + + public int ContextSize { get; set; } + + public int EmbeddingCount { get; set; } + internal SafeLlamaModelHandle(IntPtr handle) : base(handle) { + VocabCount = NativeApi.llama_n_vocab_from_model(this); + ContextSize = NativeApi.llama_n_ctx_from_model(this); + EmbeddingCount = NativeApi.llama_n_embd_from_model(this); } /// From 369c915afe5fa3ae048023cbc0f36a37c945dfe2 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Tue, 25 Jul 2023 16:55:04 +0100 Subject: [PATCH 05/11] Added TokenToString conversion on model handle --- LLama/Native/NativeApi.cs | 3 +++ LLama/Native/SafeLlamaModelHandle.cs | 39 ++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index 5218d55c..ed6b0e5a 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -312,5 +312,8 @@ namespace LLama.Native [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern int llama_n_embd_from_model(SafeLlamaModelHandle model); + + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern byte* llama_token_to_str_with_model(SafeLlamaModelHandle safeLlamaModelHandle, int llamaToken); } } diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index e047c8fe..939fc57d 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -1,4 +1,6 @@ using System; +using System.Drawing; +using System.Text; using LLama.Exceptions; namespace LLama.Native @@ -70,5 +72,42 @@ namespace LLama.Native if (err != 0) throw new RuntimeError("Failed to apply lora adapter."); } + + /// + /// Convert a single llama token into string bytes + /// + /// + /// + public ReadOnlySpan TokenToSpan(int llama_token) + { + unsafe + { + var bytes = new ReadOnlySpan(NativeApi.llama_token_to_str_with_model(this, llama_token), int.MaxValue); + var terminator = bytes.IndexOf((byte)0); + return bytes.Slice(0, terminator); + } + } + + /// + /// Convert a single llama token into a string + /// + /// + /// Encoding to use to decode the bytes into a string + /// + public string TokenToString(int llama_token, Encoding encoding) + { + var span = TokenToSpan(llama_token); + + if (span.Length == 0) + return ""; + + unsafe + { + fixed (byte* ptr = &span[0]) + { + return encoding.GetString(ptr, span.Length); + } + } + } } } From afb9d24f3abee38932afdf2609625cad8812077f Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Tue, 25 Jul 2023 20:29:35 +0100 Subject: [PATCH 06/11] Added model `Tokenize` method --- LLama/Native/NativeApi.cs | 5 +++- LLama/Native/SafeLlamaModelHandle.cs | 43 ++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index ed6b0e5a..527bea52 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -314,6 +314,9 @@ namespace LLama.Native public static extern int llama_n_embd_from_model(SafeLlamaModelHandle model); [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern byte* llama_token_to_str_with_model(SafeLlamaModelHandle safeLlamaModelHandle, int llamaToken); + public static extern byte* llama_token_to_str_with_model(SafeLlamaModelHandle model, int llamaToken); + + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern int llama_tokenize_with_model(SafeLlamaModelHandle model, byte* text, int* tokens, int n_max_tokens, bool add_bos); } } diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index 939fc57d..d1e6c230 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -52,6 +52,7 @@ namespace LLama.Native return new SafeLlamaModelHandle(model_ptr); } + #region LoRA /// /// Apply a LoRA adapter to a loaded model /// @@ -72,7 +73,9 @@ namespace LLama.Native if (err != 0) throw new RuntimeError("Failed to apply lora adapter."); } + #endregion + #region tokenize /// /// Convert a single llama token into string bytes /// @@ -109,5 +112,45 @@ namespace LLama.Native } } } + + /// + /// Convert a string of text into tokens + /// + /// + /// + /// + /// + public int[] Tokenize(string text, bool add_bos, Encoding encoding) + { + // Convert string to bytes, adding one extra byte to the end (null terminator) + var bytesCount = encoding.GetByteCount(text); + var bytes = new byte[bytesCount + 1]; + unsafe + { + fixed (char* charPtr = text) + fixed (byte* bytePtr = &bytes[0]) + { + encoding.GetBytes(charPtr, text.Length, bytePtr, bytes.Length); + } + } + + unsafe + { + fixed (byte* bytesPtr = &bytes[0]) + { + // Tokenize once with no output, to get the token count. Output will be negative (indicating that there was insufficient space) + var count = -NativeApi.llama_tokenize_with_model(this, bytesPtr, (int*)IntPtr.Zero, 0, add_bos); + + // Tokenize again, this time outputting into an array of exactly the right size + var tokens = new int[count]; + fixed (int* tokensPtr = &tokens[0]) + { + count = NativeApi.llama_tokenize_with_model(this, bytesPtr, tokensPtr, count, add_bos); + return tokens; + } + } + } + } + #endregion } } From c974c8429e3bf71deeaf1446fbd7e6737af1952a Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Tue, 25 Jul 2023 20:30:10 +0100 Subject: [PATCH 07/11] Removed leftover `using` --- LLama/Native/SafeLlamaModelHandle.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index d1e6c230..64ba9073 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -1,5 +1,4 @@ using System; -using System.Drawing; using System.Text; using LLama.Exceptions; From 6985d3ab60264c2af14258b60db350f82028e94b Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Thu, 27 Jul 2023 18:58:29 +0100 Subject: [PATCH 08/11] Added comments on two properties --- LLama/Native/SafeLlamaModelHandle.cs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index 64ba9073..79714fea 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -15,8 +15,14 @@ namespace LLama.Native /// public int VocabCount { get; set; } + /// + /// Total number of tokens in the context + /// public int ContextSize { get; set; } + /// + /// Dimension of embedding vectors + /// public int EmbeddingCount { get; set; } internal SafeLlamaModelHandle(IntPtr handle) From 3e252c81f66ca326574d5a795301732ef6bba97e Mon Sep 17 00:00:00 2001 From: sa_ddam213 Date: Fri, 28 Jul 2023 19:15:19 +1200 Subject: [PATCH 09/11] LLamaContextParams epsilon and tensor split changes --- LLama/Native/LLamaContextParams.cs | 8 ++++---- LLama/Utils.cs | 9 +++------ 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs index 6412409e..42f2be3f 100644 --- a/LLama/Native/LLamaContextParams.cs +++ b/LLama/Native/LLamaContextParams.cs @@ -32,7 +32,7 @@ namespace LLama.Native /// /// rms norm epsilon (TEMP - will be moved to model hparams) /// - float rms_norm_eps; + public float rms_norm_eps; /// /// number of layers to store in VRAM @@ -47,19 +47,19 @@ namespace LLama.Native /// /// how to split layers across multiple GPUs /// - public TensorSplits tensor_split; + public float[] tensor_split; /// /// ref: https://github.com/ggerganov/llama.cpp/pull/2054 /// RoPE base frequency /// - float rope_freq_base; + public float rope_freq_base; /// /// ref: https://github.com/ggerganov/llama.cpp/pull/2054 /// RoPE frequency scaling factor /// - float rope_freq_scale; + public float rope_freq_scale; /// /// called with a progress value between 0 and 1, pass NULL to disable diff --git a/LLama/Utils.cs b/LLama/Utils.cs index c2dbf7aa..c08912cf 100644 --- a/LLama/Utils.cs +++ b/LLama/Utils.cs @@ -28,16 +28,13 @@ namespace LLama lparams.logits_all = @params.Perplexity; lparams.embedding = @params.EmbeddingMode; lparams.low_vram = @params.LowVram; - - if(@params.TensorSplits.Length != 1) + + if (@params.TensorSplits.Length != 1) { throw new ArgumentException("Currently multi-gpu support is not supported by " + "both llama.cpp and LLamaSharp."); } - lparams.tensor_split = new TensorSplits() - { - Item1 = @params.TensorSplits[0] - }; + lparams.tensor_split = @params.TensorSplits; if (!File.Exists(@params.ModelPath)) { From 2245b8490661156a02790f93f5b971a0565db7ce Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Wed, 2 Aug 2023 23:13:07 +0100 Subject: [PATCH 10/11] Update LLamaContextParams.cs --- LLama/Native/LLamaContextParams.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs index 42f2be3f..58233ba5 100644 --- a/LLama/Native/LLamaContextParams.cs +++ b/LLama/Native/LLamaContextParams.cs @@ -47,6 +47,7 @@ namespace LLama.Native /// /// how to split layers across multiple GPUs /// + [MarshalAs(UnmanagedType.LPArray)] public float[] tensor_split; /// From add3d5528b7bdda66617af6d6545f9e5272c6615 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Thu, 3 Aug 2023 14:16:41 +0100 Subject: [PATCH 11/11] Removed `MarshalAs` on array --- LLama/Native/LLamaContextParams.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs index 58233ba5..42f2be3f 100644 --- a/LLama/Native/LLamaContextParams.cs +++ b/LLama/Native/LLamaContextParams.cs @@ -47,7 +47,6 @@ namespace LLama.Native /// /// how to split layers across multiple GPUs /// - [MarshalAs(UnmanagedType.LPArray)] public float[] tensor_split; ///