- build run: https://github.com/SciSharp/LLamaSharp/actions/runs/7196891440
- commit: 9fb13f9584
tags/0.9.1
| @@ -17,106 +17,52 @@ namespace LLama.Web.Common | |||
| /// </summary> | |||
| public int MaxInstances { get; set; } | |||
| /// <summary> | |||
| /// Model context size (n_ctx) | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public uint? ContextSize { get; set; } | |||
| /// <summary> | |||
| /// the GPU that is used for scratch and small tensors | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public int MainGpu { get; set; } = 0; | |||
| /// <summary> | |||
| /// if true, reduce VRAM usage at the cost of performance | |||
| /// </summary> | |||
| public bool LowVram { get; set; } = false; | |||
| /// <summary> | |||
| /// Number of layers to run in VRAM / GPU memory (n_gpu_layers) | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public int GpuLayerCount { get; set; } = 20; | |||
| /// <summary> | |||
| /// Seed for the random number generator (seed) | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public uint Seed { get; set; } = 1686349486; | |||
| /// <summary> | |||
| /// Use f16 instead of f32 for memory kv (memory_f16) | |||
| /// </summary> | |||
| public bool UseFp16Memory { get; set; } = true; | |||
| /// <summary> | |||
| /// Use mmap for faster loads (use_mmap) | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public bool UseMemorymap { get; set; } = true; | |||
| /// <summary> | |||
| /// Use mlock to keep model in memory (use_mlock) | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public bool UseMemoryLock { get; set; } = false; | |||
| /// <summary> | |||
| /// Compute perplexity over the prompt (perplexity) | |||
| /// </summary> | |||
| public bool Perplexity { get; set; } = false; | |||
| /// <summary> | |||
| /// Model path (model) | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public string ModelPath { get; set; } | |||
| /// <summary> | |||
| /// List of LoRAs to apply | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public AdapterCollection LoraAdapters { get; set; } = new(); | |||
| /// <summary> | |||
| /// base model path for the lora adapter (lora_base) | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public string LoraBase { get; set; } = string.Empty; | |||
| /// <summary> | |||
| /// Number of threads (null = autodetect) (n_threads) | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public uint? Threads { get; set; } | |||
| /// <summary> | |||
| /// Number of threads to use for batch processing (null = autodetect) (n_threads) | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public uint? BatchThreads { get; set; } | |||
| /// <summary> | |||
| /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch) | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public uint BatchSize { get; set; } = 512; | |||
| /// <summary> | |||
| /// Whether to convert eos to newline during the inference. | |||
| /// </summary> | |||
| public bool ConvertEosToNewLine { get; set; } = false; | |||
| /// <summary> | |||
| /// Whether to use embedding mode. (embedding) Note that if this is set to true, | |||
| /// The LLamaModel won't produce text response anymore. | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public bool EmbeddingMode { get; set; } = false; | |||
| /// <summary> | |||
| /// how split tensors should be distributed across GPUs | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public TensorSplitsCollection TensorSplits { get; set; } = new(); | |||
| /// <summary> | |||
| /// RoPE base frequency | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public float? RopeFrequencyBase { get; set; } | |||
| /// <summary> | |||
| /// RoPE frequency scaling factor | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public float? RopeFrequencyScale { get; set; } | |||
| /// <inheritdoc /> | |||
| @@ -137,19 +83,19 @@ namespace LLama.Web.Common | |||
| /// <inheritdoc /> | |||
| public RopeScalingType? YarnScalingType { get; set; } | |||
| /// <summary> | |||
| /// Use experimental mul_mat_q kernels | |||
| /// </summary> | |||
| public bool MulMatQ { get; set; } | |||
| /// <inheritdoc /> | |||
| public GGMLType? TypeK { get; set; } | |||
| /// <summary> | |||
| /// The encoding to use for models | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public GGMLType? TypeV { get; set; } | |||
| /// <inheritdoc /> | |||
| public bool NoKqvOffload { get; set; } | |||
| /// <inheritdoc /> | |||
| public Encoding Encoding { get; set; } = Encoding.UTF8; | |||
| /// <summary> | |||
| /// Load vocab only (no weights) | |||
| /// </summary> | |||
| /// <inheritdoc /> | |||
| public bool VocabOnly { get; set; } | |||
| } | |||
| } | |||
| @@ -23,16 +23,6 @@ public interface IContextParams | |||
| /// </summary> | |||
| uint Seed { get; set; } | |||
| /// <summary> | |||
| /// Use f16 instead of f32 for memory kv (memory_f16) | |||
| /// </summary> | |||
| bool UseFp16Memory { get; set; } | |||
| /// <summary> | |||
| /// Compute perplexity over the prompt (perplexity) | |||
| /// </summary> | |||
| bool Perplexity { get; set; } | |||
| /// <summary> | |||
| /// Whether to use embedding mode. (embedding) Note that if this is set to true, | |||
| /// The LLamaModel won't produce text response anymore. | |||
| @@ -49,11 +39,6 @@ public interface IContextParams | |||
| /// </summary> | |||
| float? RopeFrequencyScale { get; set; } | |||
| /// <summary> | |||
| /// Use experimental mul_mat_q kernels | |||
| /// </summary> | |||
| bool MulMatQ { get; set; } | |||
| /// <summary> | |||
| /// The encoding to use for models | |||
| /// </summary> | |||
| @@ -70,27 +55,27 @@ public interface IContextParams | |||
| uint? BatchThreads { get; set; } | |||
| /// <summary> | |||
| /// YaRN extrapolation mix factor | |||
| /// YaRN extrapolation mix factor (null = from model) | |||
| /// </summary> | |||
| float? YarnExtrapolationFactor { get; set; } | |||
| /// <summary> | |||
| /// YaRN magnitude scaling factor | |||
| /// YaRN magnitude scaling factor (null = from model) | |||
| /// </summary> | |||
| float? YarnAttentionFactor { get; set; } | |||
| /// <summary> | |||
| /// YaRN low correction dim | |||
| /// YaRN low correction dim (null = from model) | |||
| /// </summary> | |||
| float? YarnBetaFast { get; set; } | |||
| /// <summary> | |||
| /// YaRN high correction dim | |||
| /// YaRN high correction dim (null = from model) | |||
| /// </summary> | |||
| float? YarnBetaSlow { get; set; } | |||
| /// <summary> | |||
| /// YaRN original context length | |||
| /// YaRN original context length (null = from model) | |||
| /// </summary> | |||
| uint? YarnOriginalContext { get; set; } | |||
| @@ -98,4 +83,19 @@ public interface IContextParams | |||
| /// YaRN scaling method to use. | |||
| /// </summary> | |||
| RopeScalingType? YarnScalingType { get; set; } | |||
| /// <summary> | |||
| /// Override the type of the K cache | |||
| /// </summary> | |||
| GGMLType? TypeK { get; set; } | |||
| /// <summary> | |||
| /// Override the type of the V cache | |||
| /// </summary> | |||
| GGMLType? TypeV { get; set; } | |||
| /// <summary> | |||
| /// Whether to disable offloading the KQV cache to the GPU | |||
| /// </summary> | |||
| bool NoKqvOffload { get; set; } | |||
| } | |||
| @@ -25,18 +25,12 @@ namespace LLama.Common | |||
| /// <inheritdoc /> | |||
| public uint Seed { get; set; } = 0xFFFFFFFF; | |||
| /// <inheritdoc /> | |||
| public bool UseFp16Memory { get; set; } = true; | |||
| /// <inheritdoc /> | |||
| public bool UseMemorymap { get; set; } = true; | |||
| /// <inheritdoc /> | |||
| public bool UseMemoryLock { get; set; } | |||
| /// <inheritdoc /> | |||
| public bool Perplexity { get; set; } | |||
| /// <inheritdoc /> | |||
| public string ModelPath { get; set; } | |||
| @@ -86,7 +80,13 @@ namespace LLama.Common | |||
| public RopeScalingType? YarnScalingType { get; set; } | |||
| /// <inheritdoc /> | |||
| public bool MulMatQ { get; set; } | |||
| public GGMLType? TypeK { get; set; } | |||
| /// <inheritdoc /> | |||
| public GGMLType? TypeV { get; set; } | |||
| /// <inheritdoc /> | |||
| public bool NoKqvOffload { get; set; } | |||
| /// <inheritdoc /> | |||
| public bool VocabOnly { get; set; } | |||
| @@ -24,8 +24,6 @@ namespace LLama.Extensions | |||
| result.n_ctx = @params.ContextSize ?? 0; | |||
| result.n_batch = @params.BatchSize; | |||
| result.seed = @params.Seed; | |||
| result.f16_kv = @params.UseFp16Memory; | |||
| result.logits_all = @params.Perplexity; | |||
| result.embedding = @params.EmbeddingMode; | |||
| result.rope_freq_base = @params.RopeFrequencyBase ?? 0; | |||
| result.rope_freq_scale = @params.RopeFrequencyScale ?? 0; | |||
| @@ -38,7 +36,9 @@ namespace LLama.Extensions | |||
| result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0; | |||
| result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.LLAMA_ROPE_SCALING_UNSPECIFIED; | |||
| result.mul_mat_q = @params.MulMatQ; | |||
| result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16; | |||
| // The V cache override belongs in type_v. It used to be written to type_k, which overwrote the TypeK override and never applied TypeV. | |||
| result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16; | |||
| result.offload_kqv = !@params.NoKqvOffload; | |||
| result.n_threads = Threads(@params.Threads); | |||
| result.n_threads_batch = Threads(@params.BatchThreads); | |||
| @@ -0,0 +1,30 @@ | |||
| namespace LLama.Native; | |||
| /// <summary> | |||
| /// Identifiers for tensor data types; in this codebase they select the data type of the K and V caches. | |||
| /// NOTE(review): names and numeric values appear to mirror the native ggml_type enum - confirm against ggml.h before changing any value. | |||
| /// </summary> | |||
| public enum GGMLType | |||
| { | |||
| GGML_TYPE_F32 = 0, | |||
| GGML_TYPE_F16 = 1, | |||
| GGML_TYPE_Q4_0 = 2, | |||
| GGML_TYPE_Q4_1 = 3, | |||
| // Values 4 and 5 are intentionally left unused: | |||
| // GGML_TYPE_Q4_2 = 4, support has been removed | |||
| // GGML_TYPE_Q4_3 (5) support has been removed | |||
| GGML_TYPE_Q5_0 = 6, | |||
| GGML_TYPE_Q5_1 = 7, | |||
| GGML_TYPE_Q8_0 = 8, | |||
| GGML_TYPE_Q8_1 = 9, | |||
| // k-quantizations | |||
| GGML_TYPE_Q2_K = 10, | |||
| GGML_TYPE_Q3_K = 11, | |||
| GGML_TYPE_Q4_K = 12, | |||
| GGML_TYPE_Q5_K = 13, | |||
| GGML_TYPE_Q6_K = 14, | |||
| GGML_TYPE_Q8_K = 15, | |||
| GGML_TYPE_I8 = 16, | |||
| GGML_TYPE_I16 = 17, | |||
| GGML_TYPE_I32 = 18, | |||
| // No explicit value: C# assigns the previous member's value plus one, so this is 19. | |||
| // NOTE(review): presumably a count/sentinel rather than a usable data type - confirm. | |||
| GGML_TYPE_COUNT, | |||
| } | |||
| @@ -56,7 +56,7 @@ namespace LLama.Native | |||
| /// </summary> | |||
| public float rope_freq_scale; | |||
| /// <summary> | |||
| /// YaRN extrapolation mix factor, NaN = from model | |||
| /// YaRN extrapolation mix factor, negative = from model | |||
| /// </summary> | |||
| public float yarn_ext_factor; | |||
| /// <summary> | |||
| @@ -75,36 +75,26 @@ namespace LLama.Native | |||
| /// <summary> | |||
| /// YaRN original context size | |||
| /// </summary> | |||
| public uint yarn_orig_ctx; | |||
| public uint yarn_orig_ctx; | |||
| /// <summary> | |||
| /// if true, use experimental mul_mat_q kernels | |||
| /// data type for K cache | |||
| /// </summary> | |||
| public bool mul_mat_q | |||
| { | |||
| readonly get => Convert.ToBoolean(_mul_mat_q); | |||
| set => _mul_mat_q = Convert.ToSByte(value); | |||
| } | |||
| private sbyte _mul_mat_q; | |||
| public GGMLType type_k; | |||
| /// <summary> | |||
| /// use fp16 for KV cache | |||
| /// data type for V cache | |||
| /// </summary> | |||
| public bool f16_kv | |||
| { | |||
| readonly get => Convert.ToBoolean(_f16_kv); | |||
| set => _f16_kv = Convert.ToSByte(value); | |||
| } | |||
| private sbyte _f16_kv; | |||
| public GGMLType type_v; | |||
| /// <summary> | |||
| /// the llama_eval() call computes all logits, not just the last one | |||
| /// Deprecated! | |||
| /// </summary> | |||
| private sbyte _mul_mat_q; | |||
| /// <summary> | |||
| /// Deprecated! | |||
| /// </summary> | |||
| public bool logits_all | |||
| { | |||
| readonly get => Convert.ToBoolean(_logits_all); | |||
| set => _logits_all = Convert.ToSByte(value); | |||
| } | |||
| private sbyte _logits_all; | |||
| /// <summary> | |||
| @@ -116,6 +106,16 @@ namespace LLama.Native | |||
| set => _embedding = Convert.ToSByte(value); | |||
| } | |||
| private sbyte _embedding; | |||
| /// <summary> | |||
| /// whether to offload the KQV ops (including the KV cache) to GPU | |||
| /// </summary> | |||
| public bool offload_kqv | |||
| { | |||
| readonly get => Convert.ToBoolean(_offload_kqv); | |||
| set => _offload_kqv = Convert.ToSByte(value); | |||
| } | |||
| private sbyte _offload_kqv; | |||
| } | |||
| } | |||
| @@ -0,0 +1,97 @@ | |||
| using System.Runtime.InteropServices; | |||
| namespace LLama.Native; | |||
| /// <summary> | |||
| /// Information associated with an individual cell in the KV cache view (llama_kv_cache_view_cell). Laid out sequentially; LLamaKvCacheView refers to these cells through a raw pointer. | |||
| /// </summary> | |||
| [StructLayout(LayoutKind.Sequential)] | |||
| public struct LLamaKvCacheViewCell | |||
| { | |||
| /// <summary> | |||
| /// The position for this cell. Takes KV cache shifts into account. | |||
| /// May be negative if the cell is not populated. | |||
| /// </summary> | |||
| public LLamaPos pos; | |||
| }; | |||
| /// <summary> | |||
| /// An updateable view of the KV cache (llama_kv_cache_view). Every field is declared without an access modifier, so all of them are private to this struct. | |||
| /// </summary> | |||
| //todo: rewrite to safe handle? | |||
| [StructLayout(LayoutKind.Sequential)] | |||
| public unsafe struct LLamaKvCacheView | |||
| { | |||
| /// <summary>Number of KV cache cells. This will be the same as the context size.</summary> | |||
| int n_cells; | |||
| /// <summary>Maximum number of sequences that can exist in a cell. It's not an error | |||
| /// if there are more sequences in a cell than this value, however they will | |||
| /// not be visible in the view cells_sequences.</summary> | |||
| int n_max_seq; | |||
| /// <summary>Number of tokens in the cache. For example, if there are two populated | |||
| /// cells, the first with 1 sequence id in it and the second with 2 sequence | |||
| /// ids then you'll have 3 tokens.</summary> | |||
| int token_count; | |||
| /// <summary>Number of populated cache cells.</summary> | |||
| int used_cells; | |||
| /// <summary>Maximum contiguous empty slots in the cache.</summary> | |||
| int max_contiguous; | |||
| /// <summary>Index to the start of the max_contiguous slot range. Can be negative | |||
| /// when cache is full.</summary> | |||
| int max_contiguous_idx; | |||
| /// <summary>Pointer to the information for the individual cells. NOTE(review): presumably n_cells entries - confirm.</summary> | |||
| LLamaKvCacheViewCell* cells; | |||
| /// <summary>Pointer to the sequences for each cell. There will be n_max_seq items per cell.</summary> | |||
| LLamaSeqId* cells_sequences; | |||
| } | |||
| partial class NativeApi | |||
| { | |||
| /// <summary> | |||
| /// Create an empty KV cache view. (use only for debugging purposes) | |||
| /// </summary> | |||
| /// <param name="ctx">Handle of the context passed to the native call</param> | |||
| /// <param name="n_max_seq">Maximum number of sequences per cell to expose in the view</param> | |||
| /// <returns>The new view, returned by value. NOTE(review): presumably it must later be released with llama_kv_cache_view_free - confirm.</returns> | |||
| [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] | |||
| public static extern LLamaKvCacheView llama_kv_cache_view_init(SafeLLamaContextHandle ctx, int n_max_seq); | |||
| /// <summary> | |||
| /// Free a KV cache view. (use only for debugging purposes) | |||
| /// </summary> | |||
| /// <param name="view">Pointer to the view to free</param> | |||
| [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] | |||
| public static extern unsafe void llama_kv_cache_view_free(LLamaKvCacheView* view); | |||
| /// <summary> | |||
| /// Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes) | |||
| /// </summary> | |||
| /// <param name="ctx">Handle of the context whose KV cache state is read</param> | |||
| /// <param name="view">Pointer to the view structure that is updated in place</param> | |||
| [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] | |||
| public static extern unsafe void llama_kv_cache_view_update(SafeLLamaContextHandle ctx, LLamaKvCacheView* view); | |||
| /// <summary> | |||
| /// Returns the number of tokens in the KV cache (slow, use only for debug) | |||
| /// If a KV cell has multiple sequences assigned to it, it will be counted multiple times | |||
| /// </summary> | |||
| /// <param name="ctx">Handle of the context whose KV cache is inspected</param> | |||
| /// <returns>The number of tokens in the KV cache</returns> | |||
| [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] | |||
| public static extern int llama_get_kv_cache_token_count(SafeLLamaContextHandle ctx); | |||
| /// <summary> | |||
| /// Returns the number of used KV cells (i.e. have at least one sequence assigned to them) | |||
| /// </summary> | |||
| /// <param name="ctx">Handle of the context whose KV cache is inspected</param> | |||
| /// <returns>The number of used KV cells</returns> | |||
| [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] | |||
| public static extern int llama_get_kv_cache_used_cells(SafeLLamaContextHandle ctx); | |||
| } | |||
| @@ -0,0 +1,61 @@ | |||
| using System.Runtime.InteropServices; | |||
| namespace LLama.Native; | |||
| /// <summary> | |||
| /// Override a key/value pair in the llama model metadata (llama_model_kv_override) | |||
| /// </summary> | |||
| /// <remarks> | |||
| /// Explicit layout mirroring the native struct: a 128 byte key, a 4 byte tag at offset 128, then a union of the | |||
| /// possible values. The union has 8 byte members, so it is 8 byte aligned and starts at offset 136 (not 132); | |||
| /// the total size is 144 bytes. | |||
| /// </remarks> | |||
| [StructLayout(LayoutKind.Explicit)] | |||
| public unsafe struct LLamaModelKvOverride | |||
| { | |||
| /// <summary> | |||
| /// Key to override, stored as 128 single-byte characters (native <c>char key[128]</c>). | |||
| /// This must be a byte buffer: a C# <c>char</c> is two bytes, which would make the buffer 256 bytes long | |||
| /// and overlap every other field. NOTE(review): presumably a null-terminated string - confirm. | |||
| /// </summary> | |||
| [FieldOffset(0)] | |||
| public fixed byte key[128]; | |||
| /// <summary> | |||
| /// Type of value | |||
| /// </summary> | |||
| [FieldOffset(128)] | |||
| public LLamaModelKvOverrideType Tag; | |||
| /// <summary> | |||
| /// Value, **must** only be used if Tag == LLAMA_KV_OVERRIDE_INT | |||
| /// </summary> | |||
| [FieldOffset(136)] | |||
| public long IntValue; | |||
| /// <summary> | |||
| /// Value, **must** only be used if Tag == LLAMA_KV_OVERRIDE_FLOAT | |||
| /// </summary> | |||
| [FieldOffset(136)] | |||
| public double FloatValue; | |||
| /// <summary> | |||
| /// Value, **must** only be used if Tag == LLAMA_KV_OVERRIDE_BOOL. | |||
| /// NOTE(review): the native member is a C bool, so presumably only the lowest byte is meaningful - confirm. | |||
| /// </summary> | |||
| [FieldOffset(136)] | |||
| public int BoolValue; | |||
| } | |||
| /// <summary> | |||
| /// Specifies what type of value is being overridden by LLamaModelKvOverride. NOTE(review): the numeric values presumably mirror the native llama_model_kv_override_type enum - confirm before changing them. | |||
| /// </summary> | |||
| public enum LLamaModelKvOverrideType | |||
| { | |||
| /// <summary> | |||
| /// Overriding an int value | |||
| /// </summary> | |||
| LLAMA_KV_OVERRIDE_INT = 0, | |||
| /// <summary> | |||
| /// Overriding a float value | |||
| /// </summary> | |||
| LLAMA_KV_OVERRIDE_FLOAT = 1, | |||
| /// <summary> | |||
| /// Overriding a bool value | |||
| /// </summary> | |||
| LLAMA_KV_OVERRIDE_BOOL = 2, | |||
| } | |||
| @@ -34,6 +34,11 @@ namespace LLama.Native | |||
| /// </summary> | |||
| public void* progress_callback_user_data; | |||
| /// <summary> | |||
| /// override key-value pairs of the model meta data | |||
| /// </summary> | |||
| public LLamaModelKvOverride* kv_overrides; | |||
| /// <summary> | |||
| /// only load the vocabulary, no weights | |||
| /// </summary> | |||