Martin Evans 2 years ago
parent
commit
439d14a061
24 changed files with 2429 additions and 517 deletions
  1. +26
    -80
      LLama.Web/Common/ModelOptions.cs
  2. +20
    -20
      LLama/Abstractions/IContextParams.cs
  3. +7
    -7
      LLama/Common/ModelParams.cs
  4. +3
    -3
      LLama/Extensions/IContextParamsExtensions.cs
  5. +30
    -0
      LLama/Native/GGMLType.cs
  6. +23
    -23
      LLama/Native/LLamaContextParams.cs
  7. +97
    -0
      LLama/Native/LLamaKvCacheView.cs
  8. +61
    -0
      LLama/Native/LLamaModelKvOverride.cs
  9. +5
    -0
      LLama/Native/LLamaModelParams.cs
  10. BIN
      LLama/runtimes/deps/avx/libllama.dll
  11. BIN
      LLama/runtimes/deps/avx/libllama.so
  12. BIN
      LLama/runtimes/deps/avx2/libllama.dll
  13. BIN
      LLama/runtimes/deps/avx2/libllama.so
  14. BIN
      LLama/runtimes/deps/avx512/libllama.dll
  15. BIN
      LLama/runtimes/deps/avx512/libllama.so
  16. BIN
      LLama/runtimes/deps/cu11.7.1/libllama.dll
  17. BIN
      LLama/runtimes/deps/cu11.7.1/libllama.so
  18. BIN
      LLama/runtimes/deps/cu12.1.0/libllama.dll
  19. BIN
      LLama/runtimes/deps/cu12.1.0/libllama.so
  20. BIN
      LLama/runtimes/deps/libllama.dll
  21. BIN
      LLama/runtimes/deps/libllama.so
  22. +2157
    -384
      LLama/runtimes/deps/osx-arm64/ggml-metal.metal
  23. BIN
      LLama/runtimes/deps/osx-arm64/libllama.dylib
  24. BIN
      LLama/runtimes/deps/osx-x64/libllama.dylib

+ 26
- 80
LLama.Web/Common/ModelOptions.cs View File

@@ -17,106 +17,52 @@ namespace LLama.Web.Common
/// </summary>
public int MaxInstances { get; set; }

/// <summary>
/// Model context size (n_ctx)
/// </summary>
/// <inheritdoc />
public uint? ContextSize { get; set; }

/// <summary>
/// the GPU that is used for scratch and small tensors
/// </summary>
/// <inheritdoc />
public int MainGpu { get; set; } = 0;

/// <summary>
/// if true, reduce VRAM usage at the cost of performance
/// </summary>
public bool LowVram { get; set; } = false;

/// <summary>
/// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
/// </summary>
/// <inheritdoc />
public int GpuLayerCount { get; set; } = 20;

/// <summary>
/// Seed for the random number generator (seed)
/// </summary>
/// <inheritdoc />
public uint Seed { get; set; } = 1686349486;

/// <summary>
/// Use f16 instead of f32 for memory kv (memory_f16)
/// </summary>
public bool UseFp16Memory { get; set; } = true;

/// <summary>
/// Use mmap for faster loads (use_mmap)
/// </summary>
/// <inheritdoc />
public bool UseMemorymap { get; set; } = true;

/// <summary>
/// Use mlock to keep model in memory (use_mlock)
/// </summary>
/// <inheritdoc />
public bool UseMemoryLock { get; set; } = false;

/// <summary>
/// Compute perplexity over the prompt (perplexity)
/// </summary>
public bool Perplexity { get; set; } = false;

/// <summary>
/// Model path (model)
/// </summary>
/// <inheritdoc />
public string ModelPath { get; set; }

/// <summary>
/// List of LoRAs to apply
/// </summary>
/// <inheritdoc />
public AdapterCollection LoraAdapters { get; set; } = new();

/// <summary>

/// base model path for the lora adapter (lora_base)
/// </summary>
/// <inheritdoc />
public string LoraBase { get; set; } = string.Empty;

/// <summary>
/// Number of threads (null = autodetect) (n_threads)
/// </summary>
/// <inheritdoc />
public uint? Threads { get; set; }

/// <summary>
/// Number of threads to use for batch processing (null = autodetect) (n_threads)
/// </summary>
/// <inheritdoc />
public uint? BatchThreads { get; set; }

/// <summary>
/// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
/// </summary>
/// <inheritdoc />
public uint BatchSize { get; set; } = 512;

/// <summary>
/// Whether to convert eos to newline during the inference.
/// </summary>
public bool ConvertEosToNewLine { get; set; } = false;

/// <summary>
/// Whether to use embedding mode. (embedding) Note that if this is set to true,
/// The LLamaModel won't produce text response anymore.
/// </summary>
/// <inheritdoc />
public bool EmbeddingMode { get; set; } = false;

/// <summary>
/// how split tensors should be distributed across GPUs
/// </summary>
/// <inheritdoc />
public TensorSplitsCollection TensorSplits { get; set; } = new();

/// <summary>
/// RoPE base frequency
/// </summary>
/// <inheritdoc />
public float? RopeFrequencyBase { get; set; }

/// <summary>
/// RoPE frequency scaling factor
/// </summary>
/// <inheritdoc />
public float? RopeFrequencyScale { get; set; }

/// <inheritdoc />
@@ -137,19 +83,19 @@ namespace LLama.Web.Common
/// <inheritdoc />
public RopeScalingType? YarnScalingType { get; set; }

/// <summary>
/// Use experimental mul_mat_q kernels
/// </summary>
public bool MulMatQ { get; set; }
/// <inheritdoc />
public GGMLType? TypeK { get; set; }

/// <summary>
/// The encoding to use for models
/// </summary>
/// <inheritdoc />
public GGMLType? TypeV { get; set; }

/// <inheritdoc />
public bool NoKqvOffload { get; set; }

/// <inheritdoc />
public Encoding Encoding { get; set; } = Encoding.UTF8;

/// <summary>
/// Load vocab only (no weights)
/// </summary>
/// <inheritdoc />
public bool VocabOnly { get; set; }
}
}

+ 20
- 20
LLama/Abstractions/IContextParams.cs View File

@@ -23,16 +23,6 @@ public interface IContextParams
/// </summary>
uint Seed { get; set; }

/// <summary>
/// Use f16 instead of f32 for memory kv (memory_f16)
/// </summary>
bool UseFp16Memory { get; set; }

/// <summary>
/// Compute perplexity over the prompt (perplexity)
/// </summary>
bool Perplexity { get; set; }

/// <summary>
/// Whether to use embedding mode. (embedding) Note that if this is set to true,
/// The LLamaModel won't produce text response anymore.
@@ -49,11 +39,6 @@ public interface IContextParams
/// </summary>
float? RopeFrequencyScale { get; set; }

/// <summary>
/// Use experimental mul_mat_q kernels
/// </summary>
bool MulMatQ { get; set; }

/// <summary>
/// The encoding to use for models
/// </summary>
@@ -70,27 +55,27 @@ public interface IContextParams
uint? BatchThreads { get; set; }

/// <summary>
/// YaRN extrapolation mix factor
/// YaRN extrapolation mix factor (null = from model)
/// </summary>
float? YarnExtrapolationFactor { get; set; }

/// <summary>
/// YaRN magnitude scaling factor
/// YaRN magnitude scaling factor (null = from model)
/// </summary>
float? YarnAttentionFactor { get; set; }

/// <summary>
/// YaRN low correction dim
/// YaRN low correction dim (null = from model)
/// </summary>
float? YarnBetaFast { get; set; }

/// <summary>
/// YaRN high correction dim
/// YaRN high correction dim (null = from model)
/// </summary>
float? YarnBetaSlow { get; set; }

/// <summary>
/// YaRN original context length
/// YaRN original context length (null = from model)
/// </summary>
uint? YarnOriginalContext { get; set; }

@@ -98,4 +83,19 @@ public interface IContextParams
/// YaRN scaling method to use.
/// </summary>
RopeScalingType? YarnScalingType { get; set; }

/// <summary>
/// Override the type of the K cache
/// </summary>
GGMLType? TypeK { get; set; }

/// <summary>
/// Override the type of the V cache
/// </summary>
GGMLType? TypeV { get; set; }

/// <summary>
/// Whether to disable offloading the KQV cache to the GPU
/// </summary>
bool NoKqvOffload { get; set; }
}

+ 7
- 7
LLama/Common/ModelParams.cs View File

@@ -25,18 +25,12 @@ namespace LLama.Common
/// <inheritdoc />
public uint Seed { get; set; } = 0xFFFFFFFF;

/// <inheritdoc />
public bool UseFp16Memory { get; set; } = true;

/// <inheritdoc />
public bool UseMemorymap { get; set; } = true;

/// <inheritdoc />
public bool UseMemoryLock { get; set; }

/// <inheritdoc />
public bool Perplexity { get; set; }

/// <inheritdoc />
public string ModelPath { get; set; }

@@ -86,7 +80,13 @@ namespace LLama.Common
public RopeScalingType? YarnScalingType { get; set; }

/// <inheritdoc />
public bool MulMatQ { get; set; }
public GGMLType? TypeK { get; set; }

/// <inheritdoc />
public GGMLType? TypeV { get; set; }

/// <inheritdoc />
public bool NoKqvOffload { get; set; }

/// <inheritdoc />
public bool VocabOnly { get; set; }


+ 3
- 3
LLama/Extensions/IContextParamsExtensions.cs View File

@@ -24,8 +24,6 @@ namespace LLama.Extensions
result.n_ctx = @params.ContextSize ?? 0;
result.n_batch = @params.BatchSize;
result.seed = @params.Seed;
result.f16_kv = @params.UseFp16Memory;
result.logits_all = @params.Perplexity;
result.embedding = @params.EmbeddingMode;
result.rope_freq_base = @params.RopeFrequencyBase ?? 0;
result.rope_freq_scale = @params.RopeFrequencyScale ?? 0;
@@ -38,7 +36,9 @@ namespace LLama.Extensions
result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0;
result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.LLAMA_ROPE_SCALING_UNSPECIFIED;

result.mul_mat_q = @params.MulMatQ;
// KV cache element types: fall back to F16 when the caller did not override them.
// K and V are written to their own native fields (type_k / type_v) so that a V
// override can never clobber the K setting.
result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
// The managed option is phrased negatively (NoKqvOffload); the native flag is positive.
result.offload_kqv = !@params.NoKqvOffload;

result.n_threads = Threads(@params.Threads);
result.n_threads_batch = Threads(@params.BatchThreads);


+ 30
- 0
LLama/Native/GGMLType.cs View File

@@ -0,0 +1,30 @@
namespace LLama.Native;

/// <summary>
/// Numeric identifiers for GGML data types.
/// The explicit values must stay in sync with the native library, so members
/// keep their native-style names and numbering (including the gaps).
/// </summary>
public enum GGMLType
{
    GGML_TYPE_F32 = 0,
    GGML_TYPE_F16 = 1,
    GGML_TYPE_Q4_0 = 2,
    GGML_TYPE_Q4_1 = 3,

    // Values 4 (GGML_TYPE_Q4_2) and 5 (GGML_TYPE_Q4_3) are intentionally
    // skipped: support for both has been removed.

    GGML_TYPE_Q5_0 = 6,
    GGML_TYPE_Q5_1 = 7,
    GGML_TYPE_Q8_0 = 8,
    GGML_TYPE_Q8_1 = 9,

    // k-quantizations
    GGML_TYPE_Q2_K = 10,
    GGML_TYPE_Q3_K = 11,
    GGML_TYPE_Q4_K = 12,
    GGML_TYPE_Q5_K = 13,
    GGML_TYPE_Q6_K = 14,
    GGML_TYPE_Q8_K = 15,

    // integer types
    GGML_TYPE_I8 = 16,
    GGML_TYPE_I16 = 17,
    GGML_TYPE_I32 = 18,

    // Implicitly one past the last declared value.
    GGML_TYPE_COUNT,
}

+ 23
- 23
LLama/Native/LLamaContextParams.cs View File

@@ -56,7 +56,7 @@ namespace LLama.Native
/// </summary>
public float rope_freq_scale;
/// <summary>
/// YaRN extrapolation mix factor, NaN = from model
/// YaRN extrapolation mix factor, negative = from model
/// </summary>
public float yarn_ext_factor;
/// <summary>
@@ -75,36 +75,26 @@ namespace LLama.Native
/// <summary>
/// YaRN original context size
/// </summary>
public uint yarn_orig_ctx;
public uint yarn_orig_ctx;
/// <summary>
/// if true, use experimental mul_mat_q kernels
/// data type for K cache
/// </summary>
public bool mul_mat_q
{
readonly get => Convert.ToBoolean(_mul_mat_q);
set => _mul_mat_q = Convert.ToSByte(value);
}
private sbyte _mul_mat_q;
public GGMLType type_k;

/// <summary>
/// use fp16 for KV cache
/// data type for V cache
/// </summary>
public bool f16_kv
{
readonly get => Convert.ToBoolean(_f16_kv);
set => _f16_kv = Convert.ToSByte(value);
}
private sbyte _f16_kv;
public GGMLType type_v;

/// <summary>
/// the llama_eval() call computes all logits, not just the last one
/// Deprecated!
/// </summary>
private sbyte _mul_mat_q;

/// <summary>
/// Deprecated!
/// </summary>
public bool logits_all
{
readonly get => Convert.ToBoolean(_logits_all);
set => _logits_all = Convert.ToSByte(value);
}
private sbyte _logits_all;

/// <summary>
@@ -116,6 +106,16 @@ namespace LLama.Native
set => _embedding = Convert.ToSByte(value);
}
private sbyte _embedding;

/// <summary>
/// whether to offload the KQV ops (including the KV cache) to GPU
/// </summary>
public bool offload_kqv
{
readonly get => Convert.ToBoolean(_offload_kqv);
set => _offload_kqv = Convert.ToSByte(value);
}
private sbyte _offload_kqv;
}
}


+ 97
- 0
LLama/Native/LLamaKvCacheView.cs View File

@@ -0,0 +1,97 @@
using System.Runtime.InteropServices;

namespace LLama.Native;

/// <summary>
/// A single cell of a KV cache view (llama_kv_cache_view_cell).
/// </summary>
[StructLayout(LayoutKind.Sequential)]
public struct LLamaKvCacheViewCell
{
    /// <summary>
    /// Position of this cell, with KV cache shifts already taken into account.
    /// The value may be negative when the cell is not populated.
    /// </summary>
    public LLamaPos pos;
}

/// <summary>
/// An updateable view of the KV cache (llama_kv_cache_view).
/// Field order is significant: the struct uses sequential layout.
/// </summary>
// TODO: rewrite to safe handle?
[StructLayout(LayoutKind.Sequential)]
public unsafe struct LLamaKvCacheView
{
    /// <summary>
    /// Number of KV cache cells. This will be the same as the context size.
    /// </summary>
    private int n_cells;

    /// <summary>
    /// Maximum number of sequences that can exist in a cell. Having more
    /// sequences in a cell than this is not an error, but the extra ones
    /// will not be visible in <c>cells_sequences</c>.
    /// </summary>
    private int n_max_seq;

    /// <summary>
    /// Number of tokens in the cache. Example: two populated cells, the first
    /// holding 1 sequence id and the second holding 2, gives 3 tokens.
    /// </summary>
    private int token_count;

    /// <summary>
    /// Number of populated cache cells.
    /// </summary>
    private int used_cells;

    /// <summary>
    /// Maximum contiguous empty slots in the cache.
    /// </summary>
    private int max_contiguous;

    /// <summary>
    /// Index of the start of the <c>max_contiguous</c> slot range.
    /// Can be negative when the cache is full.
    /// </summary>
    private int max_contiguous_idx;

    /// <summary>
    /// Information for an individual cell.
    /// </summary>
    private LLamaKvCacheViewCell* cells;

    /// <summary>
    /// The sequences for each cell. There will be <c>n_max_seq</c> items per cell.
    /// </summary>
    private LLamaSeqId* cells_sequences;
}

partial class NativeApi
{
    /// <summary>
    /// Create an empty KV cache view. Intended for debugging purposes only.
    /// </summary>
    /// <param name="ctx">Context whose KV cache the view is created for</param>
    /// <param name="n_max_seq">Maximum number of sequences per cell that the view exposes</param>
    /// <returns>The newly created (empty) view</returns>
    [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
    public static extern LLamaKvCacheView llama_kv_cache_view_init(SafeLLamaContextHandle ctx, int n_max_seq);

    /// <summary>
    /// Free a KV cache view. Intended for debugging purposes only.
    /// </summary>
    /// <param name="view">Pointer to the view to free</param>
    [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
    public static unsafe extern void llama_kv_cache_view_free(LLamaKvCacheView* view);

    /// <summary>
    /// Refresh a KV cache view so that it reflects the current state of the KV cache.
    /// Intended for debugging purposes only.
    /// </summary>
    /// <param name="ctx">Context whose KV cache is inspected</param>
    /// <param name="view">Pointer to the view to update</param>
    [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
    public static unsafe extern void llama_kv_cache_view_update(SafeLLamaContextHandle ctx, LLamaKvCacheView* view);

    /// <summary>
    /// Get the number of tokens in the KV cache (slow, use only for debug).
    /// A KV cell with multiple sequences assigned to it is counted once per sequence.
    /// </summary>
    /// <param name="ctx">Context whose KV cache is inspected</param>
    /// <returns>Number of tokens in the KV cache</returns>
    [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
    public static extern int llama_get_kv_cache_token_count(SafeLLamaContextHandle ctx);

    /// <summary>
    /// Get the number of used KV cells, i.e. cells with at least one sequence assigned to them.
    /// </summary>
    /// <param name="ctx">Context whose KV cache is inspected</param>
    /// <returns>Number of used KV cells</returns>
    [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
    public static extern int llama_get_kv_cache_used_cells(SafeLLamaContextHandle ctx);
}

+ 61
- 0
LLama/Native/LLamaModelKvOverride.cs View File

@@ -0,0 +1,61 @@
using System.Runtime.InteropServices;

namespace LLama.Native;

/// <summary>
/// Override a key/value pair in the llama model metadata.
/// The explicit layout is intended to mirror the native struct: a 128 byte key
/// buffer, a 4 byte tag, then a union of the possible values.
/// </summary>
[StructLayout(LayoutKind.Explicit)]
public unsafe struct LLamaModelKvOverride
{
    /// <summary>
    /// Key to override. This is a buffer of 128 single-byte characters; it must be
    /// declared as <c>byte</c> because a C# <c>char</c> is 2 bytes wide, which would
    /// make the buffer 256 bytes long and overlap every following field.
    /// </summary>
    [FieldOffset(0)]
    public fixed byte key[128];

    /// <summary>
    /// Type of value
    /// </summary>
    [FieldOffset(128)]
    public LLamaModelKvOverrideType Tag;

    // The value union contains 8 byte members (long/double), so it is aligned to
    // 8 bytes: 4 bytes of padding follow the tag and the union starts at 136, not 132.

    /// <summary>
    /// Value, **must** only be used if Tag == LLAMA_KV_OVERRIDE_INT
    /// </summary>
    [FieldOffset(136)]
    public long IntValue;

    /// <summary>
    /// Value, **must** only be used if Tag == LLAMA_KV_OVERRIDE_FLOAT
    /// </summary>
    [FieldOffset(136)]
    public double FloatValue;

    /// <summary>
    /// Value, **must** only be used if Tag == LLAMA_KV_OVERRIDE_BOOL
    /// </summary>
    // NOTE(review): the native bool is presumably 1 byte wide; only the lowest byte
    // of this field is meaningful when reading a value written by native code - confirm.
    [FieldOffset(136)]
    public int BoolValue;
}

/// <summary>
/// Discriminator stored in <c>LLamaModelKvOverride.Tag</c>: tells which of the
/// value fields of the override is the valid one.
/// </summary>
public enum LLamaModelKvOverrideType
{
    /// <summary>The override carries an int value</summary>
    LLAMA_KV_OVERRIDE_INT = 0,

    /// <summary>The override carries a float value</summary>
    LLAMA_KV_OVERRIDE_FLOAT = 1,

    /// <summary>The override carries a bool value</summary>
    LLAMA_KV_OVERRIDE_BOOL = 2,
}

+ 5
- 0
LLama/Native/LLamaModelParams.cs View File

@@ -34,6 +34,11 @@ namespace LLama.Native
/// </summary>
public void* progress_callback_user_data;

/// <summary>
/// override key-value pairs of the model meta data
/// </summary>
public LLamaModelKvOverride* kv_overrides;

/// <summary>
/// only load the vocabulary, no weights
/// </summary>


BIN
LLama/runtimes/deps/avx/libllama.dll View File


BIN
LLama/runtimes/deps/avx/libllama.so View File


BIN
LLama/runtimes/deps/avx2/libllama.dll View File


BIN
LLama/runtimes/deps/avx2/libllama.so View File


BIN
LLama/runtimes/deps/avx512/libllama.dll View File


BIN
LLama/runtimes/deps/avx512/libllama.so View File


BIN
LLama/runtimes/deps/cu11.7.1/libllama.dll View File


BIN
LLama/runtimes/deps/cu11.7.1/libllama.so View File


BIN
LLama/runtimes/deps/cu12.1.0/libllama.dll View File


BIN
LLama/runtimes/deps/cu12.1.0/libllama.so View File


BIN
LLama/runtimes/deps/libllama.dll View File


BIN
LLama/runtimes/deps/libllama.so View File


+ 2157
- 384
LLama/runtimes/deps/osx-arm64/ggml-metal.metal
File diff suppressed because it is too large
View File


BIN
LLama/runtimes/deps/osx-arm64/libllama.dylib View File


BIN
LLama/runtimes/deps/osx-x64/libllama.dylib View File


Loading…
Cancel
Save