From b0acecf080764684040a916ce7a6d7c79876ab45 Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Sun, 21 Jan 2024 00:54:07 +0000
Subject: [PATCH 1/5] Created a new `BatchedExecutor` which processes multiple
 "Conversations" in one single inference batch. This is faster, even when the
 conversations are unrelated, and is much faster if the conversations share
 some overlap (e.g. a common system prompt prefix).

Conversations can be "forked", to create a copy of a conversation at a given point. This allows e.g. prompting a conversation with a system prefix just once and then forking it again and again for each individual conversation. Conversations can also be "rewound" to an earlier state.

Added two new examples, demonstrating forking and rewinding.
---
 LLama.Examples/Examples/BatchedDecoding.cs    | 172 ---------------
 .../Examples/BatchedExecutorFork.cs           | 138 ++++++++++++
 .../Examples/BatchedExecutorRewind.cs         | 121 ++++++++++
 LLama.Examples/Examples/Runner.cs             |   3 +-
 LLama/Batched/BatchedExecutor.cs              | 107 +++++++++
 LLama/Batched/Conversation.cs                 | 206 ++++++++++++++++++
 LLama/Batched/Exceptions.cs                   |  81 +++++++
 LLama/LLamaContext.cs                         |   4 +-
 LLama/LLamaInstructExecutor.cs                |   1 +
 LLama/LLamaInteractExecutor.cs                |   1 +
 LLama/Native/LLamaBatch.cs                    |  55 ++++-
 LLama/Sampling/BaseSamplingPipeline.cs        |  19 +-
 LLama/Sampling/DefaultSamplingPipeline.cs     |  26 ++-
 LLama/Sampling/ISamplingPipeline.cs           |  13 ++
 14 files changed, 748 insertions(+), 199 deletions(-)
 delete mode 100644 LLama.Examples/Examples/BatchedDecoding.cs
 create mode 100644 LLama.Examples/Examples/BatchedExecutorFork.cs
 create mode 100644 LLama.Examples/Examples/BatchedExecutorRewind.cs
 create mode 100644 LLama/Batched/BatchedExecutor.cs
 create mode 100644 LLama/Batched/Conversation.cs
 create mode 100644 LLama/Batched/Exceptions.cs
diff --git a/LLama.Examples/Examples/BatchedDecoding.cs b/LLama.Examples/Examples/BatchedDecoding.cs
deleted file mode 100644
index f893b613..00000000
--- a/LLama.Examples/Examples/BatchedDecoding.cs
+++ /dev/null
@@ -1,172 +0,0 @@
-﻿using System.Diagnostics;
-using System.Text;
-using LLama.Common;
-using LLama.Native;
-using LLama.Sampling;
-
-namespace LLama.Examples.Examples;
-
-/// <summary>
-/// This demonstrates generating multiple replies to the same prompt, with a shared cache
-/// </summary>
-/// <remarks>Note that this is currently using the low level API directly, future work will provide a safer C# wrapper over this!</remarks>
-public class BatchedDecoding
-{
-    private const int n_parallel = 8;
-    private const int n_len = 32;
-
-    public static async Task Run()
-    {
-        Console.Write("Please input your model path: ");
-        var modelPath = Console.ReadLine();
-
-        Console.WriteLine("Prompt (leave blank to select automatically):");
-        var prompt = Console.ReadLine();
-        if (string.IsNullOrWhiteSpace(prompt))
-            prompt = "Not many people know that";
-
-        // Load model
-        var parameters = new ModelParams(modelPath);
-
-        using var model = LLamaWeights.LoadFromFile(parameters);
-
-        // Tokenize prompt
-        var prompt_tokens = model.Tokenize(prompt, true, false, Encoding.UTF8);
-        var n_kv_req = prompt_tokens.Length + (n_len - prompt_tokens.Length) * n_parallel;
-
-        // Create a context
-        parameters.ContextSize = (uint)model.ContextSize;
-        parameters.BatchSize = (uint)Math.Max(n_len, n_parallel);
-        using var context = model.CreateContext(parameters);
-
-        var n_ctx = context.ContextSize;
-
-        // make sure the KV cache is big enough to hold all the prompt and generated tokens
-        if (n_kv_req > n_ctx)
-        {
-            await Console.Error.WriteLineAsync($"error: n_kv_req ({n_kv_req}) > n_ctx, the required KV cache size is not big enough\n");
-            await Console.Error.WriteLineAsync("        either reduce n_parallel or increase n_ctx\n");
-            return;
-        }
-
-        var batch = new LLamaBatch();
-
-        // evaluate the initial prompt
-        batch.AddRange(prompt_tokens, 0, LLamaSeqId.Zero, true);
-
-        if (await context.DecodeAsync(batch) != DecodeResult.Ok)
-        {
-            await Console.Error.WriteLineAsync("llama_decode failed");
-            return;
-        }
-
-        // assign the system KV cache to all parallel sequences
-        // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
-        for (var i = 1; i < n_parallel; ++i)
-        {
-            context.NativeHandle.KvCacheSequenceCopy((LLamaSeqId)0, (LLamaSeqId)i, 0, batch.TokenCount);
-        }
-
-        if (n_parallel > 1)
-        {
-            Console.WriteLine();
-            Console.WriteLine($"generating {n_parallel} sequences...");
-        }
-
-        // remember the batch index of the last token for each parallel sequence
-        // we need this to determine which logits to sample from
-        List<int> i_batch = new();
-        for (var i = 0; i < n_parallel; i++)
-            i_batch.Add(batch.TokenCount - 1);
-
-        // Create per-stream decoder and sampler
-        var decoders = new StreamingTokenDecoder[n_parallel];
-        var samplers = new ISamplingPipeline[n_parallel];
-        for (var i = 0; i < n_parallel; i++)
-        {
-            decoders[i] = new StreamingTokenDecoder(context);
-            samplers[i] = new DefaultSamplingPipeline
-            {
-                Temperature = 0.1f + (float)i / n_parallel,
-                MinP = 0.25f,
-            };
-        }
-
-        var n_cur = batch.TokenCount;
-        var n_decode = 0;
-
-        var timer = new Stopwatch();
-        timer.Start();
-        while (n_cur <= n_len)
-        {
-            batch.Clear();
-
-            for (var i = 0; i < n_parallel; i++)
-            {
-                // Skip completed streams
-                if (i_batch[i] < 0)
-                    continue;
-
-                // Use the sampling pipeline to select a token
-                var new_token_id = samplers[i].Sample(
-                    context.NativeHandle,
-                    context.NativeHandle.GetLogitsIth(i_batch[i]),
-                    Array.Empty<LLamaToken>()
-                );
-
-                // Finish this stream early if necessary
-                if (new_token_id == model.EndOfSentenceToken || new_token_id == model.NewlineToken)
-                {
-                    i_batch[i] = -1;
-                    Console.WriteLine($"Completed Stream {i} early");
-                    continue;
-                }
-
-                // Add this token to the decoder, so it will be turned into text
-                decoders[i].Add(new_token_id);
-
-                i_batch[i] = batch.TokenCount;
-
-                // push this new token for next evaluation
-                batch.Add(new_token_id, n_cur, (LLamaSeqId)i, true);
-
-                n_decode++;
-            }
-
-            // Check if all streams are finished
-            if (batch.TokenCount == 0)
-            {
-                break;
-            }
-
-            n_cur++;
-
-            // evaluate the current batch with the transformer model
-            if (await context.DecodeAsync(batch) != 0)
-            {
-                await Console.Error.WriteLineAsync("failed to eval");
-                return;
-            }
-        }
-
-        timer.Stop();
-        Console.ForegroundColor = ConsoleColor.Yellow;
-        Console.WriteLine();
-        Console.WriteLine($"Decoded {n_decode} tokens in {timer.ElapsedMilliseconds}ms");
-        Console.WriteLine($"Rate: {n_decode / timer.Elapsed.TotalSeconds:##.000} tokens/second");
-
-        var index = 0;
-        foreach (var stream in decoders)
-        {
-            var text = stream.Read();
-
-            Console.ForegroundColor = ConsoleColor.Green;
-            Console.Write($"{index++}. {prompt}");
-            Console.ForegroundColor = ConsoleColor.Red;
-            Console.WriteLine(text);
-        }
-
-        Console.WriteLine("Press any key to exit demo");
-        Console.ReadKey(true);
-    }
-}
\ No newline at end of file
diff --git a/LLama.Examples/Examples/BatchedExecutorFork.cs b/LLama.Examples/Examples/BatchedExecutorFork.cs
new file mode 100644
index 00000000..b834190c
--- /dev/null
+++ b/LLama.Examples/Examples/BatchedExecutorFork.cs
@@ -0,0 +1,138 @@
+﻿using LLama.Batched;
+using LLama.Common;
+using LLama.Native;
+using LLama.Sampling;
+
+namespace LLama.Examples.Examples;
+
+/// <summary>
+/// This demonstrates generating multiple replies to the same prompt, with a shared cache
+/// </summary>
+public class BatchedExecutorFork
+{
+    private const int n_split = 16;
+    private const int n_len = 64;
+
+    public static async Task Run()
+    {
+        Console.Write("Please input your model path: ");
+        var modelPath = Console.ReadLine();
+
+        var parameters = new ModelParams(modelPath);
+        using var model = LLamaWeights.LoadFromFile(parameters);
+
+        Console.WriteLine("Prompt (leave blank to select automatically):");
+        var prompt = Console.ReadLine();
+        if (string.IsNullOrWhiteSpace(prompt))
+            prompt = "Not many people know that";
+
+        // Create an executor that can evaluate a batch of conversations together
+        var executor = new BatchedExecutor(model, parameters);
+
+        // Print some info
+        var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
+        Console.WriteLine($"Created executor with model: {name}");
+
+        // Evaluate the initial prompt to create one conversation
+        var start = executor.Prompt(prompt);
+        await executor.Infer();
+
+        // Create the root node of the tree
+        var root = new Node(start);
+
+        // Run inference loop
+        for (var i = 0; i < n_len; i++)
+        {
+            if (i != 0)
+                await executor.Infer();
+
+            // Occasionally fork all the active conversations
+            if (i != 0 && i % n_split == 0)
+                root.Split();
+
+            // Sample all active conversations
+            root.Sample();
+        }
+
+        Console.WriteLine($"{prompt}...");
+        root.Print(1);
+
+        Console.WriteLine("Press any key to exit demo");
+        Console.ReadKey(true);
+    }
+
+    class Node
+    {
+        private readonly StreamingTokenDecoder _decoder;
+
+        private readonly DefaultSamplingPipeline _sampler;
+        private Conversation? _conversation;
+
+        private Node? _left;
+        private Node? _right;
+
+        public int ActiveConversationCount => _conversation != null ? 1 : _left!.ActiveConversationCount + _right!.ActiveConversationCount;
+
+        public Node(Conversation conversation)
+        {
+            _sampler = new DefaultSamplingPipeline();
+            _conversation = conversation;
+            _decoder = new StreamingTokenDecoder(conversation.Executor.Context);
+        }
+
+        public void Sample()
+        {
+            if (_conversation == null)
+            {
+                _left?.Sample();
+                _right?.Sample();
+                return;
+            }
+
+            if (_conversation.RequiresInference)
+                return;
+
+            // Sample one token
+            var ctx = _conversation.Executor.Context.NativeHandle;
+            var logitsCopy = _conversation.Sample().ToArray();
+            var token = _sampler.Sample(ctx, logitsCopy, Array.Empty<LLamaToken>());
+            _sampler.Accept(ctx, token);
+            _decoder.Add(token);
+
+            // Prompt the conversation with this token, to continue generating from there
+            _conversation.Prompt(token);
+        }
+
+        public void Split()
+        {
+            if (_conversation != null)
+            {
+                _left = new Node(_conversation.Fork());
+                _right = new Node(_conversation.Fork());
+
+                _conversation.Dispose();
+                _conversation = null;
+            }
+            else
+            {
+                _left?.Split();
+                _right?.Split();
+            }
+        }
+
+        public void Print(int indendation)
+        {
+            var colors = new[] { ConsoleColor.Red, ConsoleColor.Green, ConsoleColor.Blue, ConsoleColor.Yellow, ConsoleColor.White };
+            Console.ForegroundColor = colors[indendation % colors.Length];
+
+            var message = _decoder.Read().ReplaceLineEndings("");
+
+            var prefix = new string(' ', indendation * 3);
+            var suffix = _conversation == null ? "..." : "";
+            Console.WriteLine($"{prefix}...{message}{suffix}");
+
+            _left?.Print(indendation + 2);
+            _right?.Print(indendation + 2);
+        }
+    }
+}
\ No newline at end of file
diff --git a/LLama.Examples/Examples/BatchedExecutorRewind.cs b/LLama.Examples/Examples/BatchedExecutorRewind.cs
new file mode 100644
index 00000000..25195a56
--- /dev/null
+++ b/LLama.Examples/Examples/BatchedExecutorRewind.cs
@@ -0,0 +1,121 @@
+﻿using LLama.Batched;
+using LLama.Common;
+using LLama.Native;
+using LLama.Sampling;
+
+namespace LLama.Examples.Examples;
+
+/// <summary>
+/// This demonstrates generating tokens and then rewinding to an earlier state
+/// </summary>
+public class BatchedExecutorRewind
+{
+    private const int n_generate = 24;
+    private const int n_rewind = 12;
+    private const int n_repeats = 6;
+
+    public static async Task Run()
+    {
+        Console.Write("Please input your model path: ");
+        var modelPath = Console.ReadLine();
+
+        var parameters = new ModelParams(modelPath);
+        using var model = LLamaWeights.LoadFromFile(parameters);
+
+        Console.WriteLine("Prompt (leave blank to select automatically):");
+        var prompt = Console.ReadLine();
+        if (string.IsNullOrWhiteSpace(prompt))
+            prompt = "Not many people know that";
+
+        // Create an executor that can evaluate a batch of conversations together
+        var executor = new BatchedExecutor(model, parameters);
+
+        // Print some info
+        var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
+        Console.WriteLine($"Created executor with model: {name}");
+
+        // Evaluate the initial prompt to create one conversation
+        var conversation = executor.Prompt(prompt);
+        
+        // Create the start node wrapping the conversation
+        var node = new Node(executor.Context);
+
+        // Print the prompt
+        Console.ForegroundColor = ConsoleColor.Green;
+        Console.WriteLine(prompt);
+
+        for (var i = 0; i < n_repeats; i++)
+        {
+            for (var j = 0; j < n_generate; j++)
+            {
+                // Run inference
+                await executor.Infer();
+
+                // Sample a token
+                var token = node.Sample(conversation);
+
+                // Continue conversation with this token
+                if (j != n_generate - 1)
+                    conversation.Prompt(token);
+            }
+
+            // Write out what we generated
+            node.Write(n_rewind, i + 1);
+
+            // Rewind back a few tokens
+            conversation.Rewind(n_rewind + 1);
+
+            // Prompt with a token
+            conversation.Prompt(node.GetToken(n_generate - n_rewind - 1));
+
+            // Create a new node around the rewound conversation
+            node = new Node(executor.Context);
+        }
+
+        Console.WriteLine("Press any key to exit demo");
+        Console.ReadKey(true);
+    }
+
+    private class Node
+    {
+        private readonly LLamaContext _context;
+
+        private readonly List<LLamaToken> _tokens = new List<LLamaToken>();
+        private readonly DefaultSamplingPipeline Sampler;
+
+        public Node(LLamaContext context)
+        {
+            _context = context;
+            Sampler = new DefaultSamplingPipeline();
+        }
+
+        public LLamaToken Sample(Conversation conversation)
+        {
+            var token = Sampler.Sample(_context.NativeHandle, conversation.Sample().ToArray(), Array.Empty<LLamaToken>());
+            _tokens.Add(token);
+            return token;
+        }
+
+        public void Write(int n_rewind, int depth)
+        {
+            var decoder = new StreamingTokenDecoder(_context);
+
+            for (var i = 0; i < _tokens.Count - n_rewind; i++)
+                decoder.Add(_tokens[i]);
+
+            Console.ForegroundColor = ConsoleColor.Green;
+            Console.Write(new string(' ', depth * 3) + decoder.Read().ReplaceLineEndings(" "));
+
+            for (var i = _tokens.Count - n_rewind; i < _tokens.Count; i++)
+                decoder.Add(_tokens[i]);
+
+            Console.ForegroundColor = ConsoleColor.DarkRed;
+            Console.WriteLine(decoder.Read().ReplaceLineEndings(" "));
+        }
+
+        public LLamaToken GetToken(int index)
+        {
+            return _tokens[index];
+        }
+    }
+}
\ No newline at end of file
diff --git a/LLama.Examples/Examples/Runner.cs b/LLama.Examples/Examples/Runner.cs
index 3d9858e1..54358ce7 100644
--- a/LLama.Examples/Examples/Runner.cs
+++ b/LLama.Examples/Examples/Runner.cs
@@ -23,7 +23,8 @@ public class Runner
         { "Semantic Kernel Chat.", SemanticKernelChat.Run },
         { "Semantic Kernel Memory.", SemanticKernelMemory.Run },
         { "Coding Assistant.", CodingAssistant.Run },
-        { "Batch Decoding.", BatchedDecoding.Run },
+        { "Batched Executor (Fork)", BatchedExecutorFork.Run },
+        { "Batched Executor (Rewind)", BatchedExecutorRewind.Run },
         { "SK Kernel Memory.", KernelMemory.Run },
         { "Exit", async () => Environment.Exit(0) }
     };
diff --git a/LLama/Batched/BatchedExecutor.cs b/LLama/Batched/BatchedExecutor.cs
new file mode 100644
index 00000000..2fc3eabf
--- /dev/null
+++ b/LLama/Batched/BatchedExecutor.cs
@@ -0,0 +1,107 @@
+﻿using System;
+using System.Threading;
+using System.Threading.Tasks;
+using LLama.Abstractions;
+using LLama.Native;
+
+namespace LLama.Batched;
+
+/// <summary>
+/// A batched executor that can infer multiple separate "conversations" simultaneously.
+/// </summary>
+public sealed class BatchedExecutor
+    : IDisposable
+{
+    private int _nextSequenceId;
+
+    internal LLamaBatch Batch { get; }
+    internal ulong Epoch { get; private set; }
+
+    /// <summary>
+    /// The <see cref="LLamaContext"/> this executor is using
+    /// </summary>
+    public LLamaContext Context { get; }
+
+    /// <summary>
+    /// The <see cref="LLamaWeights"/> this executor is using
+    /// </summary>
+    public LLamaWeights Model { get; }
+
+    /// <summary>
+    /// Get the number of tokens in the batch, waiting for <see cref="Infer"/> to be called
+    /// </summary>
+    public int BatchedTokenCount => Batch.TokenCount;
+
+    /// <summary>
+    /// Check if this executor has been disposed.
+    /// </summary>
+    public bool IsDisposed { get; private set; }
+
+    /// <summary>
+    /// Create a new batched executor
+    /// </summary>
+    /// <param name="model">The model to use</param>
+    /// <param name="contextParams">Parameters to create a new context</param>
+    public BatchedExecutor(LLamaWeights model, IContextParams contextParams)
+    {
+        Model = model;
+
+        Batch = new LLamaBatch();
+        Context = model.CreateContext(contextParams);
+    }
+
+    /// <summary>
+    /// Start a new <see cref="Conversation"/> with the given prompt
+    /// </summary>
+    /// <param name="prompt"></param>
+    /// <returns></returns>
+    public Conversation Prompt(string prompt)
+    {
+        if (IsDisposed)
+            throw new ObjectDisposedException(nameof(BatchedExecutor));
+
+        var conversation = new Conversation(this, GetNextSequenceId(), 0);
+        conversation.Prompt(prompt);
+
+        return conversation;
+    }
+
+    /// <summary>
+    /// Run inference for all conversations in the batch which have pending tokens.
+    ///
+    /// If the result is `NoKvSlot` then there is not enough memory for inference, try disposing some conversation
+    /// threads and running inference again.
+    /// </summary>
+    public async Task<DecodeResult> Infer(CancellationToken cancellation = default)
+    {
+        if (IsDisposed)
+            throw new ObjectDisposedException(nameof(BatchedExecutor));
+
+        var status = await Context.DecodeAsync(Batch, cancellation);
+
+        // Only clear the batch if the result was ok. leaving all this state in place means that "Infer" can
+        // be called again after a warning (e.g. NoKvSlot).
+        if (status == DecodeResult.Ok)
+        {
+            Epoch++;
+            Batch.Clear();
+        }
+
+        return status;
+    }
+
+    /// <inheritdoc />
+    public void Dispose()
+    {
+        if (IsDisposed)
+            return;
+        IsDisposed = true;
+
+        Context.Dispose();
+    }
+
+    internal LLamaSeqId GetNextSequenceId()
+    {
+        return checked((LLamaSeqId)_nextSequenceId++);
+    }
+}
\ No newline at end of file
diff --git a/LLama/Batched/Conversation.cs b/LLama/Batched/Conversation.cs
new file mode 100644
index 00000000..3dcdec54
--- /dev/null
+++ b/LLama/Batched/Conversation.cs
@@ -0,0 +1,206 @@
+﻿using System;
+using System.Collections.Generic;
+using LLama.Native;
+
+namespace LLama.Batched;
+
+/// <summary>
+/// A single conversation thread that can be prompted (adding tokens from the user) or inferred (extracting a token from the LLM)
+/// </summary>
+public sealed class Conversation
+    : IDisposable
+{
+    private ulong _requiredEpoch;
+    private LLamaPos _end;
+    private int _batchIndex;
+    private bool _disposed;
+
+    /// <summary>
+    /// The executor which this conversation belongs to
+    /// </summary>
+    public BatchedExecutor Executor { get; }
+
+    /// <summary>
+    /// Unique ID for this conversation
+    /// </summary>
+    public LLamaSeqId ConversationId { get; }
+
+    /// <summary>
+    /// Total number of tokens in this conversation, cannot exceed the context length.
+    /// </summary>
+    public int TokenCount => _end.Value;
+
+    /// <summary>
+    /// Indicates if this conversation has been disposed, nothing can be done with a disposed conversation
+    /// </summary>
+    public bool IsDisposed => _disposed || Executor.IsDisposed;
+
+    /// <summary>
+    /// Indicates if this conversation is waiting for inference to be run on the executor. "Prompt" and "Sample" cannot be called when this is true.
+    /// </summary>
+    public bool RequiresInference => _requiredEpoch > Executor.Epoch;
+
+    /// <summary>
+    /// Indicates that this conversation should be sampled.
+    /// </summary>
+    public bool RequiresSampling => _requiredEpoch == Executor.Epoch;
+
+    #region construction/destruction
+    internal Conversation(BatchedExecutor batch, LLamaSeqId id, LLamaPos end)
+    {
+        ConversationId = id;
+        Executor = batch;
+
+        _end = end;
+    }
+
+    /// <summary>
+    /// End this conversation, freeing all resources used by it
+    /// </summary>
+    /// <exception cref="ObjectDisposedException"></exception>
+    public void Dispose()
+    {
+        if (IsDisposed)
+            return;
+        _disposed = true;
+
+        // Remove this conversation from the KV cache
+        Executor.Context.NativeHandle.KvCacheRemove(ConversationId, 0, _end);
+    }
+
+    private void AssertNotDisposed()
+    {
+        if (Executor.IsDisposed)
+            throw new ObjectDisposedException(nameof(BatchedExecutor));
+        if (IsDisposed)
+            throw new ObjectDisposedException(nameof(Conversation));
+    }
+    #endregion
+
+    /// <summary>
+    /// Create a copy of the current conversation
+    /// </summary>
+    /// <remarks>The copy shares internal state, so consumes very little extra memory.</remarks>
+    /// <returns></returns>
+    /// <exception cref="ObjectDisposedException"></exception>
+    public Conversation Fork()
+    {
+        AssertNotDisposed();
+
+        if (RequiresInference)
+            throw new CannotForkWhileRequiresInference();
+
+        // Assign tokens to the new sequence
+        var id2 = Executor.GetNextSequenceId();
+        NativeApi.llama_kv_cache_seq_cp(Executor.Context.NativeHandle, ConversationId, id2, 0, _end);
+
+        // Create a new conversation which references the current position in this one
+        var c = new Conversation(Executor, id2, _end)
+        {
+            _batchIndex = _batchIndex,
+            _requiredEpoch = _requiredEpoch,
+        };
+
+        return c;
+    }
+
+    /// <summary>
+    /// Rewind this conversation back to an earlier state
+    /// </summary>
+    /// <param name="tokens"></param>
+    /// <exception cref="ObjectDisposedException"></exception>
+    /// <exception cref="CannotForkWhileRequiresInference"></exception>
+    /// <exception cref="ArgumentOutOfRangeException">Thrown if `tokens` parameter is larger than NTokens</exception>
+    public void Rewind(int tokens)
+    {
+        AssertNotDisposed();
+
+        if (tokens > TokenCount)
+            throw new ArgumentOutOfRangeException(nameof(tokens), "Cannot rewind more than the total number of tokens");
+
+        // Remove those tokens from KV
+        Executor.Context.NativeHandle.KvCacheRemove(ConversationId, _end.Value - tokens, _end);
+
+        // Adjust "end" marker back
+        _end = _end.Value - tokens;
+    }
+
+    #region sample
+    /// <summary>
+    /// Get the logits from this conversation, ready for sampling
+    /// </summary>
+    /// <returns></returns>
+    /// <exception cref="ObjectDisposedException"></exception>
+    /// <exception cref="CannotSampleRequiresPromptException">Thrown if this conversation was not prompted before the previous call to infer</exception>
+    /// <exception cref="CannotSampleRequiresInferenceException">Thrown if Infer() must be called on the executor</exception>
+    public ReadOnlySpan<float> Sample()
+    {
+        AssertNotDisposed();
+
+        if (_requiredEpoch < Executor.Epoch)
+            throw new CannotSampleRequiresPromptException();
+        if (_requiredEpoch > Executor.Epoch)
+            throw new CannotSampleRequiresInferenceException();
+
+        return Executor.Context.NativeHandle.GetLogitsIth(_batchIndex);
+    }
+    #endregion
+
+    #region prompt
+    private void AssertCanBePrompted()
+    {
+        AssertNotDisposed();
+
+        if (RequiresInference)
+            throw new AlreadyPromptedConversationException();
+    }
+
+    /// <summary>
+    /// Add tokens to this conversation
+    /// </summary>
+    /// <param name="input"></param>
+    /// <returns></returns>
+    public void Prompt(string input)
+    {
+        AssertCanBePrompted();
+
+        Prompt(Executor.Context.Tokenize(input));
+    }
+
+    /// <summary>
+    /// Add tokens to this conversation
+    /// </summary>
+    /// <param name="tokens"></param>
+    /// <returns></returns>
+    /// <exception cref="ObjectDisposedException"></exception>
+    public void Prompt(IReadOnlyList<LLamaToken> tokens)
+    {
+        AssertCanBePrompted();
+
+        // Add the prompt to the batch
+        for (var i = 0; i < tokens.Count; i++)
+            _batchIndex = Executor.Batch.Add(tokens[i], _end++, ConversationId, i == tokens.Count - 1);
+
+        // Mark this conversation as needing inference/sampling
+        _requiredEpoch = Executor.Epoch + 1;
+    }
+
+    /// <summary>
+    /// Add a single token to this conversation
+    /// </summary>
+    /// <param name="token"></param>
+    /// <returns></returns>
+    /// <exception cref="ObjectDisposedException"></exception>
+    /// <exception cref="InvalidOperationException"></exception>
+    public void Prompt(LLamaToken token)
+    {
+        AssertCanBePrompted();
+
+        // Add this token as input
+        _batchIndex = Executor.Batch.Add(token, _end++, ConversationId, true);
+
+        // Mark this conversation as needing inference/sampling
+        _requiredEpoch = Executor.Epoch + 1;
+    }
+    #endregion
+}
\ No newline at end of file
diff --git a/LLama/Batched/Exceptions.cs b/LLama/Batched/Exceptions.cs
new file mode 100644
index 00000000..7d847bfe
--- /dev/null
+++ b/LLama/Batched/Exceptions.cs
@@ -0,0 +1,81 @@
+﻿using System;
+
+namespace LLama.Batched;
+
+/// <summary>
+/// Base class for exceptions thrown from <see cref="BatchedExecutor"/>
+/// </summary>
+public class ExperimentalBatchedExecutorException
+    : Exception
+{
+    internal ExperimentalBatchedExecutorException(string message)
+        : base(message)
+    {
+    }
+}
+
+/// <summary>
+/// This exception is thrown when "Prompt()" is called on a <see cref="Conversation"/> which has
+/// already been prompted and before "Infer()" has been called on the associated
+/// <see cref="BatchedExecutor"/>.
+/// </summary>
+public class AlreadyPromptedConversationException
+    : ExperimentalBatchedExecutorException
+{
+    internal AlreadyPromptedConversationException()
+        : base("Must call `Infer()` before prompting this Conversation again")
+    {
+    }
+}
+
+/// <summary>
+/// This exception is thrown when "Sample()" is called on a <see cref="Conversation"/> which has
+/// already been prompted and before "Infer()" has been called on the associated
+/// <see cref="BatchedExecutor"/>.
+/// </summary>
+public class CannotSampleRequiresInferenceException
+    : ExperimentalBatchedExecutorException
+{
+    internal CannotSampleRequiresInferenceException()
+        : base("Must call `Infer()` before sampling from this Conversation")
+    {
+    }
+}
+
+/// <summary>
+/// This exception is thrown when "Sample()" is called on a <see cref="Conversation"/> which was not
+/// first prompted.
+/// <see cref="BatchedExecutor"/>.
+/// </summary>
+public class CannotSampleRequiresPromptException
+    : ExperimentalBatchedExecutorException
+{
+    internal CannotSampleRequiresPromptException()
+        : base("Must call `Prompt()` and then `Infer()` before sampling from this Conversation")
+    {
+    }
+}
+
+/// <summary>
+/// This exception is thrown when "Fork()" is called on a <see cref="Conversation"/> with <see cref="Conversation.RequiresInference"/> = true
+/// </summary>
+public class CannotForkWhileRequiresInference
+    : ExperimentalBatchedExecutorException
+{
+    internal CannotForkWhileRequiresInference()
+        : base("Cannot `Fork()` a conversation while RequiresInference is true")
+    {
+    }
+}
+
+/// <summary>
+/// This exception is thrown when "Rewind()" is called on a <see cref="Conversation"/> with <see cref="Conversation.RequiresInference"/> = true
+/// </summary>
+public class CannotRewindWhileRequiresInference
+    : ExperimentalBatchedExecutorException
+{
+    internal CannotRewindWhileRequiresInference()
+        : base("Cannot `Rewind()` a conversation while RequiresInference is true")
+    {
+    }
+}
\ No newline at end of file
diff --git a/LLama/LLamaContext.cs b/LLama/LLamaContext.cs
index 5d026b67..9075c89f 100644
--- a/LLama/LLamaContext.cs
+++ b/LLama/LLamaContext.cs
@@ -221,7 +221,9 @@ namespace LLama
         /// <returns>The selected token</returns>
         public LLamaToken Sample(ISamplingPipeline pipeline, ReadOnlySpan<LLamaToken> lastTokens)
         {
-            return pipeline.Sample(NativeHandle, NativeHandle.GetLogits(), lastTokens);
+            var token = pipeline.Sample(NativeHandle, NativeHandle.GetLogits(), lastTokens);
+            pipeline.Accept(NativeHandle, token);
+            return token;
         }
 
         /// <summary>
diff --git a/LLama/LLamaInstructExecutor.cs b/LLama/LLamaInstructExecutor.cs
index 969da783..31e975ca 100644
--- a/LLama/LLamaInstructExecutor.cs
+++ b/LLama/LLamaInstructExecutor.cs
@@ -213,6 +213,7 @@ namespace LLama
                 if (inferenceParams.SamplingPipeline is not null)
                 {
                     id = inferenceParams.SamplingPipeline.Sample(Context.NativeHandle, Context.NativeHandle.GetLogits(), _last_n_tokens.ToArray());
+                    inferenceParams.SamplingPipeline.Accept(Context.NativeHandle, id);
                 }
                 else
                 {
diff --git a/LLama/LLamaInteractExecutor.cs b/LLama/LLamaInteractExecutor.cs
index 7d742c81..9338d839 100644
--- a/LLama/LLamaInteractExecutor.cs
+++ b/LLama/LLamaInteractExecutor.cs
@@ -192,6 +192,7 @@ namespace LLama
                 if (inferenceParams.SamplingPipeline is not null)
                 {
                     id = inferenceParams.SamplingPipeline.Sample(Context.NativeHandle, Context.NativeHandle.GetLogits(), _last_n_tokens.ToArray());
+                    inferenceParams.SamplingPipeline.Accept(Context.NativeHandle, id);
                 }
                 else
                 {
diff --git a/LLama/Native/LLamaBatch.cs b/LLama/Native/LLamaBatch.cs
index 532e16ff..50b9c8f0 100644
--- a/LLama/Native/LLamaBatch.cs
+++ b/LLama/Native/LLamaBatch.cs
@@ -18,6 +18,11 @@ public class LLamaBatch
     private LLamaSeqId[][] _sequenceIds;
     private IntPtr[] _sequenceIdsPtrs;
 
+    /// <summary>
+    /// Keep track of the index of existing token/position combos in the batch
+    /// </summary>
+    private readonly Dictionary<(LLamaToken, LLamaPos), int> _index = new();
+
     /// <summary>
     /// The number of tokens in this batch
     /// </summary>
@@ -130,23 +135,44 @@ public class LLamaBatch
     /// <param name="pos">The position to add it att</param>
     /// <param name="sequences">The set of sequences to add this token to</param>
     /// <param name="logits"></param>
-    public void Add(LLamaToken token, LLamaPos pos, ReadOnlySpan<LLamaSeqId> sequences, bool logits)
+    /// <returns>The index that the token was added at. Use this for GetLogitsIth</returns>
+    public int Add(LLamaToken token, LLamaPos pos, ReadOnlySpan<LLamaSeqId> sequences, bool logits)
     {
+        // Try to find this (token, position) combo somewhere in the batch to re-use it
+        if (_index.TryGetValue((token, pos), out var existingIndex))
+        {
+            if (_sequenceIdCount[existingIndex] + sequences.Length > SequenceCapacity)
+                GrowMaxSequences(_sequenceIdCount[existingIndex] + sequences.Length);
+
+            foreach (var sequence in sequences)
+            {
+                _sequenceIds[existingIndex][_sequenceIdCount[existingIndex]] = sequence;
+                _sequenceIdCount[existingIndex]++;
+            }
+
+            return existingIndex;
+        }
+
+        // Couldn't find this it in the batch, add a new item
+
+        // Frow capacity as necessary
         if (TokenCount == TokenCapacity)
             GrowTokenCapacity();
         if (sequences.Length > SequenceCapacity)
             GrowMaxSequences(sequences.Length);
 
+        // Store the position in the index, so it can be found later
+        _index.Add((token, pos), TokenCount);
+
+        // Add the items to the arrays
         _tokens[TokenCount] = token;
         _positions[TokenCount] = pos;
-
         _sequenceIdCount[TokenCount] = sequences.Length;
         for (var i = 0; i < sequences.Length; i++)
             _sequenceIds[TokenCount][i] = sequences[i];
-
         _logits[TokenCount] = Convert.ToByte(logits);
 
-        TokenCount++;
+        return TokenCount++;
     }
 
     /// <summary>
@@ -157,11 +183,12 @@ public class LLamaBatch
     /// <param name="pos">The position to add it att</param>
     /// <param name="sequences">The set of sequences to add this token to</param>
     /// <param name="logits"></param>
-    public void Add(LLamaToken token, LLamaPos pos, List<LLamaSeqId> sequences, bool logits)
+    /// <returns>The index that the token was added at. Use this for GetLogitsIth</returns>
+    public int Add(LLamaToken token, LLamaPos pos, List<LLamaSeqId> sequences, bool logits)
     {
 #if NET5_0_OR_GREATER
         var seqSpan = CollectionsMarshal.AsSpan(sequences);
-        Add(token, pos, seqSpan, logits);
+        return Add(token, pos, seqSpan, logits);
 #else
         // on netstandard2.0 we can't use CollectionsMarshal to get directly at the internal memory of
         // the list. Instead rent an array and copy the data into it. This avoids an allocation, but can't
@@ -171,7 +198,7 @@ public class LLamaBatch
         try
         {
             sequences.CopyTo(rented, 0);
-            Add(token, pos, rented.AsSpan(0, sequences.Count), logits);
+            return Add(token, pos, rented.AsSpan(0, sequences.Count), logits);
         }
         finally
         {
@@ -188,14 +215,15 @@ public class LLamaBatch
     /// <param name="pos">The position to add it att</param>
     /// <param name="sequence">The sequence to add this token to</param>
     /// <param name="logits"></param>
-    public void Add(LLamaToken token, LLamaPos pos, LLamaSeqId sequence, bool logits)
+    /// <returns>The index that the token was added at. Use this for GetLogitsIth</returns>
+    public int Add(LLamaToken token, LLamaPos pos, LLamaSeqId sequence, bool logits)
     {
         // Create a temporary span to contain 1 item without allocating
         Span<LLamaSeqId> sequences = stackalloc LLamaSeqId[1];
         sequences[0] = sequence;
 
         // Add it
-        Add(token, pos, sequences, logits);
+        return Add(token, pos, sequences, logits);
     }
 
     /// <summary>
@@ -205,13 +233,17 @@ public class LLamaBatch
     /// <param name="start">The starting position to add tokens at</param>
     /// <param name="sequence">The sequence to add this token to</param>
     /// <param name="logitsLast">Whether the final token should generate logits</param>
-    public void AddRange(ReadOnlySpan<LLamaToken> tokens, LLamaPos start, LLamaSeqId sequence, bool logitsLast)
+    /// <returns>The index that the final token was added at. Use this for GetLogitsIth</returns>
+    public int AddRange(ReadOnlySpan<LLamaToken> tokens, LLamaPos start, LLamaSeqId sequence, bool logitsLast)
     {
+        var last = -1;
         for (var i = 0; i < tokens.Length; i++)
         {
             var logits = (i == tokens.Length - 1) & logitsLast;
-            Add(tokens[i], start.Value + i, sequence, logits);
+            last = Add(tokens[i], start.Value + i, sequence, logits);
         }
+
+        return last;
     }
 #endregion
 
@@ -221,5 +253,6 @@ public class LLamaBatch
     public void Clear()
     {
         TokenCount = 0;
+        _index.Clear();
     }
 }
\ No newline at end of file
diff --git a/LLama/Sampling/BaseSamplingPipeline.cs b/LLama/Sampling/BaseSamplingPipeline.cs
index a41aa67e..b86001aa 100644
--- a/LLama/Sampling/BaseSamplingPipeline.cs
+++ b/LLama/Sampling/BaseSamplingPipeline.cs
@@ -40,10 +40,7 @@ public abstract class BaseSamplingPipeline
             var candidates = LLamaTokenDataArray.Create(logits);
 
             // Process token data array
-            ProcessTokenDataArray(ctx, candidates, lastTokens);
-
-            // Choose the final value
-            return ChooseToken(ctx, candidates);
+            return ProcessTokenDataArray(ctx, candidates, lastTokens);
         }
         finally
         {
@@ -53,6 +50,9 @@ public abstract class BaseSamplingPipeline
         }
     }
 
+    /// <inheritdoc />
+    public abstract void Accept(SafeLLamaContextHandle ctx, LLamaToken token);
+
     #region protected tokens
     /// <summary>
     /// Get all of the "protected" tokens that cannot be changed by ProcessLogits
@@ -107,19 +107,14 @@ public abstract class BaseSamplingPipeline
     /// <returns></returns>
     protected abstract LLamaToken ProcessTokenDataArray(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, ReadOnlySpan<LLamaToken> lastTokens);
 
-    /// <summary>
-    /// Choose the final token from the candidates
-    /// </summary>
-    /// <param name="ctx"></param>
-    /// <param name="candidates"></param>
-    /// <returns></returns>
-    protected abstract LLamaToken ChooseToken(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates);
-
     /// <inheritdoc/>
     public virtual void Reset()
     {
     }
 
+    /// <inheritdoc />
+    public abstract ISamplingPipeline Clone();
+
     /// <inheritdoc/>
     public virtual void Dispose()
     {
diff --git a/LLama/Sampling/DefaultSamplingPipeline.cs b/LLama/Sampling/DefaultSamplingPipeline.cs
index b0fb5c59..531f34fa 100644
--- a/LLama/Sampling/DefaultSamplingPipeline.cs
+++ b/LLama/Sampling/DefaultSamplingPipeline.cs
@@ -141,9 +141,31 @@ public sealed class DefaultSamplingPipeline
         return id;
     }
 
+    public override void Accept(SafeLLamaContextHandle ctx, LLamaToken token)
+    {
+        Grammar?.AcceptToken(ctx, token);
+    }
+
     /// <inheritdoc />
-    protected override LLamaToken ChooseToken(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates)
+    public override ISamplingPipeline Clone()
     {
-        return candidates.SampleToken(ctx);
+        var clone = new DefaultSamplingPipeline();
+
+        foreach (var (k, v) in LogitBias)
+            clone.LogitBias.Add(k, v);
+
+        clone.Grammar = Grammar?.Clone();
+        clone.RepeatPenalty = RepeatPenalty;
+        clone.AlphaFrequency = AlphaFrequency;
+        clone.AlphaPresence = AlphaPresence;
+        clone.Temperature = Temperature;
+        clone.TopK = TopK;
+        clone.TailFreeZ = TailFreeZ;
+        clone.TypicalP = TypicalP;
+        clone.TopP = TopP;
+        clone.MinP = MinP;
+        clone.PenalizeNewline = PenalizeNewline;
+
+        return clone;
     }
 }
\ No newline at end of file
diff --git a/LLama/Sampling/ISamplingPipeline.cs b/LLama/Sampling/ISamplingPipeline.cs
index be139879..b538d1fe 100644
--- a/LLama/Sampling/ISamplingPipeline.cs
+++ b/LLama/Sampling/ISamplingPipeline.cs
@@ -21,10 +21,23 @@ public interface ISamplingPipeline
     /// <returns></returns>
     LLamaToken Sample(SafeLLamaContextHandle ctx, Span<float> logits, ReadOnlySpan<LLamaToken> lastTokens);
 
+    /// <summary>
+    /// Update the pipeline, with knowledge that a particular token was just accepted
+    /// </summary>
+    /// <param name="ctx"></param>
+    /// <param name="token"></param>
+    void Accept(SafeLLamaContextHandle ctx, LLamaToken token);
+
     /// <summary>
     /// Reset all internal state of the sampling pipeline
     /// </summary>
     void Reset();
+
+    /// <summary>
+    /// Create a copy of this sampling pipeline
+    /// </summary>
+    /// <returns></returns>
+    ISamplingPipeline Clone();
 }
 
 /// <summary>

From 949861a5816df3122eda3663703caff1f6688ff8 Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Sun, 11 Feb 2024 23:20:05 +0000
Subject: [PATCH 2/5] - Added a `Modify` method to `Conversation`. This grants
 **temporary** access to directly modify the KV cache.  - Re-implmented
 `Rewind` as an extension method using `Modify` internally  - Implemented
 `ShiftLeft`, which shifts everything over except for some starting tokens.
 This is the same as the `StatelessExecutor` out-of-context handling.  -
 Starting batch at epoch 1, this ensures that conversations (starting at zero)
 are below the current epoch. It also means `0` can always be used as a value
 guaranteed to be below the current epoch.

---
 LLama/Batched/BatchedExecutor.cs        |   2 +-
 LLama/Batched/Conversation.cs           | 115 ++++++++++++++++++------
 LLama/Batched/ConversationExtensions.cs |  59 ++++++++++++
 LLama/Batched/Exceptions.cs             |  10 +--
 LLama/Native/NativeApi.cs               |   2 +-
 LLama/Native/SafeLLamaContextHandle.cs  |   2 +-
 6 files changed, 156 insertions(+), 34 deletions(-)
 create mode 100644 LLama/Batched/ConversationExtensions.cs

diff --git a/LLama/Batched/BatchedExecutor.cs b/LLama/Batched/BatchedExecutor.cs
index 2fc3eabf..2009a687 100644
--- a/LLama/Batched/BatchedExecutor.cs
+++ b/LLama/Batched/BatchedExecutor.cs
@@ -45,9 +45,9 @@ public sealed class BatchedExecutor
     public BatchedExecutor(LLamaWeights model, IContextParams contextParams)
     {
         Model = model;
-
         Batch = new LLamaBatch();
         Context = model.CreateContext(contextParams);
+        Epoch = 1;
     }
 
     /// <summary>
diff --git a/LLama/Batched/Conversation.cs b/LLama/Batched/Conversation.cs
index 3dcdec54..a699bb66 100644
--- a/LLama/Batched/Conversation.cs
+++ b/LLama/Batched/Conversation.cs
@@ -90,39 +90,17 @@ public sealed class Conversation
         if (RequiresInference)
             throw new CannotForkWhileRequiresInference();
 
-        // Assign tokens to the new sequence
-        var id2 = Executor.GetNextSequenceId();
-        NativeApi.llama_kv_cache_seq_cp(Executor.Context.NativeHandle, ConversationId, id2, 0, _end);
-
         // Create a new conversation which references the current position in this one
-        var c = new Conversation(Executor, id2, _end)
+        var c = new Conversation(Executor, Executor.GetNextSequenceId(), _end)
         {
             _batchIndex = _batchIndex,
             _requiredEpoch = _requiredEpoch,
         };
 
-        return c;
-    }
-
-    /// <summary>
-    /// Rewind this conversation back to an earlier state
-    /// </summary>
-    /// <param name="tokens"></param>
-    /// <exception cref="ObjectDisposedException"></exception>
-    /// <exception cref="CannotForkWhileRequiresInference"></exception>
-    /// <exception cref="ArgumentOutOfRangeException">Thrown if `tokens` parameter is larger than NTokens</exception>
-    public void Rewind(int tokens)
-    {
-        AssertNotDisposed();
-
-        if (tokens > TokenCount)
-            throw new ArgumentOutOfRangeException(nameof(tokens), "Cannot rewind more than the total number of tokens");
-
-        // Remove those tokens from KV
-        Executor.Context.NativeHandle.KvCacheRemove(ConversationId, _end.Value - tokens, _end);
+        // Assign tokens to the new sequence
+        NativeApi.llama_kv_cache_seq_cp(Executor.Context.NativeHandle, ConversationId, c.ConversationId, 0, _end);
 
-        // Adjust "end" marker back
-        _end = _end.Value - tokens;
+        return c;
     }
 
     #region sample
@@ -203,4 +181,89 @@ public sealed class Conversation
         _requiredEpoch = Executor.Epoch + 1;
     }
     #endregion
+
+    #region modify
+    /// <summary>
+    /// Directly modify the KV cache of this conversation
+    /// </summary>
+    /// <param name="modifier"></param>
+    /// <exception cref="CannotModifyWhileRequiresInference">Thrown if this method is called while <see cref="Conversation.RequiresInference"/> == true</exception>
+    public void Modify(ModifyKvCache modifier)
+    {
+        AssertNotDisposed();
+
+        if (RequiresInference)
+            throw new CannotModifyWhileRequiresInference();
+
+        // do whatever the modification is
+        _end = modifier.Invoke(_end, new KvAccessor(this));
+
+        // Set the epoch down to zero, this ensures that this conversation
+        // cannot be sampled until it is prompted again.
+        _requiredEpoch = 0;
+    }
+
+    /// <summary>
+    /// Provides direct access to the KV cache of a <see cref="Conversation"/>.
+    /// See <see cref="Modify"/> for how to use this.
+    /// </summary>
+    public readonly ref struct KvAccessor
+    {
+        private readonly Conversation _conversation;
+
+        internal KvAccessor(Conversation conversation)
+        {
+            _conversation = conversation;
+        }
+
+        #region remove
+        /// <summary>
+        /// Removes all tokens that have positions in [start, end)
+        /// </summary>
+        /// <param name="start">Start position (inclusive)</param>
+        /// <param name="end">End position (exclusive)</param>
+        public void Remove(LLamaPos start, LLamaPos end)
+        {
+            _conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end);
+        }
+
+        /// <summary>
+        /// Removes all tokens starting from the given position
+        /// </summary>
+        /// <param name="start">Start position (inclusive)</param>
+        /// <param name="count">Number of tokens</param>
+        public void Remove(LLamaPos start, int count)
+        {
+            if (count <= 0)
+                return;
+
+            var end = start.Value + count;
+            _conversation.Executor.Context.NativeHandle.KvCacheRemove(_conversation.ConversationId, start, end);
+        }
+        #endregion
+
+        #region shift
+        /// <summary>
+        /// Adds relative position "delta" to all tokens that have positions in [p0, p1).
+        /// If the KV cache is RoPEd, the KV data is updated
+        /// accordingly
+        /// </summary>
+        /// <param name="start">Start position (inclusive)</param>
+        /// <param name="end">End position (exclusive)</param>
+        /// <param name="delta">Amount to add on to each token position</param>
+        public void Shift(LLamaPos start, LLamaPos end, int delta)
+        {
+            _conversation.Executor.Context.NativeHandle.KvCacheSequenceShift(_conversation.ConversationId, start, end, delta);
+        }
+        #endregion
+    }
+
+    /// <summary>
+    /// A function which can temporarily access the KV cache of a <see cref="Conversation"/> to modify it directly
+    /// </summary>
+    /// <param name="end">The current end token of this conversation</param>
+    /// <param name="kv">An <see cref="KvAccessor"/> which allows direct access to modify the KV cache</param>
+    /// <returns>The new end token position</returns>
+    public delegate LLamaPos ModifyKvCache(LLamaPos end, KvAccessor kv);
+    #endregion
 }
\ No newline at end of file
diff --git a/LLama/Batched/ConversationExtensions.cs b/LLama/Batched/ConversationExtensions.cs
new file mode 100644
index 00000000..5fca5e94
--- /dev/null
+++ b/LLama/Batched/ConversationExtensions.cs
@@ -0,0 +1,59 @@
+﻿using System;
+
+namespace LLama.Batched;
+
+/// <summary>
+/// Extension method for <see cref="Conversation"/>
+/// </summary>
+public static class ConversationExtensions
+{
+    /// <summary>
+    /// Rewind a <see cref="Conversation"/> back to an earlier state by removing tokens from the end
+    /// </summary>
+    /// <param name="conversation">The conversation to rewind</param>
+    /// <param name="tokens">The number of tokens to rewind</param>
+    /// <exception cref="ArgumentOutOfRangeException">Thrown if `tokens` parameter is larger than TokenCount</exception>
+    public static void Rewind(this Conversation conversation, int tokens)
+    {
+        if (tokens > conversation.TokenCount)
+            throw new ArgumentOutOfRangeException(nameof(tokens), "Cannot rewind more than the total number of tokens");
+
+        conversation.Modify((end, kv) =>
+        {
+            // Remove those tokens from KV
+            kv.Remove(end.Value - tokens, tokens);
+
+            // Return adjusted end position
+            return end.Value - tokens;
+        });
+    }
+
+    /// <summary>
+    /// Shift all tokens over to the left, removing "count" tokens from the start and shifting everything over.
+    /// Leaves "keep" tokens at the start completely untouched. This can be used to free up space when the context
+    /// gets full, keeping the prompt at the start intact.
+    /// </summary>
+    /// <param name="conversation">The conversation to rewind</param>
+    /// <param name="count">How much to shift tokens over by</param>
+    /// <param name="keep">The number of tokens at the start which should <b>not</b> be shifted</param>
+    public static void ShiftLeft(this Conversation conversation, int count, int keep)
+    {
+        // Given a setup like this (shift=5, keep=3):
+        //
+        // AAABBBBBCCCCCCCCC...
+        // 
+        // We want to remove all the B's, shift all the C's and leave all the A's untouched
+
+        conversation.Modify((end, kv) =>
+        {
+            // Remove the B's
+            kv.Remove(keep, count);
+
+            // Shift the C's
+            kv.Shift(keep + count, end, -count);
+
+            // Update total count
+            return end.Value - count;
+        });
+    }
+}
\ No newline at end of file
diff --git a/LLama/Batched/Exceptions.cs b/LLama/Batched/Exceptions.cs
index 7d847bfe..1feb270c 100644
--- a/LLama/Batched/Exceptions.cs
+++ b/LLama/Batched/Exceptions.cs
@@ -57,7 +57,7 @@ public class CannotSampleRequiresPromptException
 }
 
 /// <summary>
-/// This exception is thrown when "Fork()" is called on a <see cref="Conversation"/> with <see cref="Conversation.RequiresInference"/> = true
+/// This exception is thrown when <see cref="Conversation.Fork"/> is called when <see cref="Conversation.RequiresInference"/> = true
 /// </summary>
 public class CannotForkWhileRequiresInference
     : ExperimentalBatchedExecutorException
@@ -69,13 +69,13 @@ public class CannotForkWhileRequiresInference
 }
 
 /// <summary>
-/// This exception is thrown when "Rewind()" is called on a <see cref="Conversation"/> with <see cref="Conversation.RequiresInference"/> = true
+/// This exception is thrown when <see cref="Conversation.Modify"/> is called when <see cref="Conversation.RequiresInference"/> = true
 /// </summary>
-public class CannotRewindWhileRequiresInference
+public class CannotModifyWhileRequiresInference
     : ExperimentalBatchedExecutorException
 {
-    internal CannotRewindWhileRequiresInference()
-        : base("Cannot `Rewind()` a conversation while RequiresInference is true")
+    internal CannotModifyWhileRequiresInference()
+        : base("Cannot `Modify()` a conversation while RequiresInference is true")
     {
     }
 }
\ No newline at end of file
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index c953cb23..578cad40 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -388,7 +388,7 @@ namespace LLama.Native
         /// <param name="p1"></param>
         /// <param name="delta"></param>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern void llama_kv_cache_seq_shift(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, LLamaPos delta);
+        public static extern void llama_kv_cache_seq_shift(SafeLLamaContextHandle ctx, LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta);
 
         /// <summary>
         /// Integer division of the positions by factor of `d > 1`
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index 91e82c85..da53491d 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -369,7 +369,7 @@ namespace LLama.Native
         /// <param name="p0"></param>
         /// <param name="p1"></param>
         /// <param name="delta"></param>
-        public void KvCacheSequenceShift(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, LLamaPos delta)
+        public void KvCacheSequenceShift(LLamaSeqId seq, LLamaPos p0, LLamaPos p1, int delta)
         {
             NativeApi.llama_kv_cache_seq_shift(this, seq, p0, p1, delta);
         }

From 0c2cff0e1c5a0f2fe93d2e7e4f31054629f9eaf9 Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Mon, 12 Feb 2024 02:58:35 +0000
Subject: [PATCH 3/5] Added a Finalizer for `Conversation` in case it is not
 correctly disposed.

---
 LLama/Batched/Conversation.cs | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/LLama/Batched/Conversation.cs b/LLama/Batched/Conversation.cs
index a699bb66..b3ce97dd 100644
--- a/LLama/Batched/Conversation.cs
+++ b/LLama/Batched/Conversation.cs
@@ -54,6 +54,11 @@ public sealed class Conversation
         _end = end;
     }
 
+    ~Conversation()
+    {
+        Dispose();
+    }
+
     /// <summary>
     /// End this conversation, freeing all resources used by it
     /// </summary>
@@ -66,6 +71,9 @@ public sealed class Conversation
 
         // Remove this conversation from the KV cache
         Executor.Context.NativeHandle.KvCacheRemove(ConversationId, 0, _end);
+
+        // Prevent finalizer from running
+        GC.SuppressFinalize(this);
     }
 
     private void AssertNotDisposed()

From 1cc463b9b748abc7e4922e614bace3b2da079c8c Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Mon, 12 Feb 2024 15:34:52 +0000
Subject: [PATCH 4/5] Added a finalizer to `BatchedExecutor`

---
 LLama/Batched/BatchedExecutor.cs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/LLama/Batched/BatchedExecutor.cs b/LLama/Batched/BatchedExecutor.cs
index 2009a687..f432a89f 100644
--- a/LLama/Batched/BatchedExecutor.cs
+++ b/LLama/Batched/BatchedExecutor.cs
@@ -15,6 +15,11 @@ public sealed class BatchedExecutor
     private int _nextSequenceId;
 
     internal LLamaBatch Batch { get; }
+
+    /// <summary>
+    /// Epoch is incremented every time Infer is called. Conversations can use this to keep track of
+    /// whether they're waiting for inference, or can be sampled.
+    /// </summary>
     internal ulong Epoch { get; private set; }
 
     /// <summary>
@@ -50,6 +55,11 @@ public sealed class BatchedExecutor
         Epoch = 1;
     }
 
+    ~BatchedExecutor()
+    {
+        Dispose();
+    }
+
     /// <summary>
     /// Start a new <see cref="Conversation"/> with the given prompt
     /// </summary>
@@ -97,6 +107,8 @@ public sealed class BatchedExecutor
             return;
         IsDisposed = true;
 
+        GC.SuppressFinalize(this);
+
         Context.Dispose();
     }
 

From e9d9042576a6a925dff29bac4382cc7e9fd09dca Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Mon, 12 Feb 2024 15:54:13 +0000
Subject: [PATCH 5/5] Added `Divide` to `KvAccessor`

---
 LLama/Batched/Conversation.cs          | 17 +++++++++++++++++
 LLama/Native/SafeLLamaContextHandle.cs |  6 +++---
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/LLama/Batched/Conversation.cs b/LLama/Batched/Conversation.cs
index b3ce97dd..6cf6e312 100644
--- a/LLama/Batched/Conversation.cs
+++ b/LLama/Batched/Conversation.cs
@@ -264,6 +264,23 @@ public sealed class Conversation
             _conversation.Executor.Context.NativeHandle.KvCacheSequenceShift(_conversation.ConversationId, start, end, delta);
         }
         #endregion
+
+        #region divide
+        /// <summary>
+        /// Integer division of the positions by factor of `d > 1`.
+        /// If the KV cache is RoPEd, the KV data is updated accordingly.
+        /// </summary>
+        /// <param name="start">Start position (inclusive). If less than zero, it is clamped to zero.</param>
+        /// <param name="end">End position (exclusive). If less than zero, it is treated as "infinity".</param>
+        /// <param name="divisor">Amount to divide each position by.</param>
+        public void Divide(LLamaPos start, LLamaPos end, int divisor)
+        {
+            if (divisor <= 0)
+                throw new ArgumentOutOfRangeException(nameof(divisor));
+
+            _conversation.Executor.Context.NativeHandle.KvCacheSequenceDivide(_conversation.ConversationId, start, end, divisor);
+        }
+        #endregion
     }
 
     /// <summary>
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index da53491d..2d9387ae 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -375,9 +375,9 @@ namespace LLama.Native
         }
 
         /// <summary>
-        /// Integer division of the positions by factor of `d > 1`
-        /// If the KV cache is RoPEd, the KV data is updated accordingly
-        /// p0 &lt; 0 : [0,  p1]
+        /// Integer division of the positions by factor of `d > 1`.
+        /// If the KV cache is RoPEd, the KV data is updated accordingly.<br />
+        /// p0 &lt; 0 : [0,  p1]<br />
         /// p1 &lt; 0 : [p0, inf)
         /// </summary>
         /// <param name="seq"></param>