diff --git a/docs/Examples/BatchedExecutorFork.md b/docs/Examples/BatchedExecutorFork.md
index ad391dd1..8ec4887b 100644
--- a/docs/Examples/BatchedExecutorFork.md
+++ b/docs/Examples/BatchedExecutorFork.md
@@ -1,148 +1,48 @@
-# Bacthed executor - multi-output to one input
-
-```cs
-using LLama.Batched;
-using LLama.Common;
-using LLama.Native;
-using LLama.Sampling;
-using Spectre.Console;
-
-namespace LLama.Examples.Examples;
-
-///
-/// This demonstrates generating multiple replies to the same prompt, with a shared cache
-///
-public class BatchedExecutorFork
-{
- private const int n_split = 16;
- private const int n_len = 72;
-
- public static async Task Run()
- {
- string modelPath = UserSettings.GetModelPath();
-
- var parameters = new ModelParams(modelPath);
- using var model = LLamaWeights.LoadFromFile(parameters);
-
- var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");
-
- // Create an executor that can evaluate a batch of conversations together
- using var executor = new BatchedExecutor(model, parameters);
-
- // Print some info
- var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
- Console.WriteLine($"Created executor with model: {name}");
-
- // Evaluate the initial prompt to create one conversation
- using var start = executor.Create();
- start.Prompt(prompt);
- await executor.Infer();
-
- // Create the root node of the tree
- var root = new Node(start);
-
- await AnsiConsole
- .Progress()
- .StartAsync(async progress =>
- {
- var reporter = progress.AddTask("Running Inference (1)", maxValue: n_len);
-
- // Run inference loop
- for (var i = 0; i < n_len; i++)
- {
- if (i != 0)
- await executor.Infer();
-
- // Occasionally fork all the active conversations
- if (i != 0 && i % n_split == 0)
- root.Split();
-
- // Sample all active conversations
- root.Sample();
-
- // Update progress bar
- reporter.Increment(1);
- reporter.Description($"Running Inference ({root.ActiveConversationCount})");
- }
-
- // Display results
- var display = new Tree(prompt);
- root.Display(display);
- AnsiConsole.Write(display);
- });
- }
-
- private class Node
- {
- private readonly StreamingTokenDecoder _decoder;
-
- private readonly DefaultSamplingPipeline _sampler;
- private Conversation? _conversation;
-
- private Node? _left;
- private Node? _right;
-
- public int ActiveConversationCount => _conversation != null ? 1 : _left!.ActiveConversationCount + _right!.ActiveConversationCount;
-
- public Node(Conversation conversation)
- {
- _sampler = new DefaultSamplingPipeline();
- _conversation = conversation;
- _decoder = new StreamingTokenDecoder(conversation.Executor.Context);
- }
-
- public void Sample()
- {
- if (_conversation == null)
- {
- _left?.Sample();
- _right?.Sample();
- return;
- }
-
- if (_conversation.RequiresInference)
- return;
-
- // Sample one token
- var ctx = _conversation.Executor.Context.NativeHandle;
- var token = _sampler.Sample(ctx, _conversation.Sample(), Array.Empty());
- _sampler.Accept(ctx, token);
- _decoder.Add(token);
-
- // Prompt the conversation with this token, to continue generating from there
- _conversation.Prompt(token);
- }
-
- public void Split()
- {
- if (_conversation != null)
- {
- _left = new Node(_conversation.Fork());
- _right = new Node(_conversation.Fork());
-
- _conversation.Dispose();
- _conversation = null;
- }
- else
- {
- _left?.Split();
- _right?.Split();
- }
- }
-
- public void Display(T tree, int depth = 0)
- where T : IHasTreeNodes
- {
- var colors = new[] { "red", "green", "blue", "yellow", "white" };
- var color = colors[depth % colors.Length];
-
- var message = Markup.Escape(_decoder.Read().ReplaceLineEndings(""));
-
- var n = tree.AddNode($"[{color}]{message}[/]");
-
- _left?.Display(n, depth + 1);
- _right?.Display(n, depth + 1);
- }
- }
-}
-```
\ No newline at end of file
+# BatchedExecutor Fork - Generate Multiple Completions With Shared Memory
+
+This example demonstrates using the `BatchedExecutor` to split one sequence into multiple sequences. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorFork.cs).
+
+Sequences share memory up to the point they were split, meaning no extra memory is consumed by creating a fork. Inference runs for all sequences simultaneously, which means that running two sequences does _not_ take twice as much time as running one.
+
+An example output, starting with the prompt `Not many people know that`:
+
+```
+Not many people know that
+└── , in the 17th century, a military band led by Captain Charles
+ ├── Bossler of Baden, Germany, composed and played a music suite titled
+ │ ├── the "Civil Psalm," in order to rally German Protestants during
+ │ │ ├── the Thirty Years' War. This tune became popular among German soldiers,
+ │ │ │ ├── and its popularity continued long after the war
+ │ │ │ └── and, eventually, reached France. The
+ │ │ └── the Thirty Years' War.This music, with its clear call
+ │ │ ├── to arms and strong Christian themes, helped
+ │ │ └── to arms and unwavering belief
+ │ └── "Baden's First National Symphony," with lyrics by a young Wol
+ │ ├── fgang Amadeus Mozart. The story of the composition's creation
+ │ │ ├── has long been forgotten. But the B
+ │ │ └── was popularized by a novelty book
+ │ └── fgang Amadeus Mozart. It's said that this music brought
+ │ ├── peace to Europe, at least for a
+ │ └── the troops together during difficult times. It
+ └── Newdick played a mournful dirge to accompany the procession of
+ ├── the head of King Charles I. It is the scene that opens my latest book
+ │ ├── , "Death and Taxes." The book follows a British army captain named
+ │ │ ├── Marcus as he seeks revenge for his wife
+ │ │ └── William Darnay who becomes involved in
+ │ └── , A King, A Pawn and a Prince. The murder of the king
+ │ ├── and the civil war that followed are the
+ │ └── is a watershed moment in the political
+ └── the coffin of William Shakespeare, as it was carried to its final resting place
+ ├── . That is the least that can be said for a man who is often regarded
+ │ ├── as the greatest writer in the English language
+ │ └── as the greatest writer the English language has
+ └── at Stratford-upon-Avon. Shakespeare, of course
+ ├── , was a famous English poet and play
+ └── , was one of the greatest playwright
+```
+
+Forked sequences can be used for many possible things. For example:
+ - Evaluating the system prompt once and forking for each independent conversation.
+ - Saving a "checkpoint" in a conversation to return to later.
+ - Beam Search.
+ - Splitting a conversation, generating completions from several different "agents", and taking the best response.
\ No newline at end of file
diff --git a/docs/Examples/BatchedExecutorGuidance.md b/docs/Examples/BatchedExecutorGuidance.md
index 94d0ef86..99912ae4 100644
--- a/docs/Examples/BatchedExecutorGuidance.md
+++ b/docs/Examples/BatchedExecutorGuidance.md
@@ -1,130 +1,7 @@
-# Batched executor - basic guidance
+# BatchedExecutor Guidance - Classifier Free Guidance / Negative Prompting
-```cs
-using LLama.Batched;
-using LLama.Common;
-using LLama.Native;
-using LLama.Sampling;
-using Spectre.Console;
+This example demonstrates using `Classifier Free Guidance` (a.k.a. negative prompting) with a custom sampling pipeline. Negative prompting is a way of steering the model output away from certain topics. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorGuidance.cs).
-namespace LLama.Examples.Examples;
+Two conversations are created. The `guided` conversation starts with the prompt that should be completed and shown as the output, for example `"My favourite colour is"`. The `guidance` conversation contains the negative prompt at the start, for example `"I hate the colour red. My favourite colour is"`. Note that this is a _negative_ prompt, so this guidance will make the model answer as if it _likes_ the colour red.
-///
-/// This demonstrates using a batch to generate two sequences and then using one
-/// sequence as the negative guidance ("classifier free guidance") for the other.
-///
-public class BatchedExecutorGuidance
-{
- private const int n_len = 32;
-
- public static async Task Run()
- {
- string modelPath = UserSettings.GetModelPath();
-
- var parameters = new ModelParams(modelPath);
- using var model = LLamaWeights.LoadFromFile(parameters);
-
- var positivePrompt = AnsiConsole.Ask("Positive Prompt (or ENTER for default):", "My favourite colour is").Trim();
- var negativePrompt = AnsiConsole.Ask("Negative Prompt (or ENTER for default):", "I hate the colour red. My favourite colour is").Trim();
- var weight = AnsiConsole.Ask("Guidance Weight (or ENTER for default):", 2.0f);
-
- // Create an executor that can evaluate a batch of conversations together
- using var executor = new BatchedExecutor(model, parameters);
-
- // Print some info
- var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
- Console.WriteLine($"Created executor with model: {name}");
-
- // Load the two prompts into two conversations
- using var guided = executor.Create();
- guided.Prompt(positivePrompt);
- using var guidance = executor.Create();
- guidance.Prompt(negativePrompt);
-
- // Run inference to evaluate prompts
- await AnsiConsole
- .Status()
- .Spinner(Spinner.Known.Line)
- .StartAsync("Evaluating Prompts...", _ => executor.Infer());
-
- // Fork the "guided" conversation. We'll run this one without guidance for comparison
- using var unguided = guided.Fork();
-
- // Run inference loop
- var unguidedSampler = new GuidedSampler(null, weight);
- var unguidedDecoder = new StreamingTokenDecoder(executor.Context);
- var guidedSampler = new GuidedSampler(guidance, weight);
- var guidedDecoder = new StreamingTokenDecoder(executor.Context);
- await AnsiConsole
- .Progress()
- .StartAsync(async progress =>
- {
- var reporter = progress.AddTask("Running Inference", maxValue: n_len);
-
- for (var i = 0; i < n_len; i++)
- {
- if (i != 0)
- await executor.Infer();
-
- // Sample from the "unguided" conversation. This is just a conversation using the same prompt, without any
- // guidance. This serves as a comparison to show the effect of guidance.
- var u = unguidedSampler.Sample(executor.Context.NativeHandle, unguided.Sample(), Array.Empty());
- unguidedDecoder.Add(u);
- unguided.Prompt(u);
-
- // Sample from the "guided" conversation. This sampler will internally use the "guidance" conversation
- // to steer the conversation. See how this is done in GuidedSampler.ProcessLogits (bottom of this file).
- var g = guidedSampler.Sample(executor.Context.NativeHandle, guided.Sample(), Array.Empty());
- guidedDecoder.Add(g);
-
- // Use this token to advance both guided _and_ guidance. Keeping them in sync (except for the initial prompt).
- guided.Prompt(g);
- guidance.Prompt(g);
-
- // Early exit if we reach the natural end of the guided sentence
- if (g == model.EndOfSentenceToken)
- break;
-
- // Update progress bar
- reporter.Increment(1);
- }
- });
-
- AnsiConsole.MarkupLine($"[green]Unguided:[/][white]{unguidedDecoder.Read().ReplaceLineEndings(" ")}[/]");
- AnsiConsole.MarkupLine($"[green]Guided:[/][white]{guidedDecoder.Read().ReplaceLineEndings(" ")}[/]");
- }
-
- private class GuidedSampler(Conversation? guidance, float weight)
- : BaseSamplingPipeline
- {
- public override void Accept(SafeLLamaContextHandle ctx, LLamaToken token)
- {
- }
-
- public override ISamplingPipeline Clone()
- {
- throw new NotSupportedException();
- }
-
- protected override void ProcessLogits(SafeLLamaContextHandle ctx, Span logits, ReadOnlySpan lastTokens)
- {
- if (guidance == null)
- return;
-
- // Get the logits generated by the guidance sequences
- var guidanceLogits = guidance.Sample();
-
- // Use those logits to guide this sequence
- NativeApi.llama_sample_apply_guidance(ctx, logits, guidanceLogits, weight);
- }
-
- protected override LLamaToken ProcessTokenDataArray(SafeLLamaContextHandle ctx, LLamaTokenDataArray candidates, ReadOnlySpan lastTokens)
- {
- candidates.Temperature(ctx, 0.8f);
- candidates.TopK(ctx, 25);
-
- return candidates.SampleToken(ctx);
- }
- }
-}
-```
\ No newline at end of file
+A custom sampler samples the `guidance` conversation and uses that output to influence the output of the `guided` conversation. Once a token is selected, _both_ conversations are continued with this token.
\ No newline at end of file
diff --git a/docs/Examples/BatchedExecutorRewind.md b/docs/Examples/BatchedExecutorRewind.md
index 06287b7c..78c480c7 100644
--- a/docs/Examples/BatchedExecutorRewind.md
+++ b/docs/Examples/BatchedExecutorRewind.md
@@ -1,121 +1,5 @@
-# Batched executor - rewinding to an earlier state
+# BatchedExecutor - Rewind
-```cs
-using LLama.Batched;
-using LLama.Common;
-using LLama.Native;
-using LLama.Sampling;
-using Spectre.Console;
+This example demonstrates using the `BatchedExecutor` to rewind a conversation to an earlier state. See the source code [here](https://github.com/SciSharp/LLamaSharp/blob/master/LLama.Examples/Examples/BatchedExecutorRewind.cs).
-namespace LLama.Examples.Examples;
-
-///
-/// This demonstrates generating tokens and then rewinding to an earlier state
-///
-public class BatchedExecutorRewind
-{
- private const int n_generate = 24;
- private const int n_rewind = 12;
- private const int n_repeats = 6;
-
- public static async Task Run()
- {
- string modelPath = UserSettings.GetModelPath();
-
- var parameters = new ModelParams(modelPath);
- using var model = LLamaWeights.LoadFromFile(parameters);
-
- var prompt = AnsiConsole.Ask("Prompt (or ENTER for default):", "Not many people know that");
-
- // Create an executor that can evaluate a batch of conversations together
- using var executor = new BatchedExecutor(model, parameters);
-
- // Print some info
- var name = executor.Model.Metadata.GetValueOrDefault("general.name", "unknown model name");
- Console.WriteLine($"Created executor with model: {name}");
-
- // Evaluate the initial prompt to create one conversation
- using var conversation = executor.Create();
- conversation.Prompt(prompt);
-
- // Create the start node wrapping the conversation
- var node = new Node(executor.Context);
-
- // Print the prompt
- Console.ForegroundColor = ConsoleColor.Green;
- Console.WriteLine(prompt);
-
- for (var i = 0; i < n_repeats; i++)
- {
- for (var j = 0; j < n_generate; j++)
- {
- // Run inference
- await executor.Infer();
-
- // Sample a token
- var token = node.Sample(conversation);
-
- // Continue conversation with this token
- if (j != n_generate - 1)
- conversation.Prompt(token);
- }
-
- // Write out what we generated
- node.Write(n_rewind, i + 1);
-
- // Rewind back a few tokens
- conversation.Rewind(n_rewind + 1);
-
- // Prompt with a token
- conversation.Prompt(node.GetToken(n_generate - n_rewind - 1));
-
- // Create a new node around the rewound conversation
- node = new Node(executor.Context);
- }
-
- Console.WriteLine("Press any key to exit demo");
- Console.ReadKey(true);
- }
-
- private class Node
- {
- private readonly LLamaContext _context;
-
- private readonly List _tokens = new List();
- private readonly DefaultSamplingPipeline Sampler;
-
- public Node(LLamaContext context)
- {
- _context = context;
- Sampler = new DefaultSamplingPipeline();
- }
-
- public LLamaToken Sample(Conversation conversation)
- {
- var token = Sampler.Sample(_context.NativeHandle, conversation.Sample(), Array.Empty());
- _tokens.Add(token);
- return token;
- }
-
- public void Write(int n_rewind, int depth)
- {
- var decoder = new StreamingTokenDecoder(_context);
-
- for (var i = 0; i < _tokens.Count - n_rewind; i++)
- decoder.Add(_tokens[i]);
-
- AnsiConsole.MarkupLine($"[green]{new string(' ', depth * 3) + decoder.Read().ReplaceLineEndings(" ")}[/]");
-
- for (var i = _tokens.Count - n_rewind; i < _tokens.Count; i++)
- decoder.Add(_tokens[i]);
-
- AnsiConsole.MarkupLine($"[maroon]{decoder.Read().ReplaceLineEndings(" ")}[/]");
- }
-
- public LLamaToken GetToken(int index)
- {
- return _tokens[index];
- }
- }
-}
-```
\ No newline at end of file
+A single conversation is prompted and then continued for 24 tokens; after that, it is rewound by 12 tokens and continued from there. Rewinding simply sets the conversation back to an earlier state and requires no extra computation.
\ No newline at end of file