From 6a7e74e71b15ce585aea06a35674da01ff81d84c Mon Sep 17 00:00:00 2001
From: Yaohui Liu <AsakusaRinne@gmail.com>
Date: Sat, 4 Nov 2023 22:38:06 +0800
Subject: [PATCH 01/12] build: add package for kernel-memory integration.

---
 .../LLamaSharp.KernelMemory.csproj            | 23 ++++++++++++++++++-
 .../LLamaSharp.SemanticKernel.csproj          |  6 ++---
 README.md                                     |  9 +++++++-
 3 files changed, 33 insertions(+), 5 deletions(-)
diff --git a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
index 54766b02..de5f42a5 100644
--- a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
+++ b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
@@ -1,9 +1,30 @@
 <Project Sdk="Microsoft.NET.Sdk">
 
   <PropertyGroup>
-    <TargetFramework>net6.0</TargetFramework>
+    <TargetFrameworks>netstandard2.0;net6.0;net7.0</TargetFrameworks>
     <ImplicitUsings>enable</ImplicitUsings>
     <Nullable>enable</Nullable>
+
+    <Version>0.7.1</Version>
+    <Authors>Xbotter</Authors>
+    <Company>SciSharp STACK</Company>
+    <GeneratePackageOnBuild>true</GeneratePackageOnBuild>
+    <Copyright>MIT, SciSharp STACK $([System.DateTime]::UtcNow.ToString(yyyy))</Copyright>
+    <RepositoryUrl>https://github.com/SciSharp/LLamaSharp</RepositoryUrl>
+    <RepositoryType>git</RepositoryType>
+    <PackageIconUrl>https://avatars3.githubusercontent.com/u/44989469?s=200&amp;v=4</PackageIconUrl>
+    <PackageTags>LLama, LLM, GPT, ChatGPT, kernel-memory, vector search, SciSharp</PackageTags>
+    <Description>
+      The integration of LLamaSharp and Microsoft kernel-memory. It makes it easy to add document search to LLamaSharp model inference.
+    </Description>
+    <PackageReleaseNotes>
+      Support integration with kernel-memory
+    </PackageReleaseNotes>
+    <PackageLicenseExpression>MIT</PackageLicenseExpression>
+    <PackageOutputPath>packages</PackageOutputPath>
+    <Platforms>AnyCPU;x64;Arm64</Platforms>
+    <PackageId>LLamaSharp.kernel-memory</PackageId>
+    <Configurations>Debug;Release;GPU</Configurations>
   </PropertyGroup>
 
   <ItemGroup>
diff --git a/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj b/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj
index 77596d57..c6ece4e7 100644
--- a/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj
+++ b/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj
@@ -10,8 +10,8 @@
 		<ImplicitUsings>enable</ImplicitUsings>
 		<Nullable>enable</Nullable>
 
-    <Version>0.6.2-beta1</Version>
-    <Authors>Tim Miller</Authors>
+    <Version>0.7.1</Version>
+    <Authors>Tim Miller, Xbotter</Authors>
     <Company>SciSharp STACK</Company>
     <GeneratePackageOnBuild>true</GeneratePackageOnBuild>
     <Copyright>MIT, SciSharp STACK $([System.DateTime]::UtcNow.ToString(yyyy))</Copyright>
@@ -20,7 +20,7 @@
     <PackageIconUrl>https://avatars3.githubusercontent.com/u/44989469?s=200&amp;v=4</PackageIconUrl>
     <PackageTags>LLama, LLM, GPT, ChatGPT, semantic-kernel, SciSharp</PackageTags>
     <Description>
-      The integration of LLamaSharp ans semantic-kernel.
+      The integration of LLamaSharp and Microsoft semantic-kernel.
     </Description>
     <PackageReleaseNotes>
       Support integration with semantic-kernel
diff --git a/README.md b/README.md
index c3b73f8c..d116d1a4 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,13 @@ For [microsoft semantic-kernel](https://github.com/microsoft/semantic-kernel) in
 LLamaSharp.semantic-kernel
 ```
 
+For [microsoft kernel-memory](https://github.com/microsoft/kernel-memory) integration, please search and install the following package:
+
+```
+LLamaSharp.kernel-memory
+```
+
+
 ### Tips for choosing a version
 
 In general, there may be some break changes between two minor releases, for example 0.5.1 and 0.6.0. On the contrary, we don't introduce API break changes in patch release. Therefore it's recommended to keep the highest patch version of a minor release. For example, keep 0.5.6 instead of 0.5.3.
@@ -196,7 +203,7 @@ Another choice is generate gguf format file yourself with a pytorch weight (or a
 
 🔳 Fine-tune
 
-⚠️ Local document search (enabled by kernel-memory now)
+✅ Local document search (enabled by kernel-memory now)
 
 🔳 MAUI Integration
 

From 0f12566f654f430f50480128ae65e5f588f6dc45 Mon Sep 17 00:00:00 2001
From: Yaohui Liu <AsakusaRinne@gmail.com>
Date: Sun, 5 Nov 2023 02:55:41 +0800
Subject: [PATCH 02/12] build: use only net6.0 with kernel-memory.

---
 LLama.KernelMemory/LLamaSharp.KernelMemory.csproj | 2 +-
 README.md                                         | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
index de5f42a5..7fd99e2c 100644
--- a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
+++ b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
@@ -1,7 +1,7 @@
 <Project Sdk="Microsoft.NET.Sdk">
 
   <PropertyGroup>
-    <TargetFrameworks>netstandard2.0;net6.0;net7.0</TargetFrameworks>
+    <TargetFramework>net6.0</TargetFramework>
     <ImplicitUsings>enable</ImplicitUsings>
     <Nullable>enable</Nullable>
 
diff --git a/README.md b/README.md
index d116d1a4..96c9883a 100644
--- a/README.md
+++ b/README.md
@@ -54,13 +54,12 @@ For [microsoft semantic-kernel](https://github.com/microsoft/semantic-kernel) in
 LLamaSharp.semantic-kernel
 ```
 
-For [microsoft kernel-memory](https://github.com/microsoft/kernel-memory) integration, please search and install the following package:
+For [microsoft kernel-memory](https://github.com/microsoft/kernel-memory) integration, please search and install the following package (currently kernel-memory only supports net6.0):
 
 ```
 LLamaSharp.kernel-memory
 ```
 
-
 ### Tips for choosing a version
 
 In general, there may be some break changes between two minor releases, for example 0.5.1 and 0.6.0. On the contrary, we don't introduce API break changes in patch release. Therefore it's recommended to keep the highest patch version of a minor release. For example, keep 0.5.6 instead of 0.5.3.

From 457958435b07603e9f41dad62af2ca60e621bae1 Mon Sep 17 00:00:00 2001
From: Yaohui Liu <AsakusaRinne@gmail.com>
Date: Sun, 5 Nov 2023 02:59:41 +0800
Subject: [PATCH 03/12] build: use semantic-kernel beta1 in examples.

---
 LLama.Examples/LLama.Examples.csproj | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LLama.Examples/LLama.Examples.csproj b/LLama.Examples/LLama.Examples.csproj
index 3ecacdfe..c1761829 100644
--- a/LLama.Examples/LLama.Examples.csproj
+++ b/LLama.Examples/LLama.Examples.csproj
@@ -29,7 +29,7 @@
 
   <ItemGroup>
     <PackageReference Include="Microsoft.Extensions.Logging.Console" Version="7.0.0" />
-    <PackageReference Include="Microsoft.SemanticKernel" Version="1.0.0-beta4" />
+    <PackageReference Include="Microsoft.SemanticKernel" Version="1.0.0-beta1" />
   </ItemGroup>
 
   <ItemGroup>

From 04ee64a6be28d9cace256bcc7def02890d2bfe38 Mon Sep 17 00:00:00 2001
From: Martin Evans <martindevans@gmail.com>
Date: Mon, 6 Nov 2023 21:59:18 +0000
Subject: [PATCH 04/12] Exposed YaRN scaling parameters in IContextParams

---
 LLama.Web/Common/ModelOptions.cs             | 19 ++++++++++++
 LLama/Abstractions/IContextParams.cs         | 31 ++++++++++++++++++++
 LLama/Common/ModelParams.cs                  | 28 +++++++++++++++---
 LLama/Extensions/IContextParamsExtensions.cs |  9 ++++++
 LLama/Native/LLamaContextParams.cs           |  4 +--
 LLama/Native/RopeScalingType.cs              | 17 +++++++++++
 6 files changed, 102 insertions(+), 6 deletions(-)
 create mode 100644 LLama/Native/RopeScalingType.cs

diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs
index 6a63ccc3..182ace00 100644
--- a/LLama.Web/Common/ModelOptions.cs
+++ b/LLama.Web/Common/ModelOptions.cs
@@ -1,5 +1,6 @@
 ﻿using System.Text;
 using LLama.Abstractions;
+using LLama.Native;
 
 namespace LLama.Web.Common
 {
@@ -118,6 +119,24 @@ namespace LLama.Web.Common
         /// </summary>
         public float? RopeFrequencyScale { get; set; }
 
+        /// <inheritdoc />
+        public float? YarnExtrapolationFactor { get; set; }
+
+        /// <inheritdoc />
+        public float? YarnAttentionFactor { get; set; }
+
+        /// <inheritdoc />
+        public float? YarnBetaFast { get; set; }
+
+        /// <inheritdoc />
+        public float? YarnBetaSlow { get; set; }
+
+        /// <inheritdoc />
+        public uint? YarnOriginalContext { get; set; }
+
+        /// <inheritdoc />
+        public RopeScalingType? YarnScalingType { get; set; }
+
         /// <summary>
         /// Use experimental mul_mat_q kernels
         /// </summary>
diff --git a/LLama/Abstractions/IContextParams.cs b/LLama/Abstractions/IContextParams.cs
index 8ff6d7cc..0f129217 100644
--- a/LLama/Abstractions/IContextParams.cs
+++ b/LLama/Abstractions/IContextParams.cs
@@ -1,4 +1,5 @@
 ﻿using System.Text;
+using LLama.Native;
 
 namespace LLama.Abstractions;
 
@@ -67,4 +68,34 @@ public interface IContextParams
     /// Number of threads to use for batch processing (null = autodetect) (n_threads)
     /// </summary>
     uint? BatchThreads { get; set; }
+
+    /// <summary>
+    /// YaRN extrapolation mix factor
+    /// </summary>
+    float? YarnExtrapolationFactor { get; set; }
+
+    /// <summary>
+    /// YaRN magnitude scaling factor
+    /// </summary>
+    float? YarnAttentionFactor { get; set; }
+
+    /// <summary>
+    /// YaRN low correction dim
+    /// </summary>
+    float? YarnBetaFast { get; set; }
+
+    /// <summary>
+    /// YaRN high correction dim
+    /// </summary>
+    float? YarnBetaSlow { get; set; }
+
+    /// <summary>
+    /// YaRN original context length
+    /// </summary>
+    uint? YarnOriginalContext { get; set; }
+
+    /// <summary>
+    /// YaRN scaling method to use.
+    /// </summary>
+    RopeScalingType? YarnScalingType { get; set; }
 }
\ No newline at end of file
diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs
index ee5bd3e4..dd4584e3 100644
--- a/LLama/Common/ModelParams.cs
+++ b/LLama/Common/ModelParams.cs
@@ -3,6 +3,7 @@ using System;
 using System.Text;
 using System.Text.Json;
 using System.Text.Json.Serialization;
+using LLama.Native;
 
 namespace LLama.Common
 {
@@ -70,6 +71,7 @@ namespace LLama.Common
         /// </summary>
         public uint? BatchThreads { get; set; }
 
+
         /// <summary>
         /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
         /// </summary>
@@ -98,10 +100,28 @@ namespace LLama.Common
 		/// </summary>
 		public float? RopeFrequencyScale { get; set; }
 
-		/// <summary>
-		/// Use experimental mul_mat_q kernels
-		/// </summary>
-		public bool MulMatQ { get; set; }
+        /// <inheritdoc />
+        public float? YarnExtrapolationFactor { get; set; }
+
+        /// <inheritdoc />
+        public float? YarnAttentionFactor { get; set; }
+
+        /// <inheritdoc />
+        public float? YarnBetaFast { get; set; }
+
+        /// <inheritdoc />
+        public float? YarnBetaSlow { get; set; }
+
+        /// <inheritdoc />
+        public uint? YarnOriginalContext { get; set; }
+
+        /// <inheritdoc />
+        public RopeScalingType? YarnScalingType { get; set; }
+
+        /// <summary>
+        /// Use experimental mul_mat_q kernels
+        /// </summary>
+        public bool MulMatQ { get; set; }
 
         /// <summary>
         /// Load vocab only (no weights)
diff --git a/LLama/Extensions/IContextParamsExtensions.cs b/LLama/Extensions/IContextParamsExtensions.cs
index fcc9d372..16716b53 100644
--- a/LLama/Extensions/IContextParamsExtensions.cs
+++ b/LLama/Extensions/IContextParamsExtensions.cs
@@ -29,6 +29,15 @@ namespace LLama.Extensions
             result.embedding = @params.EmbeddingMode;
             result.rope_freq_base = @params.RopeFrequencyBase ?? 0;
             result.rope_freq_scale = @params.RopeFrequencyScale ?? 0;
+
+            // Default YaRN values copied from here: https://github.com/ggerganov/llama.cpp/blob/381efbf480959bb6d1e247a8b0c2328f22e350f8/common/common.h#L67
+            result.yarn_ext_factor = @params.YarnExtrapolationFactor ?? -1f;
+            result.yarn_attn_factor = @params.YarnAttentionFactor ?? 1f;
+            result.yarn_beta_fast = @params.YarnBetaFast ?? 32f;
+            result.yarn_beta_slow = @params.YarnBetaSlow ?? 1f;
+            result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0;
+            result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.LLAMA_ROPE_SCALING_UNSPECIFIED;
+
             result.mul_mat_q = @params.MulMatQ;
 
             result.n_threads = Threads(@params.Threads);
diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs
index 0a397a3d..f1ba569d 100644
--- a/LLama/Native/LLamaContextParams.cs
+++ b/LLama/Native/LLamaContextParams.cs
@@ -44,13 +44,13 @@ namespace LLama.Native
         /// <summary>
         /// RoPE scaling type, from `enum llama_rope_scaling_type` 
         /// </summary>
-        public sbyte   rope_scaling_type;        
+        public RopeScalingType rope_scaling_type;        
         
 
         /// <summary>
         /// RoPE base frequency, 0 = from model
         /// </summary>
-        public float    rope_freq_base;   
+        public float    rope_freq_base;
         /// <summary>
         /// RoPE frequency scaling factor, 0 = from model
         /// </summary>
diff --git a/LLama/Native/RopeScalingType.cs b/LLama/Native/RopeScalingType.cs
new file mode 100644
index 00000000..435932e8
--- /dev/null
+++ b/LLama/Native/RopeScalingType.cs
@@ -0,0 +1,17 @@
+﻿namespace LLama.Native
+{
+    /// <summary>RoPE scaling type. C# equivalent of llama_rope_scaling_type.</summary>
+    /// <remarks>Backed by <see cref="sbyte"/>. Member names and values are assumed to mirror the native enum; verify against llama.h when updating.</remarks>
+    public enum RopeScalingType
+        : sbyte
+    {
+        /// <summary>No scaling type has been specified (-1).</summary>
+        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
+        /// <summary>No RoPE scaling (0), going by the native constant's name.</summary>
+        LLAMA_ROPE_SCALING_NONE = 0,
+        /// <summary>Linear RoPE scaling (1), going by the native constant's name.</summary>
+        LLAMA_ROPE_SCALING_LINEAR = 1,
+        /// <summary>YaRN RoPE scaling (2), going by the native constant's name.</summary>
+        LLAMA_ROPE_SCALING_YARN = 2,
+    }
+}

From a288e7c02bf0cc7b74aaca989e6dbd3913040db5 Mon Sep 17 00:00:00 2001
From: Philipp Bauer <hello@philippbauer.org>
Date: Mon, 6 Nov 2023 18:20:07 -0600
Subject: [PATCH 05/12] Prevent duplication of user prompts / chat history in
 ChatSession.

The way ChatSession.ChatAsync was using the provided methods
from an IHistoryTransform interface implementation created unexpected
duplication of the chat history messages. It also prevented loading
previous history into the current session.
---
 LLama/ChatSession.cs | 50 ++++++++++++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 13 deletions(-)

diff --git a/LLama/ChatSession.cs b/LLama/ChatSession.cs
index 457e7e48..358d70c3 100644
--- a/LLama/ChatSession.cs
+++ b/LLama/ChatSession.cs
@@ -95,11 +95,11 @@ namespace LLama
                 Directory.CreateDirectory(path);
             }
             _executor.Context.SaveState(Path.Combine(path, _modelStateFilename));
-            if(Executor is StatelessExecutor)
+            if (Executor is StatelessExecutor)
             {
 
             }
-            else if(Executor is StatefulExecutorBase statefulExecutor)
+            else if (Executor is StatefulExecutorBase statefulExecutor)
             {
                 statefulExecutor.SaveState(Path.Combine(path, _executorStateFilename));
             }
@@ -135,30 +135,54 @@ namespace LLama
         }
 
         /// <summary>
-        /// Get the response from the LLama model. Note that prompt could not only be the preset words, 
-        /// but also the question you want to ask.
+        /// Generates a response for a given user prompt and manages history state for the user.
+        /// This will always pass the whole history to the model. Don't pass a whole history
+        /// to this method as the user prompt will be appended to the history of the current session.
+        /// If more control is needed, use the other overload of this method that accepts a ChatHistory object.
         /// </summary>
         /// <param name="prompt"></param>
         /// <param name="inferenceParams"></param>
         /// <param name="cancellationToken"></param>
-        /// <returns></returns>
+        /// <returns>Returns generated tokens of the assistant message.</returns>
         public async IAsyncEnumerable<string> ChatAsync(string prompt, IInferenceParams? inferenceParams = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
         {
-            foreach(var inputTransform in InputTransformPipeline)
+            foreach (var inputTransform in InputTransformPipeline)
                 prompt = inputTransform.Transform(prompt);
-            
-            History.Messages.AddRange(HistoryTransform.TextToHistory(AuthorRole.User, prompt).Messages);
+
+            History.Messages.Add(new ChatHistory.Message(AuthorRole.User, prompt));
+
+            string internalPrompt = HistoryTransform.HistoryToText(History);
+
             StringBuilder sb = new();
-            await foreach (var result in ChatAsyncInternal(prompt, inferenceParams, cancellationToken))
+
+            await foreach (var result in ChatAsyncInternal(internalPrompt, inferenceParams, cancellationToken))
             {
                 yield return result;
                 sb.Append(result);
             }
-            History.Messages.AddRange(HistoryTransform.TextToHistory(AuthorRole.Assistant, sb.ToString()).Messages);
+
+            string assistantMessage = sb.ToString();
+
+            // Strip anti-prompts (stop tokens) so only the generated response is stored in history.
+            // Null/empty entries are skipped because string.Replace throws on an empty search value.
+            if (inferenceParams?.AntiPrompts != null)
+            {
+                foreach (var stopToken in inferenceParams.AntiPrompts)
+                {
+                    if (!string.IsNullOrEmpty(stopToken))
+                    {
+                        assistantMessage = assistantMessage.Replace(stopToken, "");
+                    }
+                }
+            }
+
+            History.Messages.Add(new ChatHistory.Message(AuthorRole.Assistant, assistantMessage));
         }
 
         /// <summary>
-        /// Get the response from the LLama model with chat histories.
+        /// Generates a response for a given chat history. This method does not manage history state for the user.
+        /// If you want to e.g. truncate the history of a session to fit into the model's context window,
+        /// use this method and pass the truncated history to it. If you don't need this control, use the other
+        /// overload of this method that accepts a user prompt instead.
         /// </summary>
         /// <param name="history"></param>
         /// <param name="inferenceParams"></param>
@@ -167,14 +191,14 @@ namespace LLama
         public async IAsyncEnumerable<string> ChatAsync(ChatHistory history, IInferenceParams? inferenceParams = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
         {
             var prompt = HistoryTransform.HistoryToText(history);
-            History.Messages.AddRange(HistoryTransform.TextToHistory(AuthorRole.User, prompt).Messages);
+
             StringBuilder sb = new();
+
             await foreach (var result in ChatAsyncInternal(prompt, inferenceParams, cancellationToken))
             {
                 yield return result;
                 sb.Append(result);
             }
-            History.Messages.AddRange(HistoryTransform.TextToHistory(AuthorRole.Assistant, sb.ToString()).Messages);
         }
 
         private async IAsyncEnumerable<string> ChatAsyncInternal(string prompt, IInferenceParams? inferenceParams = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)

From 6ea40d15461a5243bfdd453bd6d2ede3cdaa5eaa Mon Sep 17 00:00:00 2001
From: Philipp Bauer <hello@philippbauer.org>
Date: Wed, 8 Nov 2023 13:18:32 -0600
Subject: [PATCH 06/12] Use full history only when the ChatSession runs the
 first time

---
 LLama/ChatSession.cs | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/LLama/ChatSession.cs b/LLama/ChatSession.cs
index 358d70c3..68c3c093 100644
--- a/LLama/ChatSession.cs
+++ b/LLama/ChatSession.cs
@@ -1,11 +1,14 @@
 ﻿using LLama.Abstractions;
 using LLama.Common;
+using System;
 using System.Collections.Generic;
 using System.IO;
+using System.Linq;
 using System.Runtime.CompilerServices;
 using System.Text;
 using System.Threading;
 using System.Threading.Tasks;
+using static LLama.InteractiveExecutor;
 
 namespace LLama
 {
@@ -151,11 +154,17 @@ namespace LLama
 
             History.Messages.Add(new ChatHistory.Message(AuthorRole.User, prompt));
 
-            string internalPrompt = HistoryTransform.HistoryToText(History);
+            if (_executor is InteractiveExecutor executor)
+            {
+                InteractiveExecutorState state = (InteractiveExecutorState)executor.GetStateData();
+                prompt = state.IsPromptRun
+                    ? HistoryTransform.HistoryToText(History)
+                    : prompt;
+            }
 
             StringBuilder sb = new();
 
-            await foreach (var result in ChatAsyncInternal(internalPrompt, inferenceParams, cancellationToken))
+            await foreach (var result in ChatAsyncInternal(prompt, inferenceParams, cancellationToken))
             {
                 yield return result;
                 sb.Append(result);
@@ -190,14 +199,28 @@ namespace LLama
         /// <returns></returns>
         public async IAsyncEnumerable<string> ChatAsync(ChatHistory history, IInferenceParams? inferenceParams = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
         {
-            var prompt = HistoryTransform.HistoryToText(history);
+            if (history.Messages.Count == 0)
+            {
+                throw new ArgumentException("History must contain at least one message.", nameof(history));
+            }
 
-            StringBuilder sb = new();
+            string prompt;
+            if (_executor is InteractiveExecutor executor)
+            {
+                InteractiveExecutorState state = (InteractiveExecutorState)executor.GetStateData();
+                // Prompt run: feed the caller-supplied history in full; afterwards only its newest message.
+                prompt = state.IsPromptRun
+                    ? HistoryTransform.HistoryToText(history)
+                    : history.Messages.Last().Content;
+            }
+            else
+            {
+                prompt = history.Messages.Last().Content;
+            }
 
             await foreach (var result in ChatAsyncInternal(prompt, inferenceParams, cancellationToken))
             {
                 yield return result;
-                sb.Append(result);
             }
         }
 

From d2b544afb8225600ff9b4d07112315d8eddbffd7 Mon Sep 17 00:00:00 2001
From: Philipp Bauer <hello@philippbauer.org>
Date: Wed, 8 Nov 2023 13:23:21 -0600
Subject: [PATCH 07/12] Improved method return description

---
 LLama/ChatSession.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/LLama/ChatSession.cs b/LLama/ChatSession.cs
index 68c3c093..7ee99590 100644
--- a/LLama/ChatSession.cs
+++ b/LLama/ChatSession.cs
@@ -146,7 +146,7 @@ namespace LLama
         /// <param name="prompt"></param>
         /// <param name="inferenceParams"></param>
         /// <param name="cancellationToken"></param>
-        /// <returns>Returns generated tokens of the assistant message.</returns>
+        /// <returns>Returns generated text of the assistant message.</returns>
         public async IAsyncEnumerable<string> ChatAsync(string prompt, IInferenceParams? inferenceParams = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
         {
             foreach (var inputTransform in InputTransformPipeline)
@@ -196,7 +196,7 @@ namespace LLama
         /// <param name="history"></param>
         /// <param name="inferenceParams"></param>
         /// <param name="cancellationToken"></param>
-        /// <returns></returns>
+        /// <returns>Returns generated text of the assistant message.</returns>
         public async IAsyncEnumerable<string> ChatAsync(ChatHistory history, IInferenceParams? inferenceParams = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
         {
             if (history.Messages.Count == 0)

From 1b4659dff955913be11b6084ce36578a9101c8f8 Mon Sep 17 00:00:00 2001
From: Chirag Karia <trichiragkaria@gmail.com>
Date: Sat, 11 Nov 2023 00:43:11 -0500
Subject: [PATCH 08/12] Update ToLLamaSharpChatHistory extension method to be
 public and support semantic-kernel author roles

---
 LLama.SemanticKernel/ExtensionMethods.cs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/LLama.SemanticKernel/ExtensionMethods.cs b/LLama.SemanticKernel/ExtensionMethods.cs
index 6a48eab0..0bb25411 100644
--- a/LLama.SemanticKernel/ExtensionMethods.cs
+++ b/LLama.SemanticKernel/ExtensionMethods.cs
@@ -3,9 +3,9 @@ using Microsoft.SemanticKernel.AI.ChatCompletion;
 
 namespace LLamaSharp.SemanticKernel;
 
-internal static class ExtensionMethods
+public static class ExtensionMethods
 {
-    internal static global::LLama.Common.ChatHistory ToLLamaSharpChatHistory(this ChatHistory chatHistory)
+    public static global::LLama.Common.ChatHistory ToLLamaSharpChatHistory(this ChatHistory chatHistory)
     {
         if (chatHistory is null)
         {
@@ -16,7 +16,7 @@ internal static class ExtensionMethods
 
         foreach (var chat in chatHistory)
         {
-            var role = Enum.TryParse<global::LLama.Common.AuthorRole>(chat.Role.Label, out var _role) ? _role : global::LLama.Common.AuthorRole.Unknown;
+            var role = Enum.TryParse<global::LLama.Common.AuthorRole>(chat.Role.Label, true, out var _role) ? _role : global::LLama.Common.AuthorRole.Unknown;
             history.AddMessage(role, chat.Content);
         }
 

From aa5e1ad54176f468aa157766d78b8446bbae5bac Mon Sep 17 00:00:00 2001
From: Chirag Karia <trichiragkaria@gmail.com>
Date: Sat, 11 Nov 2023 02:59:57 -0500
Subject: [PATCH 09/12] Add ignoreCase parameter to ToLLamaSharpChatHistory
 extension method

---
 LLama.SemanticKernel/ExtensionMethods.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/LLama.SemanticKernel/ExtensionMethods.cs b/LLama.SemanticKernel/ExtensionMethods.cs
index 0bb25411..b3ff6a7b 100644
--- a/LLama.SemanticKernel/ExtensionMethods.cs
+++ b/LLama.SemanticKernel/ExtensionMethods.cs
@@ -5,7 +5,7 @@ namespace LLamaSharp.SemanticKernel;
 
 public static class ExtensionMethods
 {
-    public static global::LLama.Common.ChatHistory ToLLamaSharpChatHistory(this ChatHistory chatHistory)
+    public static global::LLama.Common.ChatHistory ToLLamaSharpChatHistory(this ChatHistory chatHistory, bool ignoreCase = true)
     {
         if (chatHistory is null)
         {
@@ -16,7 +16,7 @@ public static class ExtensionMethods
 
         foreach (var chat in chatHistory)
         {
-            var role = Enum.TryParse<global::LLama.Common.AuthorRole>(chat.Role.Label, true, out var _role) ? _role : global::LLama.Common.AuthorRole.Unknown;
+            var role = Enum.TryParse<global::LLama.Common.AuthorRole>(chat.Role.Label, ignoreCase, out var _role) ? _role : global::LLama.Common.AuthorRole.Unknown;
             history.AddMessage(role, chat.Content);
         }
 

From 6de8d6219a78d545988bc94831bad60a79b7b3a8 Mon Sep 17 00:00:00 2001
From: SignalRT <admin@signalrt.com>
Date: Sat, 11 Nov 2023 09:05:17 +0100
Subject: [PATCH 10/12] Change SemanticKernel version to beta1 on Examples

---
 LLama.Examples/LLama.Examples.csproj | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LLama.Examples/LLama.Examples.csproj b/LLama.Examples/LLama.Examples.csproj
index 3ecacdfe..c1761829 100644
--- a/LLama.Examples/LLama.Examples.csproj
+++ b/LLama.Examples/LLama.Examples.csproj
@@ -29,7 +29,7 @@
 
   <ItemGroup>
     <PackageReference Include="Microsoft.Extensions.Logging.Console" Version="7.0.0" />
-    <PackageReference Include="Microsoft.SemanticKernel" Version="1.0.0-beta4" />
+    <PackageReference Include="Microsoft.SemanticKernel" Version="1.0.0-beta1" />
   </ItemGroup>
 
   <ItemGroup>

From 7691f8351614a6f6d915944f4be872dbc35fd76a Mon Sep 17 00:00:00 2001
From: SignalRT <admin@signalrt.com>
Date: Sat, 11 Nov 2023 13:12:07 +0100
Subject: [PATCH 11/12] Test build and nuget packages

---
 .github/prepare_release.sh                    |   8 +++++++-
 .github/workflows/compile.yml                 |  18 +++++++++---------
 .github/workflows/main.yml                    |   6 +++---
 LLama/LLamaSharp.Runtime.targets              |  12 ++++++------
 LLama/Native/NativeApi.cs                     |   4 ++--
 .../build/LLamaSharp.Backend.Cpu.nuspec       |   6 +++---
 .../ggml-metal.metal                          |   0
 .../{macos-arm64 => osx-arm64}/libllama.dylib | Bin
 .../{macos-x86_64 => osx-x64}/libllama.dylib  | Bin
 9 files changed, 30 insertions(+), 24 deletions(-)
 rename LLama/runtimes/{macos-arm64 => osx-arm64}/ggml-metal.metal (100%)
 rename LLama/runtimes/{macos-arm64 => osx-arm64}/libllama.dylib (100%)
 rename LLama/runtimes/{macos-x86_64 => osx-x64}/libllama.dylib (100%)

diff --git a/.github/prepare_release.sh b/.github/prepare_release.sh
index fd4427e8..e4409997 100755
--- a/.github/prepare_release.sh
+++ b/.github/prepare_release.sh
@@ -22,13 +22,19 @@ fi
 
 mkdir ./temp;
 mkdir ./temp/runtimes;
-cp ./LLama/runtimes/*.* ./temp/runtimes/;
+# For sure it could be done better but cp -R did not work on osx
+mkdir ./temp/runtimes/osx-arm64
+mkdir ./temp/runtimes/osx-x64
+cp  ./LLama/runtimes/*.* ./temp/runtimes/;
+cp  ./LLama/runtimes/osx-arm64/*.* ./temp/runtimes/osx-arm64/;
+cp  ./LLama/runtimes/osx-x64/*.* ./temp/runtimes/osx-x64;
 cp ./LLama/runtimes/build/*.* ./temp/;
 
 # get the current version
 cd temp;
 dotnet add package LLamaSharp;
 version=$(dotnet list temp.csproj package | grep LLamaSharp);
+# TODO: This didn't work on osx...we need a solution
 read -ra arr <<< "$version"
 version="${arr[-1]}"
 echo "The latest version: $version";
diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index 42cb98dc..5dda197c 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -6,9 +6,9 @@ on:
       cublas:
         type: boolean
         description: Build CUBLAS binaries
-      macos:
+      macos:
         type: boolean
-        description: Build MacOS binaries
+        description: Build OSX binaries
   push:
     branches: [cron_job]
   #schedule:
@@ -147,7 +147,7 @@ jobs:
         include:
           - build: 'arm64'
             defines: '-DCMAKE_OSX_ARCHITECTURES=arm64'
-          - build: 'x86_64'
+          - build: 'x64'
             defines: '-DLLAMA_METAL=OFF  -DCMAKE_OSX_ARCHITECTURES=x86_64'            
     runs-on: macos-latest   
     steps:
@@ -169,7 +169,7 @@ jobs:
         uses: actions/upload-artifact@v3
         with:
           path: ./build/libllama.dylib
-          name: llama-bin-macos-${{ matrix.build }}.dylib
+          name: llama-bin-osx-${{ matrix.build }}.dylib
       - name: Upload Metal
         uses: actions/upload-artifact@v3
         with:
@@ -212,12 +212,12 @@ jobs:
       - name: Rearrange MacOS files
         if: ${{ github.event.inputs.macos }}
         run: |
-          mkdir deps/macos-arm64
-          mkdir deps/macos-x86_64
+          mkdir deps/osx-arm64
+          mkdir deps/osx-x64
           
-          cp artifacts/llama-bin-macos-arm64.dylib/libllama.dylib deps/macos-arm64/libllama.dylib
-          cp artifacts/ggml-metal.metal/ggml-metal.metal deps/macos-arm64/ggml-metal.metal
-          cp artifacts/llama-bin-macos-x86_64.dylib/libllama.dylib deps/macos-x86_64/libllama.dylib
+          cp artifacts/llama-bin-osx-arm64.dylib/libllama.dylib deps/osx-arm64/libllama.dylib
+          cp artifacts/ggml-metal.metal/ggml-metal.metal deps/osx-arm64/ggml-metal.metal
+          cp artifacts/llama-bin-osx-x64.dylib/libllama.dylib deps/osx-x64/libllama.dylib
 
 
       - name: Rearrange CUDA files
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index db051001..0d958e0d 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -12,13 +12,13 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        build: [linux-release, windows-release, macos-release]
+        build: [linux-release, windows-release, osx-release]
         include:
           - build: linux-release
             os: ubuntu-latest
             config: release
-          - build: macos-release
-            os: macos-latest
+          - build: osx-release
+            os: osx-latest
             config: release
           - build: windows-release
             os: windows-2019
diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets
index bc9a6911..c14f0ffa 100644
--- a/LLama/LLamaSharp.Runtime.targets
+++ b/LLama/LLamaSharp.Runtime.targets
@@ -27,17 +27,17 @@
             <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
             <Link>libllama-cuda12.so</Link>
         </None>
-        <None Include="$(MSBuildThisFileDirectory)runtimes/macos-arm64/libllama.dylib">
+        <None Include="$(MSBuildThisFileDirectory)runtimes/osx-arm64/libllama.dylib">
             <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
-            <Link>runtimes/macos-arm64/libllama.dylib</Link>
+            <Link>runtimes/osx-arm64/libllama.dylib</Link>
         </None>
-        <None Include="$(MSBuildThisFileDirectory)runtimes/macos-arm64/ggml-metal.metal">
+        <None Include="$(MSBuildThisFileDirectory)runtimes/osx-arm64/ggml-metal.metal">
             <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
-            <Link>runtimes/macos-arm64/ggml-metal.metal</Link>
+            <Link>runtimes/osx-arm64/ggml-metal.metal</Link>
         </None>
-        <None Include="$(MSBuildThisFileDirectory)runtimes/macos-x86_64/libllama.dylib">
+        <None Include="$(MSBuildThisFileDirectory)runtimes/osx-x64/libllama.dylib">
             <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
-            <Link>runtimes/macos-x86_64/libllama.dylib</Link>
+            <Link>runtimes/osx-x64/libllama.dylib</Link>
         </None>
     </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index 119a36fb..fc408678 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -79,8 +79,8 @@ namespace LLama.Native
 
             if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
             {
-                return TryLoad("runtimes/macos-arm64/libllama.dylib", System.Runtime.Intrinsics.Arm.ArmBase.Arm64.IsSupported)
-                      ?? TryLoad("runtimes/macos-x86_64/libllama.dylib")  
+                return TryLoad("runtimes/osx-arm64/libllama.dylib", System.Runtime.Intrinsics.Arm.ArmBase.Arm64.IsSupported)
+                      ?? TryLoad("runtimes/osx-x64/libllama.dylib")  
                       ?? IntPtr.Zero;
             }
 #endif
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
index 5664be89..29466a1f 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
@@ -19,9 +19,9 @@
     <file src="LLamaSharpBackend.props" target="build/netstandard2.0/LLamaSharp.Backend.Cpu.props" />
     <file src="runtimes/libllama.dll" target="runtimes\win-x64\native\libllama.dll" />
     <file src="runtimes/libllama.so" target="runtimes\linux-x64\native\libllama.so" />
-    <file src="runtimes/macos-x86_64/libllama.dylib" target="runtimes\osx-x64\native\libllama.dylib" />
-    <file src="runtimes/macos-arm64/libllama.dylib" target="runtimes\osx-arm64\native\libllama.dylib" />
-    <file src="runtimes/macos-arm54/ggml-metal.metal" target="runtimes\osx-arm64\native\ggml-metal.metal" />
+    <file src="runtimes/osx-x64/libllama.dylib" target="runtimes\osx-x64\native\libllama.dylib" />
+    <file src="runtimes/osx-arm64/libllama.dylib" target="runtimes\osx-arm64\native\libllama.dylib" />
+    <file src="runtimes/osx-arm64/ggml-metal.metal" target="runtimes\osx-arm64\native\ggml-metal.metal" />
     <file src="icon512.png" target="icon512.png" />
   </files>
 </package>
diff --git a/LLama/runtimes/macos-arm64/ggml-metal.metal b/LLama/runtimes/osx-arm64/ggml-metal.metal
similarity index 100%
rename from LLama/runtimes/macos-arm64/ggml-metal.metal
rename to LLama/runtimes/osx-arm64/ggml-metal.metal
diff --git a/LLama/runtimes/macos-arm64/libllama.dylib b/LLama/runtimes/osx-arm64/libllama.dylib
similarity index 100%
rename from LLama/runtimes/macos-arm64/libllama.dylib
rename to LLama/runtimes/osx-arm64/libllama.dylib
diff --git a/LLama/runtimes/macos-x86_64/libllama.dylib b/LLama/runtimes/osx-x64/libllama.dylib
similarity index 100%
rename from LLama/runtimes/macos-x86_64/libllama.dylib
rename to LLama/runtimes/osx-x64/libllama.dylib

From 0a2b0abb618392cc194afde01347619ee346b573 Mon Sep 17 00:00:00 2001
From: SignalRT <admin@signalrt.com>
Date: Sun, 12 Nov 2023 00:27:53 +0100
Subject: [PATCH 12/12] Correct improper rename

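---
Note: the previous patch renamed the `macos` names to `osx` throughout, including
the matrix value `os: macos-latest` -> `os: osx-latest`. That value is presumably
what `runs-on` consumes (not visible in this hunk), and GitHub-hosted runner labels
are spelled `macos-*`, so only that one value is reverted here; the `osx-release`
build name is kept.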
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 0d958e0d..eb0e936f 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -18,7 +18,7 @@ jobs:
             os: ubuntu-latest
             config: release
           - build: osx-release
-            os: osx-latest
+            os: macos-latest
             config: release
           - build: windows-release
             os: windows-2019