From daf09eae6483deb7d22617a4948f2208d99f3375 Mon Sep 17 00:00:00 2001
From: Martin Evans
Date: Tue, 12 Sep 2023 01:03:27 +0100
Subject: [PATCH] Skipping tokenization of empty strings (saves allocating an empty array every time)

---
Reviewer notes (this section is ignored when the patch is applied):
  * LLamaContextTests.cs gains three tests: the token ids expected for
    "The quick brown fox" with and without the leading BOS token (id 1 in
    the expected array), and the empty-string case with add_bos == false,
    which must yield an empty sequence.
  * SafeLLamaContextHandle.cs gains an early return placed directly after
    ThrowIfDisposed() and before the byte-count estimate: when the text is
    null or empty and no BOS token is requested, the shared
    Array.Empty<int>() instance is returned.
  * Array.Empty<T>() has no parameters to infer T from, so the explicit
    <int> type argument is required at both call sites.

 LLama.Unittest/LLamaContextTests.cs    | 24 ++++++++++++++++++++++++
 LLama/Native/SafeLLamaContextHandle.cs |  3 +++
 2 files changed, 27 insertions(+)

diff --git a/LLama.Unittest/LLamaContextTests.cs b/LLama.Unittest/LLamaContextTests.cs
index d2947624..198511f1 100644
--- a/LLama.Unittest/LLamaContextTests.cs
+++ b/LLama.Unittest/LLamaContextTests.cs
@@ -32,5 +32,29 @@ namespace LLama.Unittest
             Assert.Equal(32000, _context.VocabCount);
             Assert.Equal(0, _context.KVCacheTokenCount);
         }
+
+        [Fact]
+        public void Tokenize()
+        {
+            var tokens = _context.Tokenize("The quick brown fox", true);
+
+            Assert.Equal(new[] { 1, 450, 4996, 17354, 1701, 29916 }, tokens);
+        }
+
+        [Fact]
+        public void TokenizeWithoutBOS()
+        {
+            var tokens = _context.Tokenize("The quick brown fox", false);
+
+            Assert.Equal(new[] { 450, 4996, 17354, 1701, 29916 }, tokens);
+        }
+
+        [Fact]
+        public void TokenizeEmpty()
+        {
+            var tokens = _context.Tokenize("", false);
+
+            Assert.Equal(Array.Empty<int>(), tokens);
+        }
     }
 }
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index 228ccde3..26fd011b 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -146,6 +146,9 @@ namespace LLama.Native
         {
             ThrowIfDisposed();
 
+            if (string.IsNullOrEmpty(text) && !add_bos)
+                return Array.Empty<int>();
+
             // Calculate number of bytes in string, this is a pessimistic estimate of token count. It can't
             // possibly be more than this.
             var count = encoding.GetByteCount(text) + (add_bos ? 1 : 0);