using NumSharp;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace Tensorflow.Keras.Text
{
    /// <summary>
    /// Text tokenization API.
    /// This class allows to vectorize a text corpus, by turning each text into either a sequence of integers
    /// (each integer being the index of a token in a dictionary) or into a vector where the coefficient for
    /// each token could be binary, based on word count, based on tf-idf...
    /// </summary>
    public class Tokenizer
    {
        private readonly int num_words;
        private readonly string filters;
        private readonly bool lower;
        private readonly char split;
        private readonly bool char_level;
        private readonly string oov_token;
        private readonly Func<string, IEnumerable<string>> analyzer;

        private int document_count = 0;

        private Dictionary<string, int> word_docs = new Dictionary<string, int>();
        private Dictionary<string, int> word_counts = new Dictionary<string, int>();

        public Dictionary<string, int> word_index = null;
        public Dictionary<int, string> index_word = null;

        private Dictionary<int, int> index_docs = null;

        public Tokenizer(
            int num_words = -1,
            string filters = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n",
            bool lower = true,
            char split = ' ',
            bool char_level = false,
            string oov_token = null,
            Func<string, IEnumerable<string>> analyzer = null)
        {
            this.num_words = num_words;
            this.filters = filters;
            this.lower = lower;
            this.split = split;
            this.char_level = char_level;
            this.oov_token = oov_token;
            this.analyzer = analyzer;
        }

        /// <summary>
        /// Updates internal vocabulary based on a list of texts.
        /// </summary>
        /// <param name="texts">A list of strings, each containing one or more tokens.</param>
        /// <remarks>Required before using texts_to_sequences or texts_to_matrix.</remarks>
        public void fit_on_texts(IEnumerable<string> texts)
        {
            foreach (var text in texts)
            {
                document_count += 1;

                IEnumerable<string> seq;
                if (char_level)
                {
                    throw new NotImplementedException("char_level == true");
                }
                else
                {
                    seq = analyzer(lower ? text.ToLower() : text);
                }

                foreach (var w in seq)
                {
                    var count = 0;
                    word_counts.TryGetValue(w, out count);
                    word_counts[w] = count + 1;
                }

                // Count each distinct token once per document.
                foreach (var w in new HashSet<string>(seq))
                {
                    var count = 0;
                    word_docs.TryGetValue(w, out count);
                    word_docs[w] = count + 1;
                }
            }

            var wcounts = word_counts.AsEnumerable().ToList();
            wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value)); // Note: '-' gives us descending order.

            var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token };
            // Add from the sorted list so the vocabulary is ordered by descending frequency.
            sorted_voc.AddRange(wcounts.Select(kv => kv.Key));

            if (num_words > -1)
            {
                sorted_voc = sorted_voc.Take((oov_token == null) ? num_words : num_words + 1).ToList();
            }

            word_index = new Dictionary<string, int>(sorted_voc.Count);
            index_word = new Dictionary<int, string>(sorted_voc.Count);
            index_docs = new Dictionary<int, int>(word_docs.Count);

            // Index 0 is reserved; word indices start at 1.
            for (int i = 0; i < sorted_voc.Count; i++)
            {
                word_index.Add(sorted_voc[i], i + 1);
                index_word.Add(i + 1, sorted_voc[i]);
            }

            foreach (var kv in word_docs)
            {
                var idx = -1;
                if (word_index.TryGetValue(kv.Key, out idx))
                {
                    index_docs.Add(idx, kv.Value);
                }
            }
        }
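
        // Illustrative usage sketch (not part of the original source): fitting a
        // small corpus. The whitespace-splitting analyzer is an assumed stand-in;
        // the class calls the analyzer delegate directly, so one must be supplied.
        //
        //     var tokenizer = new Tokenizer(oov_token: "<unk>",
        //         analyzer: text => text.Split(' '));
        //     tokenizer.fit_on_texts(new[] { "the cat sat", "the dog sat on the mat" });
        //     // word_index: "<unk>" -> 1, "the" -> 2 (count 3), "sat" -> 3 (count 2),
        //     // then the remaining count-1 words in unspecified tie order.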

        /// <summary>
        /// Updates internal vocabulary based on a list of texts.
        /// </summary>
        /// <param name="texts">A list of lists of strings, each inner list containing one token per element.</param>
        /// <remarks>Required before using texts_to_sequences or texts_to_matrix.</remarks>
        public void fit_on_texts(IEnumerable<IList<string>> texts)
        {
            foreach (var seq in texts)
            {
                // Keep document_count consistent with the string overload.
                document_count += 1;

                foreach (var w in seq.Select(s => lower ? s.ToLower() : s))
                {
                    var count = 0;
                    word_counts.TryGetValue(w, out count);
                    word_counts[w] = count + 1;
                }

                // Count each distinct token of this document once (not every key in
                // word_counts, which would inflate document frequencies).
                foreach (var w in new HashSet<string>(seq.Select(s => lower ? s.ToLower() : s)))
                {
                    var count = 0;
                    word_docs.TryGetValue(w, out count);
                    word_docs[w] = count + 1;
                }
            }

            var wcounts = word_counts.AsEnumerable().ToList();
            wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value)); // descending by count

            var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token };
            sorted_voc.AddRange(wcounts.Select(kv => kv.Key));

            if (num_words > -1)
            {
                sorted_voc = sorted_voc.Take((oov_token == null) ? num_words : num_words + 1).ToList();
            }

            word_index = new Dictionary<string, int>(sorted_voc.Count);
            index_word = new Dictionary<int, string>(sorted_voc.Count);
            index_docs = new Dictionary<int, int>(word_docs.Count);

            for (int i = 0; i < sorted_voc.Count; i++)
            {
                word_index.Add(sorted_voc[i], i + 1);
                index_word.Add(i + 1, sorted_voc[i]);
            }

            foreach (var kv in word_docs)
            {
                var idx = -1;
                if (word_index.TryGetValue(kv.Key, out idx))
                {
                    index_docs.Add(idx, kv.Value);
                }
            }
        }

        /// <summary>
        /// Updates internal vocabulary based on a list of sequences.
        /// </summary>
        /// <param name="sequences">A list of integer sequences.</param>
        /// <remarks>Required before using sequences_to_matrix (if fit_on_texts was never called).</remarks>
        public void fit_on_sequences(IEnumerable<IList<int>> sequences)
        {
            throw new NotImplementedException("fit_on_sequences");
        }

        /// <summary>
        /// Transforms each string in texts to a sequence of integers.
        /// </summary>
        /// <param name="texts">A list of strings to convert.</param>
        /// <returns>A list of integer sequences.</returns>
        /// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
        public IList<int[]> texts_to_sequences(IEnumerable<string> texts)
        {
            return texts_to_sequences_generator(texts).ToArray();
        }

        /// <summary>
        /// Transforms each token list in texts to a sequence of integers.
        /// </summary>
        /// <param name="texts">A list of token lists to convert.</param>
        /// <returns>A list of integer sequences.</returns>
        /// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
        public IList<int[]> texts_to_sequences(IEnumerable<IList<string>> texts)
        {
            return texts_to_sequences_generator(texts).ToArray();
        }

        public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<string> texts)
        {
            int oov_index = -1;
            var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);

            return texts.Select(text =>
            {
                IEnumerable<string> seq;
                if (char_level)
                {
                    throw new NotImplementedException("char_level == true");
                }
                else
                {
                    seq = analyzer(lower ? text.ToLower() : text);
                }

                return ConvertToSequence(oov_index, seq).ToArray();
            });
        }

        public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<IList<string>> texts)
        {
            int oov_index = -1;
            var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);
            return texts.Select(seq => ConvertToSequence(oov_index, seq).ToArray());
        }

        private List<int> ConvertToSequence(int oov_index, IEnumerable<string> seq)
        {
            var vect = new List<int>();
            foreach (var w in seq.Select(s => lower ? s.ToLower() : s))
            {
                var i = -1;
                if (word_index.TryGetValue(w, out i))
                {
                    if (num_words != -1 && i >= num_words)
                    {
                        // Known word outside the num_words cutoff: substitute the
                        // OOV index if one is configured, otherwise drop it.
                        if (oov_index != -1)
                        {
                            vect.Add(oov_index);
                        }
                    }
                    else
                    {
                        vect.Add(i);
                    }
                }
                else if (oov_index != -1)
                {
                    vect.Add(oov_index);
                }
            }

            return vect;
        }
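
        // Illustrative sketch, continuing the hypothetical fit above: converting
        // text to index sequences. Known words map to their learned indices;
        // unknown words map to the oov_token index when one is configured, and
        // are silently dropped otherwise.
        //
        //     var seqs = tokenizer.texts_to_sequences(new[] { "the cat barked" });
        //     // seqs[0] starts with 2 ("the"), then the index learned for "cat",
        //     // and ends with 1, the "<unk>" index, since "barked" was never fitted.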

        /// <summary>
        /// Transforms each sequence into a list of text.
        /// </summary>
        /// <param name="sequences">A list of integer sequences to convert.</param>
        /// <returns>A list of texts (strings).</returns>
        /// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
        public IList<string> sequences_to_texts(IEnumerable<IList<int>> sequences)
        {
            return sequences_to_texts_generator(sequences).ToArray();
        }

        public IEnumerable<string> sequences_to_texts_generator(IEnumerable<IList<int>> sequences)
        {
            int oov_index = -1;
            var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);

            return sequences.Select(seq =>
            {
                var bldr = new StringBuilder();
                for (var i = 0; i < seq.Count; i++)
                {
                    if (i > 0) bldr.Append(' ');

                    string word = null;
                    if (index_word.TryGetValue(seq[i], out word))
                    {
                        // Compare the token index itself (not the loop position)
                        // against the num_words cutoff.
                        if (num_words != -1 && seq[i] >= num_words)
                        {
                            if (oov_index != -1)
                            {
                                bldr.Append(oov_token);
                            }
                        }
                        else
                        {
                            bldr.Append(word);
                        }
                    }
                    else if (oov_index != -1)
                    {
                        bldr.Append(oov_token);
                    }
                }

                return bldr.ToString();
            });
        }

        /// <summary>
        /// Converts a list of sequences into a Numpy matrix.
        /// </summary>
        /// <param name="sequences">A list of integer sequences to convert.</param>
        public NDArray sequences_to_matrix(IEnumerable<IList<int>> sequences)
        {
            throw new NotImplementedException("sequences_to_matrix");
        }
    }
}
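
// Round-trip sketch (illustrative; builds on the hypothetical tokenizer from the
// earlier comment examples): sequences produced by texts_to_sequences can be
// mapped back to strings, with out-of-vocabulary indices rendered as oov_token.
//
//     var seqs = tokenizer.texts_to_sequences(new[] { "the cat barked" });
//     var texts = tokenizer.sequences_to_texts(seqs);
//     // texts[0] == "the cat <unk>"  (assuming "barked" was never fitted)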