using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Tensorflow.NumPy;

namespace Tensorflow.Keras.Text
{
    /// <summary>
    /// Text tokenization API.
    /// This class vectorizes a text corpus, turning each text into either a sequence of integers
    /// (each integer being the index of a token in a dictionary) or into a vector where the
    /// coefficient for each token could be binary, based on word count, or based on tf-idf.
    /// </summary>
    /// <remarks>
    /// This code is a fairly straight port of the Python code for Keras text preprocessing found at:
    /// https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py
    /// </remarks>
    public class Tokenizer
    {
        private readonly int num_words;
        private readonly string filters;
        private readonly bool lower;
        private readonly char split;
        private readonly bool char_level;
        private readonly string oov_token;
        private readonly Func<string, IEnumerable<string>> analyzer;

        private int document_count = 0;

        private Dictionary<string, int> word_docs = new Dictionary<string, int>();
        private Dictionary<string, int> word_counts = new Dictionary<string, int>();

        public Dictionary<string, int> word_index = null;
        public Dictionary<int, string> index_word = null;

        private Dictionary<int, int> index_docs = null;

        public Tokenizer(
            int num_words = -1,
            string filters = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n",
            bool lower = true,
            char split = ' ',
            bool char_level = false,
            string oov_token = null,
            Func<string, IEnumerable<string>> analyzer = null)
        {
            this.num_words = num_words;
            this.filters = filters;
            this.lower = lower;
            this.split = split;
            this.char_level = char_level;
            this.oov_token = oov_token;
            this.analyzer = analyzer != null ? analyzer : (text) => TextApi.text_to_word_sequence(text, filters, lower, split);
        }

        /// <summary>
        /// Updates internal vocabulary based on a list of texts.
        /// </summary>
        /// <param name="texts">A list of strings, each containing one or more tokens.</param>
        /// <remarks>Required before using texts_to_sequences or texts_to_matrix.</remarks>
        public void fit_on_texts(IEnumerable<string> texts)
        {
            foreach (var text in texts)
            {
                IEnumerable<string> seq = null;

                document_count += 1;
                if (char_level)
                {
                    throw new NotImplementedException("char_level == true");
                }
                else
                {
                    seq = analyzer(lower ? text.ToLower() : text);
                }

                foreach (var w in seq)
                {
                    var count = 0;
                    word_counts.TryGetValue(w, out count);
                    word_counts[w] = count + 1;
                }

                foreach (var w in new HashSet<string>(seq))
                {
                    var count = 0;
                    word_docs.TryGetValue(w, out count);
                    word_docs[w] = count + 1;
                }
            }

            var wcounts = word_counts.AsEnumerable().ToList();
            wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value)); // Note: '-' gives us descending order.

            var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token };
            // Use the sorted list (wcounts), not the unsorted dictionary, so that more
            // frequent words receive lower indices.
            sorted_voc.AddRange(wcounts.Select(kv => kv.Key));

            if (num_words > -1)
            {
                sorted_voc = sorted_voc.Take((oov_token == null) ? num_words : num_words + 1).ToList();
            }

            // Index 0 is reserved; assigned word indices start at 1.
            word_index = new Dictionary<string, int>(sorted_voc.Count);
            index_word = new Dictionary<int, string>(sorted_voc.Count);
            index_docs = new Dictionary<int, int>(word_docs.Count);

            for (int i = 0; i < sorted_voc.Count; i++)
            {
                word_index.Add(sorted_voc[i], i + 1);
                index_word.Add(i + 1, sorted_voc[i]);
            }

            foreach (var kv in word_docs)
            {
                var idx = -1;
                if (word_index.TryGetValue(kv.Key, out idx))
                {
                    index_docs.Add(idx, kv.Value);
                }
            }
        }
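        // Illustrative sketch (hypothetical example, not part of the original file): after
        // fitting, word_index maps each token to a 1-based rank by descending frequency,
        // with the OOV token (when configured) always taking index 1.
        //
        //   var tokenizer = new Tokenizer(num_words: 100, oov_token: "<OOV>");
        //   tokenizer.fit_on_texts(new[] { "a a a b b c" });
        //   // word_index: { "<OOV>": 1, "a": 2, "b": 3, "c": 4 }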
        /// <summary>
        /// Updates internal vocabulary based on a list of texts.
        /// </summary>
        /// <param name="texts">A list of lists of strings, each inner string containing one token.</param>
        /// <remarks>Required before using texts_to_sequences or texts_to_matrix.</remarks>
        public void fit_on_texts(IEnumerable<IList<string>> texts)
        {
            foreach (var seq in texts)
            {
                document_count += 1; // Mirror the string overload so the 'tfidf' mode stays usable.

                foreach (var w in seq.Select(s => lower ? s.ToLower() : s))
                {
                    var count = 0;
                    word_counts.TryGetValue(w, out count);
                    word_counts[w] = count + 1;
                }

                // Count document frequency over the unique tokens of this sequence only,
                // not over every word seen so far.
                foreach (var w in new HashSet<string>(seq.Select(s => lower ? s.ToLower() : s)))
                {
                    var count = 0;
                    word_docs.TryGetValue(w, out count);
                    word_docs[w] = count + 1;
                }
            }

            var wcounts = word_counts.AsEnumerable().ToList();
            wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value)); // Descending order by count.

            var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token };
            sorted_voc.AddRange(wcounts.Select(kv => kv.Key));

            if (num_words > -1)
            {
                sorted_voc = sorted_voc.Take((oov_token == null) ? num_words : num_words + 1).ToList();
            }

            word_index = new Dictionary<string, int>(sorted_voc.Count);
            index_word = new Dictionary<int, string>(sorted_voc.Count);
            index_docs = new Dictionary<int, int>(word_docs.Count);

            for (int i = 0; i < sorted_voc.Count; i++)
            {
                word_index.Add(sorted_voc[i], i + 1);
                index_word.Add(i + 1, sorted_voc[i]);
            }

            foreach (var kv in word_docs)
            {
                var idx = -1;
                if (word_index.TryGetValue(kv.Key, out idx))
                {
                    index_docs.Add(idx, kv.Value);
                }
            }
        }

        /// <summary>
        /// Updates internal vocabulary based on a list of sequences.
        /// </summary>
        /// <param name="sequences"></param>
        /// <remarks>Required before using sequences_to_matrix (if fit_on_texts was never called).</remarks>
        public void fit_on_sequences(IEnumerable<int[]> sequences)
        {
            throw new NotImplementedException("fit_on_sequences");
        }

        /// <summary>
        /// Transforms each string in texts to a sequence of integers.
        /// </summary>
        /// <param name="texts"></param>
        /// <returns></returns>
        /// <remarks>Only the top num_words-1 most frequent words are taken into account; only words known by the tokenizer are taken into account.</remarks>
        public IList<int[]> texts_to_sequences(IEnumerable<string> texts)
        {
            return texts_to_sequences_generator(texts).ToArray();
        }

        /// <summary>
        /// Transforms each list of tokens in texts to a sequence of integers.
        /// </summary>
        /// <param name="texts"></param>
        /// <returns></returns>
        /// <remarks>Only the top num_words-1 most frequent words are taken into account; only words known by the tokenizer are taken into account.</remarks>
        public IList<int[]> texts_to_sequences(IEnumerable<IList<string>> texts)
        {
            return texts_to_sequences_generator(texts).ToArray();
        }

        public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<string> texts)
        {
            int oov_index = -1;
            var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);

            return texts.Select(text =>
            {
                IEnumerable<string> seq = null;

                if (char_level)
                {
                    throw new NotImplementedException("char_level == true");
                }
                else
                {
                    seq = analyzer(lower ? text.ToLower() : text);
                }

                return ConvertToSequence(oov_index, seq).ToArray();
            });
        }

        public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<IList<string>> texts)
        {
            int oov_index = -1;
            var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);
            return texts.Select(seq => ConvertToSequence(oov_index, seq).ToArray());
        }

        private List<int> ConvertToSequence(int oov_index, IEnumerable<string> seq)
        {
            var vect = new List<int>();
            foreach (var w in seq.Select(s => lower ? s.ToLower() : s))
            {
                var i = -1;
                if (word_index.TryGetValue(w, out i))
                {
                    if (num_words != -1 && i >= num_words)
                    {
                        // In-vocabulary but above the num_words cutoff: emit the OOV index, if any.
                        if (oov_index != -1)
                        {
                            vect.Add(oov_index);
                        }
                    }
                    else
                    {
                        vect.Add(i);
                    }
                }
                else if (oov_index != -1)
                {
                    vect.Add(oov_index);
                }
            }

            return vect;
        }
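        // Illustrative sketch (continuing the hypothetical example above): tokens that are
        // unknown, or whose index falls at or above num_words, map to the OOV index.
        //
        //   tokenizer.texts_to_sequences(new[] { "a c d" });
        //   // -> { new[] { 2, 4, 1 } }   ("d" was never seen, so it becomes "<OOV>" = 1)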
        /// <summary>
        /// Transforms each sequence into a list of text.
        /// </summary>
        /// <param name="sequences"></param>
        /// <returns>A list of texts (strings).</returns>
        /// <remarks>Only the top num_words-1 most frequent words are taken into account; only words known by the tokenizer are taken into account.</remarks>
        public IList<string> sequences_to_texts(IEnumerable<IList<int>> sequences)
        {
            return sequences_to_texts_generator(sequences).ToArray();
        }

        public IEnumerable<string> sequences_to_texts_generator(IEnumerable<IList<int>> sequences)
        {
            int oov_index = -1;
            var _ = (oov_token != null) && word_index.TryGetValue(oov_token, out oov_index);

            return sequences.Select(seq =>
            {
                var bldr = new StringBuilder();
                for (var i = 0; i < seq.Count; i++)
                {
                    if (i > 0) bldr.Append(' ');

                    string word = null;
                    if (index_word.TryGetValue(seq[i], out word))
                    {
                        // Compare the token index (seq[i]), not the loop position, against num_words.
                        if (num_words != -1 && seq[i] >= num_words)
                        {
                            if (oov_index != -1)
                            {
                                bldr.Append(oov_token);
                            }
                        }
                        else
                        {
                            bldr.Append(word);
                        }
                    }
                    else if (oov_index != -1)
                    {
                        bldr.Append(oov_token);
                    }
                }

                return bldr.ToString();
            });
        }

        /// <summary>
        /// Converts a list of texts to a NumPy matrix.
        /// </summary>
        /// <param name="texts">A sequence of strings, each containing one or more tokens.</param>
        /// <param name="mode">One of "binary", "count", "tfidf", "freq".</param>
        /// <returns></returns>
        public NDArray texts_to_matrix(IEnumerable<string> texts, string mode = "binary")
        {
            return sequences_to_matrix(texts_to_sequences(texts), mode);
        }

        /// <summary>
        /// Converts a list of texts to a NumPy matrix.
        /// </summary>
        /// <param name="texts">A sequence of lists of strings, each containing one token.</param>
        /// <param name="mode">One of "binary", "count", "tfidf", "freq".</param>
        /// <returns></returns>
        public NDArray texts_to_matrix(IEnumerable<IList<string>> texts, string mode = "binary")
        {
            return sequences_to_matrix(texts_to_sequences(texts), mode);
        }

        /// <summary>
        /// Converts a list of sequences into a NumPy matrix.
        /// </summary>
        /// <param name="sequences">A sequence of lists of integers, encoding tokens.</param>
        /// <param name="mode">One of "binary", "count", "tfidf", "freq".</param>
        /// <returns></returns>
        public NDArray sequences_to_matrix(IEnumerable<IList<int>> sequences, string mode = "binary")
        {
            if (!modes.Contains(mode)) throw new InvalidArgumentError($"Unknown vectorization mode: {mode}");
            var word_count = 0;

            if (num_words == -1)
            {
                if (word_index != null)
                {
                    word_count = word_index.Count + 1;
                }
                else
                {
                    throw new InvalidOperationException("Specify a dimension ('num_words' argument), or fit on some text data first.");
                }
            }
            else
            {
                word_count = num_words;
            }

            if (mode == "tfidf" && this.document_count == 0)
            {
                throw new InvalidOperationException("Fit the Tokenizer on some text data before using the 'tfidf' mode.");
            }

            var x = np.zeros((sequences.Count(), word_count));

            for (int i = 0; i < sequences.Count(); i++)
            {
                var seq = sequences.ElementAt(i);
                if (seq == null || seq.Count == 0)
                    continue;

                var counts = new Dictionary<int, int>();
                var seq_length = seq.Count;

                foreach (var j in seq)
                {
                    if (j >= word_count)
                        continue;
                    var count = 0;
                    counts.TryGetValue(j, out count);
                    counts[j] = count + 1;
                }

                if (mode == "count")
                {
                    foreach (var kv in counts)
                    {
                        var j = kv.Key;
                        var c = kv.Value + 0.0;
                        x[i, j] = c;
                    }
                }
                else if (mode == "freq")
                {
                    foreach (var kv in counts)
                    {
                        var j = kv.Key;
                        var c = kv.Value + 0.0;
                        x[i, j] = c / seq_length;
                    }
                }
                else if (mode == "binary")
                {
                    foreach (var kv in counts)
                    {
                        var j = kv.Key;
                        x[i, j] = 1.0;
                    }
                }
                else if (mode == "tfidf")
                {
                    foreach (var kv in counts)
                    {
                        var j = kv.Key;
                        var c = kv.Value + 0.0;
                        var id = 0;
                        var _ = index_docs.TryGetValue(j, out id);
                        var tf = 1.0 + np.log(c);
                        // Cast to double to avoid integer division in the idf term.
                        var idf = np.log(1.0 + (double)document_count / (1 + id));
                        x[i, j] = tf * idf;
                    }
                }
            }

            return x;
        }

        private string[] modes = new string[] { "binary", "count", "tfidf", "freq" };
    }
}
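// Illustrative usage sketch (hypothetical, continuing the example above; not part of the
// original file):
//
//   var m = tokenizer.texts_to_matrix(new[] { "a b b" }, mode: "count");
//   // m has shape (1, 100); m[0, 2] == 1.0 (one "a") and m[0, 3] == 2.0 (two "b").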