using Tensorflow.NumPy;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Tensorflow.Keras.Text
{
/// <summary>
/// Text tokenization API.
/// This class allows you to vectorize a text corpus, by turning each text into either a sequence of integers
/// (each integer being the index of a token in a dictionary) or into a vector where the coefficient for
/// each token could be binary, based on word count, based on tf-idf...
/// </summary>
/// <remarks>
/// This code is a fairly straight port of the Python code for Keras text preprocessing found at:
/// https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py
/// </remarks>
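/// <example>
/// A minimal end-to-end sketch (the two-document corpus is hypothetical):
/// <code>
/// var tokenizer = new Tokenizer(num_words: 100, oov_token: "[UNK]");
/// tokenizer.fit_on_texts(new[] { "the cat sat on the mat", "the dog sat on the log" });
/// var seqs = tokenizer.texts_to_sequences(new[] { "the cat chased the dog" });
/// // "chased" was never seen during fitting, so it maps to the oov_token index.
/// </code>
/// </example>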
public class Tokenizer
{
private readonly int num_words;
private readonly string filters;
private readonly bool lower;
private readonly char split;
private readonly bool char_level;
private readonly string oov_token;
private readonly Func<string, IEnumerable<string>> analyzer;
private int document_count = 0;
private Dictionary<string, int> word_docs = new Dictionary<string, int>();
private Dictionary<string, int> word_counts = new Dictionary<string, int>();
public Dictionary<string, int> word_index = null;
public Dictionary<int, string> index_word = null;
private Dictionary<int, int> index_docs = null;
public Tokenizer(
int num_words = -1,
string filters = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n",
bool lower = true,
char split = ' ',
bool char_level = false,
string oov_token = null,
Func<string, IEnumerable<string>> analyzer = null)
{
this.num_words = num_words;
this.filters = filters;
this.lower = lower;
this.split = split;
this.char_level = char_level;
this.oov_token = oov_token;
this.analyzer = analyzer != null ? analyzer : (text) => TextApi.text_to_word_sequence(text, filters, lower, split);
}
/// <summary>
/// Updates internal vocabulary based on a list of texts.
/// </summary>
/// <param name="texts">A list of strings, each containing one or more tokens.</param>
/// <remarks>Required before using texts_to_sequences or texts_to_matrix.</remarks>
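/// <example>
/// A minimal sketch of fitting and inspecting the learned vocabulary (the corpus is hypothetical):
/// <code>
/// var tokenizer = new Tokenizer();
/// tokenizer.fit_on_texts(new[] { "hello world", "hello again" });
/// // word_index maps each token to a 1-based rank by frequency,
/// // e.g. tokenizer.word_index["hello"] == 1.
/// </code>
/// </example>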
public void fit_on_texts(IEnumerable<string> texts)
{
foreach (var text in texts)
{
IEnumerable<string> seq = null;
document_count += 1;
if (char_level)
{
throw new NotImplementedException("char_level == true");
}
else
{
seq = analyzer(lower ? text.ToLower() : text);
}
foreach (var w in seq)
{
var count = 0;
word_counts.TryGetValue(w, out count);
word_counts[w] = count + 1;
}
foreach (var w in new HashSet<string>(seq))
{
var count = 0;
word_docs.TryGetValue(w, out count);
word_docs[w] = count + 1;
}
}
var wcounts = word_counts.ToList();
wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value)); // Note: '-' gives us descending order.
var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token };
sorted_voc.AddRange(wcounts.Select(kv => kv.Key)); // take keys from the sorted list, not the unordered dictionary
if (num_words > -1)
{
sorted_voc = sorted_voc.Take((oov_token == null) ? num_words : num_words + 1).ToList();
}
word_index = new Dictionary<string, int>(sorted_voc.Count);
index_word = new Dictionary<int, string>(sorted_voc.Count);
index_docs = new Dictionary<int, int>(word_docs.Count);
for (int i = 0; i < sorted_voc.Count; i++)
{
word_index.Add(sorted_voc[i], i + 1);
index_word.Add(i + 1, sorted_voc[i]);
}
foreach (var kv in word_docs)
{
var idx = -1;
if (word_index.TryGetValue(kv.Key, out idx))
{
index_docs.Add(idx, kv.Value);
}
}
}
/// <summary>
/// Updates internal vocabulary based on a list of token lists.
/// </summary>
/// <param name="texts">A list of lists of strings, each inner string containing one token.</param>
/// <remarks>Required before using texts_to_sequences or texts_to_matrix.</remarks>
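/// <example>
/// A minimal sketch with pre-tokenized input (the data is hypothetical):
/// <code>
/// tokenizer.fit_on_texts(new[] { new[] { "hello", "world" }, new[] { "hello", "again" } });
/// </code>
/// </example>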
public void fit_on_texts(IEnumerable<IList<string>> texts)
{
foreach (var seq in texts)
{
document_count += 1; // track documents so the "tfidf" mode has a denominator
var lseq = seq.Select(s => lower ? s.ToLower() : s).ToList();
foreach (var w in lseq)
{
var count = 0;
word_counts.TryGetValue(w, out count);
word_counts[w] = count + 1;
}
// Count each distinct token once per document (the original iterated all of
// word_counts.Keys here, inflating word_docs).
foreach (var w in new HashSet<string>(lseq))
{
var count = 0;
word_docs.TryGetValue(w, out count);
word_docs[w] = count + 1;
}
}
var wcounts = word_counts.ToList();
wcounts.Sort((kv1, kv2) => -kv1.Value.CompareTo(kv2.Value)); // descending by count
var sorted_voc = (oov_token == null) ? new List<string>() : new List<string>() { oov_token };
sorted_voc.AddRange(wcounts.Select(kv => kv.Key)); // take keys from the sorted list, not the unordered dictionary
if (num_words > -1)
{
sorted_voc = sorted_voc.Take((oov_token == null) ? num_words : num_words + 1).ToList();
}
word_index = new Dictionary<string, int>(sorted_voc.Count);
index_word = new Dictionary<int, string>(sorted_voc.Count);
index_docs = new Dictionary<int, int>(word_docs.Count);
for (int i = 0; i < sorted_voc.Count; i++)
{
word_index.Add(sorted_voc[i], i + 1);
index_word.Add(i + 1, sorted_voc[i]);
}
foreach (var kv in word_docs)
{
var idx = -1;
if (word_index.TryGetValue(kv.Key, out idx))
{
index_docs.Add(idx, kv.Value);
}
}
}
/// <summary>
/// Updates internal vocabulary based on a list of sequences.
/// </summary>
/// <param name="sequences">A list of token-index sequences.</param>
/// <remarks>Required before using sequences_to_matrix (if fit_on_texts was never called).</remarks>
public void fit_on_sequences(IEnumerable<IList<int>> sequences)
{
throw new NotImplementedException("fit_on_sequences");
}
/// <summary>
/// Transforms each string in texts to a sequence of integers.
/// </summary>
/// <param name="texts">A sequence of strings, each containing one or more tokens.</param>
/// <returns>A list of integer token-index sequences.</returns>
/// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
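/// <example>
/// A minimal sketch (assumes the tokenizer was already fitted on a corpus containing these words):
/// <code>
/// var seqs = tokenizer.texts_to_sequences(new[] { "the cat sat" });
/// // seqs[0] holds the integer index of each known token, in order.
/// </code>
/// </example>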
public IList<int[]> texts_to_sequences(IEnumerable<string> texts)
{
return texts_to_sequences_generator(texts).ToArray();
}
/// <summary>
/// Transforms each sequence of tokens in texts to a sequence of integers.
/// </summary>
/// <param name="texts">A sequence of token lists.</param>
/// <returns>A list of integer token-index sequences.</returns>
/// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
public IList<int[]> texts_to_sequences(IEnumerable<IList<string>> texts)
{
return texts_to_sequences_generator(texts).ToArray();
}
public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<string> texts)
{
int oov_index = -1;
if (oov_token != null) word_index.TryGetValue(oov_token, out oov_index);
return texts.Select(text =>
{
IEnumerable<string> seq = null;
if (char_level)
{
throw new NotImplementedException("char_level == true");
}
else
{
seq = analyzer(lower ? text.ToLower() : text);
}
return ConvertToSequence(oov_index, seq).ToArray();
});
}
public IEnumerable<int[]> texts_to_sequences_generator(IEnumerable<IList<string>> texts)
{
int oov_index = -1;
if (oov_token != null) word_index.TryGetValue(oov_token, out oov_index);
return texts.Select(seq => ConvertToSequence(oov_index, seq).ToArray());
}
private List<int> ConvertToSequence(int oov_index, IEnumerable<string> seq)
{
var vect = new List<int>();
foreach (var w in seq.Select(s => lower ? s.ToLower() : s))
{
var i = -1;
if (word_index.TryGetValue(w, out i))
{
if (num_words != -1 && i >= num_words)
{
if (oov_index != -1)
{
vect.Add(oov_index);
}
}
else
{
vect.Add(i);
}
}
else if (oov_index != -1)
{
vect.Add(oov_index);
}
}
return vect;
}
/// <summary>
/// Transforms each sequence into a list of text.
/// </summary>
/// <param name="sequences">A sequence of integer token-index lists.</param>
/// <returns>A list of texts (strings).</returns>
/// <remarks>Only the top num_words-1 most frequent words will be taken into account. Only words known by the tokenizer will be taken into account.</remarks>
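/// <example>
/// A minimal round-trip sketch (assumes a fitted tokenizer with lower == true):
/// <code>
/// var seqs = tokenizer.texts_to_sequences(new[] { "hello world" });
/// var texts = tokenizer.sequences_to_texts(seqs);
/// // texts[0] == "hello world", modulo filtered characters and casing.
/// </code>
/// </example>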
public IList<string> sequences_to_texts(IEnumerable<IList<int>> sequences)
{
return sequences_to_texts_generator(sequences).ToArray();
}
public IEnumerable<string> sequences_to_texts_generator(IEnumerable<IList<int>> sequences)
{
int oov_index = -1;
if (oov_token != null) word_index.TryGetValue(oov_token, out oov_index);
return sequences.Select(seq =>
{
var bldr = new StringBuilder();
for (var i = 0; i < seq.Count; i++)
{
if (i > 0) bldr.Append(' ');
string word = null;
if (index_word.TryGetValue(seq[i], out word))
{
// Compare the token index itself, not the loop position.
if (num_words != -1 && seq[i] >= num_words)
{
if (oov_index != -1)
{
bldr.Append(oov_token);
}
}
else
{
bldr.Append(word);
}
}
else if (oov_index != -1)
{
bldr.Append(oov_token);
}
}
return bldr.ToString();
});
}
/// <summary>
/// Converts a list of texts to a Numpy matrix.
/// </summary>
/// <param name="texts">A sequence of strings, each containing one or more tokens.</param>
/// <param name="mode">One of "binary", "count", "tfidf", "freq".</param>
/// <returns>A matrix with one row per input text.</returns>
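/// <example>
/// A minimal sketch of binary vectorization (assumes a tokenizer fitted on a corpus containing these words):
/// <code>
/// var m = tokenizer.texts_to_matrix(new[] { "hello world" }, mode: "binary");
/// // m has shape (1, vocabulary size); the columns for "hello" and "world" are 1.0.
/// </code>
/// </example>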
public NDArray texts_to_matrix(IEnumerable<string> texts, string mode = "binary")
{
return sequences_to_matrix(texts_to_sequences(texts), mode);
}
/// <summary>
/// Converts a list of texts to a Numpy matrix.
/// </summary>
/// <param name="texts">A sequence of lists of strings, each containing one token.</param>
/// <param name="mode">One of "binary", "count", "tfidf", "freq".</param>
/// <returns>A matrix with one row per input text.</returns>
public NDArray texts_to_matrix(IEnumerable<IList<string>> texts, string mode = "binary")
{
return sequences_to_matrix(texts_to_sequences(texts), mode);
}
/// <summary>
/// Converts a list of sequences into a Numpy matrix.
/// </summary>
/// <param name="sequences">A sequence of lists of integers, encoding tokens.</param>
/// <param name="mode">One of "binary", "count", "tfidf", "freq".</param>
/// <returns>A matrix with one row per input sequence.</returns>
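/// <example>
/// A minimal sketch of per-row counting (token indices are hypothetical; assumes the vocabulary covers them):
/// <code>
/// var m = tokenizer.sequences_to_matrix(new[] { new[] { 1, 2, 2 } }, mode: "count");
/// // m[0, 2] == 2.0 because token index 2 occurs twice in the first sequence.
/// </code>
/// </example>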
public NDArray sequences_to_matrix(IEnumerable<IList<int>> sequences, string mode = "binary")
{
if (!modes.Contains(mode)) throw new InvalidArgumentError($"Unknown vectorization mode: {mode}");
var word_count = 0;
if (num_words == -1)
{
if (word_index != null)
{
word_count = word_index.Count + 1;
}
else
{
throw new InvalidOperationException("Specify a dimension ('num_words' argument), or fit on some text data first.");
}
}
else
{
word_count = num_words;
}
if (mode == "tfidf" && this.document_count == 0)
{
throw new InvalidOperationException("Fit the Tokenizer on some text data before using the 'tfidf' mode.");
}
var seqs = sequences.ToList(); // materialize once instead of re-enumerating with Count()/ElementAt()
var x = np.zeros((seqs.Count, word_count));
for (int i = 0; i < seqs.Count; i++)
{
var seq = seqs[i];
if (seq == null || seq.Count == 0)
continue;
var counts = new Dictionary<int, int>();
var seq_length = seq.Count;
foreach (var j in seq)
{
if (j >= word_count)
continue;
var count = 0;
counts.TryGetValue(j, out count);
counts[j] = count + 1;
}
if (mode == "count")
{
foreach (var kv in counts)
{
var j = kv.Key;
var c = kv.Value + 0.0;
x[i, j] = c;
}
}
else if (mode == "freq")
{
foreach (var kv in counts)
{
var j = kv.Key;
var c = kv.Value + 0.0;
x[i, j] = ((double)c) / seq_length;
}
}
else if (mode == "binary")
{
foreach (var kv in counts)
{
var j = kv.Key;
// var c = kv.Value + 0.0;
x[i, j] = 1.0;
}
}
else if (mode == "tfidf")
{
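// Keras-style TF-IDF weighting: tf = 1 + log(count in this sequence);
// idf = log(1 + document_count / (1 + number of documents containing the token)).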
foreach (var kv in counts)
{
var j = kv.Key;
var c = kv.Value + 0.0;
var id = 0;
var _ = index_docs.TryGetValue(j, out id);
var tf = 1.0 + np.log(c);
var idf = np.log(1.0 + (double)document_count / (1 + id)); // floating-point division; integer division would floor the ratio
x[i, j] = tf * idf;
}
}
}
return x;
}
private static readonly string[] modes = new string[] { "binary", "count", "tfidf", "freq" };
}
}