From 13be215f7307e38a3405da4d2672a02fd356316f Mon Sep 17 00:00:00 2001 From: Niklas Gustafsson Date: Fri, 29 Jan 2021 16:05:10 -0800 Subject: [PATCH] Cleaned up defaulting the string analyzer in Tokenizer. --- src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs | 6 +++++- src/TensorFlowNET.Keras/TextApi.cs | 9 +-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs b/src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs index 3bf14ce5..29cbec8e 100644 --- a/src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs +++ b/src/TensorFlowNET.Keras/Preprocessings/Tokenizer.cs @@ -16,6 +16,10 @@ namespace Tensorflow.Keras.Text /// (each integer being the index of a token in a dictionary) or into a vector where the coefficient for /// each token could be binary, based on word count, based on tf-idf... /// </summary> + /// <remarks> + /// This code is a fairly straight port of the Python code for Keras text preprocessing found at: + /// https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py + /// </remarks> public class Tokenizer { private readonly int num_words; @@ -51,7 +55,7 @@ namespace Tensorflow.Keras.Text this.split = split; this.char_level = char_level; this.oov_token = oov_token; - this.analyzer = analyzer; + this.analyzer = analyzer != null ? 
analyzer : (text) => TextApi.text_to_word_sequence(text, filters, lower, split); } /// <summary> diff --git a/src/TensorFlowNET.Keras/TextApi.cs b/src/TensorFlowNET.Keras/TextApi.cs index 2e62e25b..8ce8d685 100644 --- a/src/TensorFlowNET.Keras/TextApi.cs +++ b/src/TensorFlowNET.Keras/TextApi.cs @@ -17,14 +17,7 @@ namespace Tensorflow.Keras string oov_token = null, Func<string, IEnumerable<string>> analyzer = null) { - if (analyzer != null) - { - return new Keras.Text.Tokenizer(num_words, filters, lower, split, char_level, oov_token, analyzer); - } - else - { - return new Keras.Text.Tokenizer(num_words, filters, lower, split, char_level, oov_token, (text) => text_to_word_sequence(text, filters, lower, split)); - } + return new Keras.Text.Tokenizer(num_words, filters, lower, split, char_level, oov_token, analyzer); } public static IEnumerable<string> text_to_word_sequence(string text, string filters = DefaultFilter, bool lower = true, char split = ' ')