@@ -67,8 +67,26 @@ namespace Tensorflow
            string name = null, string @uint = "BYTE")
            => ops.substr(input, pos, len, @uint: @uint, name: name);

        /// <summary>
        /// String lengths of `input`.
        /// </summary>
        /// <param name="input">A string `Tensor`.</param>
        /// <param name="name">A name for the operation (optional).</param>
        /// <param name="unit">The unit to count lengths in: "BYTE" or "UTF8_CHAR".</param>
        /// <returns>An int32 `Tensor` with the same shape as `input`.</returns>
        public Tensor string_length(Tensor input, string name = null, string unit = "BYTE")
            => ops.string_length(input, name: name, unit: unit);
        public RaggedTensor split(Tensor input, string sep = "", int maxsplit = -1, string name = null)
            => ops.string_split_v2(input, sep: sep, maxsplit: maxsplit, name: name);

        public (RaggedTensor, RaggedTensor) unicode_decode_with_offsets(Tensor input, string input_encoding,
            string errors = "replace", int replacement_char = 0xFFFD,
            bool replace_control_characters = false, string name = null)
            => ops.unicode_decode_with_offsets(input, input_encoding, errors,
                replacement_char: replacement_char,
                replace_control_characters: replace_control_characters,
                name: name);
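        // Illustrative usage of the new string APIs (a sketch with assumed example
        // values, not part of this change):
        //
        //   var s = tf.constant(new[] { "hello world" });
        //   var len = tf.strings.string_length(s);            // [11]  (length in bytes)
        //   var words = tf.strings.split(s, sep: " ");         // ragged: [["hello", "world"]]
        //   var (cp, starts) = tf.strings.unicode_decode_with_offsets(s, "UTF-8");
        //   // cp: code points per string; starts: byte offset of each code point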
        }
    }
}
@@ -44,6 +44,22 @@ namespace Tensorflow
            => tf.Context.ExecuteOp("Substr", name, new ExecuteOpArgs(input, pos, len)
                .SetAttributes(new { unit = @uint }));

        /// <summary>
        /// Computes the length of each string given in the input tensor.
        /// </summary>
        /// <param name="input">A string `Tensor`.</param>
        /// <param name="name">A name for the operation (optional).</param>
        /// <param name="unit">Either "BYTE" (count bytes) or "UTF8_CHAR" (count UTF-8 code points).</param>
        /// <returns>An int32 `Tensor` of string lengths.</returns>
        public Tensor string_length(Tensor input, string name = null, string unit = "BYTE")
            => tf.Context.ExecuteOp("StringLength", name, new ExecuteOpArgs(input)
            {
                GetGradientAttrs = op => new
                {
                    unit = op.get_attr<string>("unit")
                }
            }.SetAttributes(new { unit }));
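        // For example (illustrative, assumed eager values):
        //   tf.strings.string_length(tf.constant("héllo"))                      // 6 (bytes; "é" is two bytes in UTF-8)
        //   tf.strings.string_length(tf.constant("héllo"), unit: "UTF8_CHAR")   // 5 (UTF-8 code points)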
        public RaggedTensor string_split_v2(Tensor input, string sep = "", int maxsplit = -1, string name = null)
        {
            return tf_with(ops.name_scope(name, "StringSplit"), scope =>
            {
@@ -69,5 +85,49 @@ namespace Tensorflow
                    validate: false);
            });
        }
        public (RaggedTensor, RaggedTensor) unicode_decode_with_offsets(Tensor input, string input_encoding, string errors,
            int replacement_char = 0xFFFD, bool replace_control_characters = false, string name = null)
        {
            return tf_with(ops.name_scope(name, "UnicodeDecodeWithOffsets"), scope =>
            {
                var (codepoints, byte_start_offsets) = _unicode_decode(input, input_encoding, errors,
                    replacement_char, replace_control_characters,
                    with_offsets: true, name: name);
                return (codepoints, byte_start_offsets);
            });
        }
        (RaggedTensor, RaggedTensor) _unicode_decode(Tensor input, string input_encoding, string errors, int replacement_char,
            bool replace_control_characters, bool with_offsets, string name = null)
        {
            if (with_offsets)
            {
                var flat_result = tf.Context.ExecuteOp("UnicodeDecodeWithOffsets", name, new ExecuteOpArgs(input)
                {
                    GetGradientAttrs = op => new
                    {
                        input_encoding = op.get_attr<string>("input_encoding"),
                        errors = op.get_attr<string>("errors"),
                        replacement_char = op.get_attr<int>("replacement_char"),
                        replace_control_characters = op.get_attr<bool>("replace_control_characters"),
                        Tsplits = op.get_attr<TF_DataType>("Tsplits")
                    }
                }.SetAttributes(new
                {
                    input_encoding,
                    errors,
                    replacement_char,
                    replace_control_characters
                }));

                var codepoints = RaggedTensor.from_row_splits(flat_result[1], flat_result[0], validate: false);
                var offsets = RaggedTensor.from_row_splits(flat_result[2], flat_result[0], validate: false);
                return (codepoints, offsets);
            }
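            // Only the with_offsets path is used by unicode_decode_with_offsets;
            // plain decoding (with_offsets == false) is not implemented here yet.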
            return (null, null);
        }
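        // Illustrative behaviour (assumed example values, eager mode):
        //   var (cp, starts) = tf.strings.unicode_decode_with_offsets(tf.constant(new[] { "héllo" }), "UTF-8");
        //   // cp     -> [[104, 233, 108, 108, 111]]  (Unicode code points)
        //   // starts -> [[0, 1, 3, 4, 5]]            (byte offset where each code point begins)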
    }
}
@@ -20,6 +20,7 @@ using System.Text;
using System.Linq;
using Tensorflow.Framework;
using static Tensorflow.Binding;
using NumSharp;

namespace Tensorflow
{
@@ -30,6 +31,8 @@ namespace Tensorflow
    {
        Tensor _values;
        RowPartition _row_partition;
        Tensor _row_splits => _row_partition.row_splits;

        public TF_DataType dtype => _values.dtype;

        public TensorShape shape
        {
@@ -41,6 +44,28 @@ namespace Tensorflow
            }
        }
        public RaggedTensor this[params Slice[] slices]
        {
            get
            {
                var row_key = slices[0];
                var inner_keys = slices.Skip(1).ToArray();

                var args = tensor_util.ParseSlices(slices);
                return tf_with(ops.name_scope(null, "RaggedGetItem", args), scope =>
                {
                    string name = scope;
                    return _ragged_getitem_inner_dimensions(this, inner_keys);
                });
            }
        }

        RaggedTensor _ragged_getitem_inner_dimensions(RaggedTensor input, Slice[] slices)
        {
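            // Placeholder: slicing of the inner ragged dimensions is not implemented yet,
            // so the ragged tensor is returned unchanged.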
            return input;
        }
        public RaggedTensor(Tensor values,
            bool @internal = true,
            RowPartition row_partition = null)
@@ -75,13 +100,44 @@ namespace Tensorflow
            });
        }

        public static RaggedTensor from_row_splits(Tensor values, Tensor row_splits,
            string name = null, bool validate = true)
        {
            return tf_with(ops.name_scope(name, "RaggedFromRowSplits"), scope =>
            {
                var row_partition = RowPartition.from_row_splits(row_splits,
                    validate: validate);
                return from_row_partition(values, row_partition, validate: validate);
            });
        }
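        // For example (assumed values), row_splits partition the flat values into rows:
        //   var values = tf.constant(new[] { 3, 1, 4, 1, 5, 9, 2, 6 });
        //   var splits = tf.constant(new long[] { 0, 4, 4, 7, 8, 8 });
        //   var rt = RaggedTensor.from_row_splits(values, splits);
        //   // rt -> [[3, 1, 4, 1], [], [5, 9, 2], [6], []]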
        Tensor _to_variant(bool batched_input = false, string name = null)
            => tf_with(ops.name_scope(name, "RaggedToVariant"), scope =>
            {
                return tf.Context.ExecuteOp("RaggedTensorToVariant", name,
                    new ExecuteOpArgs(nested_row_splits, flat_values)
                    {
                        GetGradientAttrs = op => new
                        {
                            RAGGED_RANK = op.get_attr<int>("RAGGED_RANK"),
                            Tvalues = op.get_attr<TF_DataType>("Tvalues"),
                            Tsplits = op.get_attr<TF_DataType>("Tsplits"),
                            batched_input = op.get_attr<bool>("batched_input")
                        }
                    }.SetAttributes(new { batched_input }));
            });

        Tensor flat_values
            => _values;

        Tensor[] nested_row_splits
            => new[] { _row_splits };
        public override string ToString()
            => $"tf.RaggedTensor: shape={shape} [{string.Join(", ", _values.StringData().Take(10))}]";

        public static implicit operator Tensor(RaggedTensor indexedSlices)
            => indexedSlices._to_variant();
        public static implicit operator RaggedTensor(Tensor tensor)
        {
@@ -28,6 +28,7 @@ namespace Tensorflow
    public class RowPartition : CompositeTensor
    {
        Tensor _row_splits;
        public Tensor row_splits => _row_splits;

        Tensor _row_lengths;
        Tensor _value_rowids;
        Tensor _nrows;
@@ -89,5 +90,14 @@ namespace Tensorflow
                nrows: nrows);
            });
        }
        public static RowPartition from_row_splits(Tensor row_splits,
            bool validate = true, TF_DataType preferred_dtype = TF_DataType.DtInvalid)
        {
            return tf_with(ops.name_scope(null, "RowPartitionFromRowSplits"), scope =>
            {
                return new RowPartition(row_splits);
            });
        }
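        // For example (assumed values), row_splits = [0, 4, 4, 7] partitions 7 values into
        // rows of lengths [4, 0, 3]: row i spans values[row_splits[i] .. row_splits[i + 1]).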
    }
}
@@ -55,10 +55,9 @@ namespace Tensorflow.Keras.Layers
                if (inputs.shape.ndim > 1)
                    input_tensor = array_ops.squeeze(inputs, axis: new[] { -1 });

                if (args.Split == "whitespace")
                    input_tensor = tf.strings.split(input_tensor);
            }

            return input_tensor;
        }
    }
}
@@ -1,6 +1,8 @@
using NumSharp;
using System;
using System.Collections.Generic;
using System.Text;
using static Tensorflow.Binding;
namespace Tensorflow.Text.Tokenizers
{
@@ -13,7 +15,31 @@ namespace Tensorflow.Text.Tokenizers
        /// <returns></returns>
        public Tensor tokenize(Tensor input)
        {
            tokenize_with_offsets(input);
            throw new NotImplementedException("");
        }

        Tensor[] tokenize_with_offsets(Tensor input)
        {
            tf_with(ops.name_scope(null, "WhitespaceTokenize"), scope =>
            {
                _whitespace_tokenize_with_offsets_encode_decode_wrapper(input);
            });
            throw new NotImplementedException("");
        }
        Tensor _whitespace_tokenize_with_offsets_encode_decode_wrapper(Tensor input_tensor)
        {
            // Decode the strings and get byte offsets
            var (codepoints, byte_start_offsets) = tf.strings.unicode_decode_with_offsets(input_tensor, "UTF-8");
            var byte_end_offsets = array_ops.concat(new Tensor[]
            {
                byte_start_offsets[Slice.All, new Slice(1)],
                math_ops.cast(
                    array_ops.expand_dims(tf.strings.string_length(input_tensor), 1),
                    dtypes.int64)
            }, 1);
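            // The remaining steps (locating whitespace code points and gathering the token
            // substrings between these offsets) are not implemented yet, so the input
            // tensor is returned unchanged below.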
            return input_tensor;
        }
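        // Illustrative target behaviour once complete (assumed example values):
        //   tokenize_with_offsets(tf.constant(new[] { "hi  there" }))
        //   // tokens        -> [["hi", "there"]]
        //   // start offsets -> [[0, 4]]
        //   // end offsets   -> [[2, 9]]   (exclusive byte offsets)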
    }
}
@@ -10,10 +10,12 @@ namespace TensorFlowNET.UnitTest.Text
    [TestClass]
    public class TokenizerTest
    {
        [TestMethod, Ignore]
        public void Tokenize()
        {
            var docs = tf.constant(new[] { "Everything not saved will be lost." });
            var tokenizer = text.WhitespaceTokenizer();
            var tokens = tokenizer.tokenize(docs);
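            // Expected result once tokenize is implemented (the test is ignored until then):
            //   tokens -> [["Everything", "not", "saved", "will", "be", "lost."]]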
        }
    }
}