| @@ -67,8 +67,26 @@ namespace Tensorflow | |||||
| string name = null, string @uint = "BYTE") | string name = null, string @uint = "BYTE") | ||||
| => ops.substr(input, pos, len, @uint: @uint, name: name); | => ops.substr(input, pos, len, @uint: @uint, name: name); | ||||
/// <summary>
/// Returns the string lengths of `input`.
/// </summary>
/// <param name="input">Tensor whose string lengths are requested.</param>
/// <param name="name">Optional name forwarded to the underlying op.</param>
/// <param name="unit">Counting unit forwarded to the underlying op; defaults to "BYTE".</param>
/// <returns>The tensor produced by <c>ops.string_length</c>.</returns>
public Tensor string_length(Tensor input, string name = null, string unit = "BYTE")
{
    // Thin pass-through: all work happens in the ops-level implementation.
    return ops.string_length(input, name: name, unit: unit);
}
/// <summary>
/// Splits the strings in `input` by delegating to <c>ops.string_split_v2</c>.
/// </summary>
/// <param name="input">Tensor to split.</param>
/// <param name="sep">Separator forwarded to the op; defaults to the empty string.</param>
/// <param name="maxsplit">Split limit forwarded to the op; defaults to -1.</param>
/// <param name="name">Optional name forwarded to the op.</param>
/// <returns>The ragged tensor returned by <c>ops.string_split_v2</c>.</returns>
public RaggedTensor split(Tensor input, string sep = "", int maxsplit = -1, string name = null)
{
    return ops.string_split_v2(input, sep: sep, maxsplit: maxsplit, name: name);
}
/// <summary>
/// Decodes the strings in `input` and returns the result of
/// <c>ops.unicode_decode_with_offsets</c> (a pair of ragged tensors).
/// </summary>
/// <param name="input">Tensor of strings to decode.</param>
/// <param name="input_encoding">Encoding name forwarded to the op (e.g. "UTF-8").</param>
/// <param name="errors">Error-handling mode forwarded to the op; defaults to "replace".</param>
/// <param name="replacement_char">Replacement codepoint forwarded to the op; defaults to 0xFFFD.</param>
/// <param name="replace_control_characters">Flag forwarded to the op; defaults to false.</param>
/// <param name="name">Optional name forwarded to the op.</param>
/// <returns>The tuple returned by the ops-level implementation.</returns>
public (RaggedTensor, RaggedTensor) unicode_decode_with_offsets(Tensor input, string input_encoding,
    string errors = "replace", int replacement_char = 0xFFFD,
    bool replace_control_characters = false, string name = null)
{
    return ops.unicode_decode_with_offsets(
        input,
        input_encoding,
        errors,
        replacement_char: replacement_char,
        replace_control_characters: replace_control_characters,
        name: name);
}
| } | } | ||||
| } | } | ||||
| } | } | ||||
| @@ -44,6 +44,22 @@ namespace Tensorflow | |||||
| => tf.Context.ExecuteOp("Substr", name, new ExecuteOpArgs(input, pos, len) | => tf.Context.ExecuteOp("Substr", name, new ExecuteOpArgs(input, pos, len) | ||||
| .SetAttributes(new { unit = @uint })); | .SetAttributes(new { unit = @uint })); | ||||
/// <summary>
/// Computes the length of each string given in the input tensor by
/// executing the "StringLength" op.
/// </summary>
/// <param name="input">Tensor passed as the op's only input.</param>
/// <param name="name">Optional op name.</param>
/// <param name="unit">Value for the op's "unit" attribute; defaults to "BYTE".</param>
/// <returns>The result of the "StringLength" op.</returns>
public Tensor string_length(Tensor input, string name = null, string unit = "BYTE")
{
    // Describe the op call first: one input, the "unit" attribute, and a
    // callback that reads "unit" back from the op for gradient bookkeeping.
    var op_args = new ExecuteOpArgs(input)
    {
        GetGradientAttrs = op => new
        {
            unit = op.get_attr<string>("unit")
        }
    }.SetAttributes(new { unit });

    return tf.Context.ExecuteOp("StringLength", name, op_args);
}
| public RaggedTensor string_split_v2(Tensor input, string sep = "", int maxsplit = -1, string name = null) | public RaggedTensor string_split_v2(Tensor input, string sep = "", int maxsplit = -1, string name = null) | ||||
| { | { | ||||
| return tf_with(ops.name_scope(name, "StringSplit"), scope => | return tf_with(ops.name_scope(name, "StringSplit"), scope => | ||||
| @@ -69,5 +85,49 @@ namespace Tensorflow | |||||
| validate: false); | validate: false); | ||||
| }); | }); | ||||
| } | } | ||||
/// <summary>
/// Decodes `input` inside a "UnicodeDecodeWithOffsets" name scope and returns
/// the (codepoints, byte start offsets) pair produced by <c>_unicode_decode</c>
/// with offsets enabled.
/// </summary>
public (RaggedTensor, RaggedTensor) unicode_decode_with_offsets(Tensor input, string input_encoding, string errors,
    int replacement_char = 0xFFFD, bool replace_control_characters = false, string name = null)
{
    var name_scope = ops.name_scope(name, "UnicodeDecodeWithOffsets");

    // The helper already yields the tuple in (codepoints, offsets) order,
    // so it is returned as-is.
    return tf_with(name_scope, scope => _unicode_decode(
        input,
        input_encoding,
        errors,
        replacement_char,
        replace_control_characters,
        with_offsets: true,
        name: name));
}
/// <summary>
/// Shared implementation for unicode decoding with or without byte offsets.
/// </summary>
/// <param name="input">Tensor passed as the op's only input.</param>
/// <param name="input_encoding">Value for the op's "input_encoding" attribute.</param>
/// <param name="errors">Value for the op's "errors" attribute.</param>
/// <param name="replacement_char">Value for the op's "replacement_char" attribute.</param>
/// <param name="replace_control_characters">Value for the op's "replace_control_characters" attribute.</param>
/// <param name="with_offsets">
/// When true the "UnicodeDecodeWithOffsets" op is executed and offsets are returned;
/// when false the "UnicodeDecode" op is executed and the offsets item is null.
/// </param>
/// <param name="name">Optional op name.</param>
/// <returns>
/// (codepoints, offsets). Previously the non-offset path returned (null, null),
/// which only surfaced later as a null dereference in the caller.
/// </returns>
(RaggedTensor, RaggedTensor) _unicode_decode(Tensor input, string input_encoding, string errors, int replacement_char,
    bool replace_control_characters, bool with_offsets, string name = null)
{
    // Both ops take the same input and attributes; the offsets variant
    // additionally emits the byte-start output consumed below.
    var op_name = with_offsets ? "UnicodeDecodeWithOffsets" : "UnicodeDecode";

    var flat_result = tf.Context.ExecuteOp(op_name, name, new ExecuteOpArgs(input)
    {
        GetGradientAttrs = op => new
        {
            input_encoding = op.get_attr<string>("input_encoding"),
            errors = op.get_attr<string>("errors"),
            replacement_char = op.get_attr<int>("replacement_char"),
            replace_control_characters = op.get_attr<bool>("replace_control_characters"),
            Tsplits = op.get_attr<TF_DataType>("Tsplits")
        }
    }.SetAttributes(new
    {
        input_encoding,
        errors,
        replacement_char,
        replace_control_characters
    }));

    // Output 0 is used as the row splits, output 1 as the decoded values and
    // (offsets variant only) output 2 as the per-character byte starts.
    var codepoints = RaggedTensor.from_row_splits(flat_result[1], flat_result[0], validate: false);
    var offsets = with_offsets
        ? RaggedTensor.from_row_splits(flat_result[2], flat_result[0], validate: false)
        : null;

    return (codepoints, offsets);
}
| } | } | ||||
| } | } | ||||
| @@ -20,6 +20,7 @@ using System.Text; | |||||
| using System.Linq; | using System.Linq; | ||||
| using Tensorflow.Framework; | using Tensorflow.Framework; | ||||
| using static Tensorflow.Binding; | using static Tensorflow.Binding; | ||||
| using NumSharp; | |||||
| namespace Tensorflow | namespace Tensorflow | ||||
| { | { | ||||
| @@ -30,6 +31,8 @@ namespace Tensorflow | |||||
| { | { | ||||
| Tensor _values; | Tensor _values; | ||||
| RowPartition _row_partition; | RowPartition _row_partition; | ||||
// Row splits of this ragged tensor, read from its row partition.
Tensor _row_splits
{
    get { return _row_partition.row_splits; }
}
| public TF_DataType dtype => _values.dtype; | public TF_DataType dtype => _values.dtype; | ||||
| public TensorShape shape | public TensorShape shape | ||||
| { | { | ||||
| @@ -41,6 +44,28 @@ namespace Tensorflow | |||||
| } | } | ||||
| } | } | ||||
/// <summary>
/// Slices this ragged tensor inside a "RaggedGetItem" name scope.
/// </summary>
/// <param name="slices">
/// Slice keys; the first addresses rows and the rest are forwarded as inner keys.
/// A null or empty key list returns this tensor unchanged.
/// </param>
/// <returns>The result of <c>_ragged_getitem_inner_dimensions</c>.</returns>
public RaggedTensor this[params Slice[] slices]
{
    get
    {
        // Nothing to slice: avoid failing on a null or empty key list.
        if (slices == null || slices.Length == 0)
            return this;

        // NOTE: the row key (slices[0]) is not applied yet; only the keys
        // after it are forwarded to the inner-dimension helper.
        var inner_keys = slices.Skip(1).ToArray();
        var args = tensor_util.ParseSlices(slices);

        return tf_with(ops.name_scope(null, "RaggedGetItem", args),
            scope => _ragged_getitem_inner_dimensions(this, inner_keys));
    }
}
// Placeholder for inner-dimension slicing: `slices` is currently ignored
// and the input ragged tensor is handed back untouched.
RaggedTensor _ragged_getitem_inner_dimensions(RaggedTensor input, Slice[] slices)
    => input;
| public RaggedTensor(Tensor values, | public RaggedTensor(Tensor values, | ||||
| bool @internal = true, | bool @internal = true, | ||||
| RowPartition row_partition = null) | RowPartition row_partition = null) | ||||
| @@ -75,13 +100,44 @@ namespace Tensorflow | |||||
| }); | }); | ||||
| } | } | ||||
/// <summary>
/// Builds a ragged tensor from `values` and `row_splits` inside a
/// "RaggedFromRowSplits" name scope.
/// </summary>
/// <param name="values">Values tensor passed to <c>from_row_partition</c>.</param>
/// <param name="row_splits">Row splits used to create the row partition.</param>
/// <param name="name">Optional name for the scope.</param>
/// <param name="validate">Forwarded to both the partition factory and <c>from_row_partition</c>.</param>
/// <returns>The ragged tensor returned by <c>from_row_partition</c>.</returns>
public static RaggedTensor from_row_splits(Tensor values, Tensor row_splits,
    string name = null, bool validate = true)
    => tf_with(ops.name_scope(name, "RaggedFromRowSplits"), scope =>
        from_row_partition(
            values,
            RowPartition.from_row_splits(row_splits, validate: validate),
            validate: validate));
/// <summary>
/// Executes the "RaggedTensorToVariant" op on this tensor's nested row splits
/// and flat values, inside a "RaggedToVariant" name scope.
/// </summary>
/// <param name="batched_input">Value for the op's "batched_input" attribute.</param>
/// <param name="name">Optional name used for both the scope and the op.</param>
/// <returns>The result of the op.</returns>
Tensor _to_variant(bool batched_input = false, string name = null)
{
    return tf_with(ops.name_scope(name, "RaggedToVariant"), scope =>
    {
        // Inputs are the row splits followed by the values; the gradient
        // callback reads the op's attributes back from the executed op.
        var op_args = new ExecuteOpArgs(nested_row_splits, flat_values)
        {
            GetGradientAttrs = op => new
            {
                RAGGED_RANK = op.get_attr<int>("RAGGED_RANK"),
                Tvalues = op.get_attr<TF_DataType>("Tvalues"),
                Tsplits = op.get_attr<TF_DataType>("Tsplits"),
                batched_input = op.get_attr<bool>("batched_input")
            }
        }.SetAttributes(new { batched_input });

        return tf.Context.ExecuteOp("RaggedTensorToVariant", name, op_args);
    });
}
// The underlying values tensor.
Tensor flat_values
{
    get { return _values; }
}

// Row splits wrapped in a single-element array.
Tensor[] nested_row_splits
{
    get { return new[] { _row_splits }; }
}
/// <summary>
/// Formats the shape followed by at most the first ten string entries of the values.
/// </summary>
public override string ToString()
{
    // Read the shape before the preview so evaluation order matches the
    // interpolated string this replaces.
    var dims = shape;
    var preview = string.Join(", ", _values.StringData().Take(10));
    return $"tf.RaggedTensor: shape={dims} [{preview}]";
}
/// <summary>
/// Implicitly converts a ragged tensor to a <c>Tensor</c> by encoding it
/// through <c>_to_variant</c>.
/// </summary>
/// <param name="indexedSlices">The ragged tensor to convert.</param>
// Only one body may exist: the earlier block body returning `_values` was
// superseded by the variant encoding and is dropped here.
public static implicit operator Tensor(RaggedTensor indexedSlices)
    => indexedSlices._to_variant();
| public static implicit operator RaggedTensor(Tensor tensor) | public static implicit operator RaggedTensor(Tensor tensor) | ||||
| { | { | ||||
| @@ -28,6 +28,7 @@ namespace Tensorflow | |||||
| public class RowPartition : CompositeTensor | public class RowPartition : CompositeTensor | ||||
| { | { | ||||
| Tensor _row_splits; | Tensor _row_splits; | ||||
/// <summary>Read-only access to the stored row splits tensor.</summary>
public Tensor row_splits
{
    get { return _row_splits; }
}
| Tensor _row_lengths; | Tensor _row_lengths; | ||||
| Tensor _value_rowids; | Tensor _value_rowids; | ||||
| Tensor _nrows; | Tensor _nrows; | ||||
| @@ -89,5 +90,14 @@ namespace Tensorflow | |||||
| nrows: nrows); | nrows: nrows); | ||||
| }); | }); | ||||
| } | } | ||||
/// <summary>
/// Creates a row partition from `row_splits` inside a
/// "RowPartitionFromRowSplits" name scope.
/// </summary>
/// <param name="row_splits">Tensor handed to the <c>RowPartition</c> constructor.</param>
/// <param name="validate">Accepted but not used by this implementation.</param>
/// <param name="preferred_dtype">Accepted but not used by this implementation.</param>
/// <returns>A new <c>RowPartition</c>.</returns>
public static RowPartition from_row_splits(Tensor row_splits,
    bool validate = true, TF_DataType preferred_dtype = TF_DataType.DtInvalid)
    => tf_with(ops.name_scope(null, "RowPartitionFromRowSplits"),
        scope => new RowPartition(row_splits));
| } | } | ||||
| } | } | ||||
| @@ -55,10 +55,9 @@ namespace Tensorflow.Keras.Layers | |||||
| if (inputs.shape.ndim > 1) | if (inputs.shape.ndim > 1) | ||||
| input_tensor = array_ops.squeeze(inputs, axis: new[] { -1 }); | input_tensor = array_ops.squeeze(inputs, axis: new[] { -1 }); | ||||
| if (args.Split == "whitespace") | if (args.Split == "whitespace") | ||||
| input_tensor = tf.strings.split(inputs); | |||||
| input_tensor = tf.strings.split(input_tensor); | |||||
| } | } | ||||
| return inputs; | |||||
| return input_tensor; | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| @@ -1,6 +1,8 @@ | |||||
| using System; | |||||
| using NumSharp; | |||||
| using System; | |||||
| using System.Collections.Generic; | using System.Collections.Generic; | ||||
| using System.Text; | using System.Text; | ||||
| using static Tensorflow.Binding; | |||||
| namespace Tensorflow.Text.Tokenizers | namespace Tensorflow.Text.Tokenizers | ||||
| { | { | ||||
| @@ -13,7 +15,31 @@ namespace Tensorflow.Text.Tokenizers | |||||
| /// <returns></returns> | /// <returns></returns> | ||||
public Tensor tokenize(Tensor input)
{
    // Work in progress: the offsets-based pipeline is invoked, but no result
    // is produced yet, so the call always ends in NotImplementedException.
    tokenize_with_offsets(input);

    throw new NotImplementedException("");
}
// Runs the encode/decode wrapper inside a "WhitespaceTokenize" name scope.
// The wrapper's result is discarded and the method always throws
// NotImplementedException afterwards.
Tensor[] tokenize_with_offsets(Tensor input)
{
    var name_scope = ops.name_scope(null, "WhitespaceTokenize");

    tf_with(name_scope, scope =>
    {
        _whitespace_tokenize_with_offsets_encode_decode_wrapper(input);
    });

    throw new NotImplementedException("");
}
// Decodes the input as UTF-8 and derives byte end offsets from the start
// offsets and the string lengths. The derived tensors are not used yet; the
// input tensor is returned unchanged.
Tensor _whitespace_tokenize_with_offsets_encode_decode_wrapper(Tensor input_tensor)
{
    // Decode the strings and get byte offsets
    var (codepoints, byte_start_offsets) = tf.strings.unicode_decode_with_offsets(input_tensor, "UTF-8");

    // Start offsets sliced from index 1 on the second axis.
    Tensor shifted_starts = byte_start_offsets[Slice.All, new Slice(1)];

    // String lengths, expanded on axis 1 and cast to int64.
    var lengths = array_ops.expand_dims(tf.strings.string_length(input_tensor), 1);
    var lengths_int64 = math_ops.cast(lengths, dtypes.int64);

    var byte_end_offsets = array_ops.concat(new Tensor[] { shifted_starts, lengths_int64 }, 1);

    return input_tensor;
}
| } | } | ||||
| } | } | ||||
| @@ -10,10 +10,12 @@ namespace TensorFlowNET.UnitTest.Text | |||||
| [TestClass] | [TestClass] | ||||
| public class TokenizerTest | public class TokenizerTest | ||||
| { | { | ||||
// Ignored for now: WhitespaceTokenizer.tokenize still ends in
// NotImplementedException, so the test cannot pass yet.
[TestMethod, Ignore]
public void Tokenize()
{
    var docs = tf.constant(new[] { "Everything not saved will be lost." });
    var tokenizer = text.WhitespaceTokenizer();
    var tokens = tokenizer.tokenize(docs);
}
| } | } | ||||
| } | } | ||||