diff --git a/src/TensorFlowNET.Keras/Datasets/Imdb.cs b/src/TensorFlowNET.Keras/Datasets/Imdb.cs index 0266b48b..49fc7925 100644 --- a/src/TensorFlowNET.Keras/Datasets/Imdb.cs +++ b/src/TensorFlowNET.Keras/Datasets/Imdb.cs @@ -94,8 +94,6 @@ namespace Tensorflow.Keras.Datasets var fileBytes = File.ReadAllBytes(path); var (x_train, x_test) = LoadX(fileBytes); var (labels_train, labels_test) = LoadY(fileBytes); - x_test.astype(np.int32); - labels_test.astype(np.int32); var indices = np.arange(len(x_train)); np.random.shuffle(indices, seed); @@ -107,67 +105,80 @@ namespace Tensorflow.Keras.Datasets x_test = x_test[indices]; labels_test = labels_test[indices]; + var x_train_array = (int[,])x_train.ToMultiDimArray(); + var x_test_array = (int[,])x_test.ToMultiDimArray(); + var labels_train_array = (long[])labels_train.ToArray(); + var labels_test_array = (long[])labels_test.ToArray(); + if (start_char != null) { - int[,] new_x_train = new int[x_train.shape[0], x_train.shape[1] + 1]; - for (var i = 0; i < x_train.shape[0]; i++) + int[,] new_x_train_array = new int[x_train_array.GetLength(0), x_train_array.GetLength(1) + 1]; + for (var i = 0; i < x_train_array.GetLength(0); i++) { - new_x_train[i, 0] = (int)start_char; - for (var j = 0; j < x_train.shape[1]; j++) + new_x_train_array[i, 0] = (int)start_char; + for (var j = 0; j < x_train_array.GetLength(1); j++) { - new_x_train[i, j + 1] = x_train[i][j]; + if (x_train_array[i, j] == 0) + break; + new_x_train_array[i, j + 1] = x_train_array[i, j]; } } - int[,] new_x_test = new int[x_test.shape[0], x_test.shape[1] + 1]; - for (var i = 0; i < x_test.shape[0]; i++) + int[,] new_x_test_array = new int[x_test_array.GetLength(0), x_test_array.GetLength(1) + 1]; + for (var i = 0; i < x_test_array.GetLength(0); i++) { - new_x_test[i, 0] = (int)start_char; - for (var j = 0; j < x_test.shape[1]; j++) + new_x_test_array[i, 0] = (int)start_char; + for (var j = 0; j < x_test_array.GetLength(1); j++) { - new_x_test[i, j + 1] = 
x_test[i][j]; + if (x_test_array[i, j] == 0) + break; + new_x_test_array[i, j + 1] = x_test_array[i, j]; } } - x_train = new NDArray(new_x_train); - x_test = new NDArray(new_x_test); + x_train_array = new_x_train_array; + x_test_array = new_x_test_array; } else if (index_from != 0) { - for (var i = 0; i < x_train.shape[0]; i++) + for (var i = 0; i < x_train_array.GetLength(0); i++) { - for (var j = 0; j < x_train.shape[1]; j++) + for (var j = 0; j < x_train_array.GetLength(1); j++) { - if (x_train[i, j] != 0) - x_train[i, j] += index_from; + if (x_train_array[i, j] == 0) + break; + x_train_array[i, j] += index_from; } } - for (var i = 0; i < x_test.shape[0]; i++) + for (var i = 0; i < x_test_array.GetLength(0); i++) { - for (var j = 0; j < x_test.shape[1]; j++) + for (var j = 0; j < x_test_array.GetLength(1); j++) { - if (x_test[i, j] != 0) - x_test[i, j] += index_from; + if (x_test_array[i, j] == 0) + break; + x_test_array[i, j] += index_from; } } } - if (maxlen != null) + if (maxlen == null) { - (x_train, labels_train) = data_utils._remove_long_seq((int)maxlen, x_train, labels_train); - (x_test, labels_test) = data_utils._remove_long_seq((int)maxlen, x_test, labels_test); - if (x_train.size == 0 || x_test.size == 0) - throw new ValueError("After filtering for sequences shorter than maxlen=" + - $"{maxlen}, no sequence was kept. Increase maxlen."); + maxlen = max(x_train_array.GetLength(1), x_test_array.GetLength(1)); } + (x_train, labels_train) = data_utils._remove_long_seq((int)maxlen, x_train_array, labels_train_array); + (x_test, labels_test) = data_utils._remove_long_seq((int)maxlen, x_test_array, labels_test_array); + if (x_train.size == 0 || x_test.size == 0) + throw new ValueError("After filtering for sequences shorter than maxlen=" + + $"{maxlen}, no sequence was kept. 
Increase maxlen."); var xs = np.concatenate(new[] { x_train, x_test }); var labels = np.concatenate(new[] { labels_train, labels_test }); + var xs_array = (int[,])xs.ToMultiDimArray(); - if(num_words == null) + if (num_words == null) { num_words = 0; - for (var i = 0; i < xs.shape[0]; i++) - for (var j = 0; j < xs.shape[1]; j++) - num_words = max((int)num_words, (int)xs[i][j]); + for (var i = 0; i < xs_array.GetLength(0); i++) + for (var j = 0; j < xs_array.GetLength(1); j++) + num_words = max((int)num_words, (int)xs_array[i, j]); } // by convention, use 2 as OOV word @@ -175,32 +186,32 @@ namespace Tensorflow.Keras.Datasets // 0 (padding), 1 (start), 2 (OOV) if (oov_char != null) { - int[,] new_xs = new int[xs.shape[0], xs.shape[1]]; - for(var i = 0; i < xs.shape[0]; i++) + int[,] new_xs_array = new int[xs_array.GetLength(0), xs_array.GetLength(1)]; + for (var i = 0; i < xs_array.GetLength(0); i++) { - for(var j = 0; j < xs.shape[1]; j++) + for (var j = 0; j < xs_array.GetLength(1); j++) { - if ((int)xs[i][j] == 0 || skip_top <= (int)xs[i][j] && (int)xs[i][j] < num_words) - new_xs[i, j] = (int)xs[i][j]; + if (xs_array[i, j] == 0 || skip_top <= xs_array[i, j] && xs_array[i, j] < num_words) + new_xs_array[i, j] = xs_array[i, j]; else - new_xs[i, j] = (int)oov_char; + new_xs_array[i, j] = (int)oov_char; } } - xs = new NDArray(new_xs); + xs = new NDArray(new_xs_array); } else { - int[,] new_xs = new int[xs.shape[0], xs.shape[1]]; - for (var i = 0; i < xs.shape[0]; i++) + int[,] new_xs_array = new int[xs_array.GetLength(0), xs_array.GetLength(1)]; + for (var i = 0; i < xs_array.GetLength(0); i++) { int k = 0; - for (var j = 0; j < xs.shape[1]; j++) + for (var j = 0; j < xs_array.GetLength(1); j++) { - if ((int)xs[i][j] == 0 || skip_top <= (int)xs[i][j] && (int)xs[i][j] < num_words) - new_xs[i, k++] = (int)xs[i][j]; + if (xs_array[i, j] == 0 || skip_top <= xs_array[i, j] && xs_array[i, j] < num_words) + new_xs_array[i, k++] = xs_array[i, j]; } } - xs = new 
NDArray(new_xs); + xs = new NDArray(new_xs_array); } var idx = len(x_train); diff --git a/src/TensorFlowNET.Keras/Utils/data_utils.cs b/src/TensorFlowNET.Keras/Utils/data_utils.cs index 16b121b0..57ae7669 100644 --- a/src/TensorFlowNET.Keras/Utils/data_utils.cs +++ b/src/TensorFlowNET.Keras/Utils/data_utils.cs @@ -54,23 +54,25 @@ namespace Tensorflow.Keras.Utils */ List new_seq = new List(); - List new_label = new List(); + List new_label = new List(); - for (var i = 0; i < seq.shape[0]; i++) + var seq_array = (int[,])seq.ToMultiDimArray(); + var label_array = (long[])label.ToArray(); + for (var i = 0; i < seq_array.GetLength(0); i++) { - if (maxlen < seq.shape[1] && seq[i][maxlen] != 0) + if (maxlen < seq_array.GetLength(1) && seq_array[i,maxlen] != 0) continue; int[] sentence = new int[maxlen]; - for (var j = 0; j < maxlen && j < seq.shape[1]; j++) + for (var j = 0; j < maxlen && j < seq_array.GetLength(1); j++) { - sentence[j] = seq[i, j]; + sentence[j] = seq_array[i, j]; } new_seq.Add(sentence); - new_label.Add(label[i]); + new_label.Add(label_array[i]); } int[,] new_seq_array = new int[new_seq.Count, maxlen]; - int[] new_label_array = new int[new_label.Count]; + long[] new_label_array = new long[new_label.Count]; for (var i = 0; i < new_seq.Count; i++) { diff --git a/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs b/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs index 251eeff9..183544ab 100644 --- a/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs +++ b/test/TensorFlowNET.UnitTest/Dataset/DatasetTest.cs @@ -204,7 +204,7 @@ namespace TensorFlowNET.UnitTest.Dataset { var vocab_size = 20000; // Only consider the top 20k words var maxlen = 200; // Only consider the first 200 words of each movie review - var dataset = keras.datasets.imdb.load_data(num_words: vocab_size); + var dataset = keras.datasets.imdb.load_data(num_words: vocab_size, maxlen: maxlen); var x_train = dataset.Train.Item1; var y_train = dataset.Train.Item2; var x_val = dataset.Test.Item1; 
@@ -217,16 +217,17 @@ namespace TensorFlowNET.UnitTest.Dataset } IEnumerable RemoveZeros(NDArray data) { + var data_array = (int[,])data.ToMultiDimArray(); List new_data = new List(); - for (var i = 0; i < data.shape[0]; i++) + for (var i = 0; i < data_array.GetLength(0); i++) { List new_array = new List(); - for (var j = 0; j < data.shape[1]; j++) + for (var j = 0; j < data_array.GetLength(1); j++) { - if (data[i][j] == 0) + if (data_array[i, j] == 0) break; else - new_array.Add((int)data[i][j]); + new_array.Add(data_array[i, j]); } new_data.Add(new_array.ToArray()); }