test_from_dataset.py

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Testing from_dataset in mindspore.dataset
"""
import numpy as np
import mindspore.dataset as ds
import mindspore.dataset.text as text


def test_demo_basic_from_dataset():
    """ this is a tutorial on how from_dataset should be used in a normal use case"""
    data = ds.TextFileDataset("../data/dataset/testVocab/words.txt", shuffle=False)
    vocab = text.Vocab.from_dataset(data, "text", freq_range=None, top_k=None)
    data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
    res = []
    for d in data.create_dict_iterator():
        res.append(d["text"].item())
    assert res == [4, 5, 3, 6, 7, 2]
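

# For comparison, a vocabulary can also be built without scanning a dataset.
# A minimal sketch, assuming text.Vocab.from_list accepts a plain list of words
# and assigns ids in list order; the word list and function name here are
# hypothetical illustrations, not part of the test data:
def demo_vocab_from_list_sketch():
    """ build a fixed vocab manually and look up words drawn from the same list """
    words = ["home", "is", "behind", "the", "world", "ahead"]  # hypothetical words

    def gen_words():
        for w in words:
            yield (np.array(w, dtype='S'),)

    vocab = text.Vocab.from_list(words)
    data = ds.GeneratorDataset(gen_words, column_names=["text"])
    data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
    return [d["text"].item() for d in data.create_dict_iterator()]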


def test_demo_basic_from_dataset_with_tokenizer():
    """ this is a tutorial on how from_dataset should be used in a normal use case with tokenizer"""
    data = ds.TextFileDataset("../data/dataset/testTokenizerData/1.txt", shuffle=False)
    data = data.map(input_columns=["text"], operations=text.UnicodeCharTokenizer())
    vocab = text.Vocab.from_dataset(data, None, freq_range=None, top_k=None)
    data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
    res = []
    for d in data.create_dict_iterator():
        res.append(list(d["text"]))
    assert res == [[13, 3, 7, 14, 9, 17, 3, 2, 19, 9, 2, 11, 3, 4, 16, 4, 8, 6, 5], [21, 20, 10, 25, 23, 26],
                   [24, 22, 10, 12, 8, 6, 7, 4, 18, 15, 5], [2, 2]]
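

# UnicodeCharTokenizer splits each line into single unicode characters before the
# vocab is built, so the ids asserted above are per-character rather than per-word.
# A rough pure-Python illustration of that tokenization (assumption: every
# character is kept as a token, whitespace included):
def char_tokenize_sketch(sentence):
    """ e.g. 'ab c' -> ['a', 'b', ' ', 'c'] """
    return list(sentence)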


def test_from_dataset():
    """ test build vocab with generator dataset """

    def gen_corpus():
        # key: word, value: number of occurrences; letters are used so that their ordering is apparent
        corpus = {"Z": 4, "Y": 4, "X": 4, "W": 3, "U": 3, "V": 2, "T": 1}
        for k, v in corpus.items():
            yield (np.array([k] * v, dtype='S'),)

    def test_config(freq_range, top_k):
        corpus_dataset = ds.GeneratorDataset(gen_corpus, column_names=["text"])
        vocab = text.Vocab.from_dataset(corpus_dataset, None, freq_range, top_k)
        corpus_dataset = corpus_dataset.map(input_columns="text", operations=text.Lookup(vocab))
        res = []
        for d in corpus_dataset.create_dict_iterator():
            res.append(list(d["text"]))
        return res

    # take words whose frequency is within [3, 4]; words with the same frequency are ordered alphabetically
    test1_res = test_config(freq_range=(3, 4), top_k=4)
    assert test1_res == [[4, 4, 4, 4], [3, 3, 3, 3], [2, 2, 2, 2], [1, 1, 1], [5, 5, 5], [1, 1], [1]], str(test1_res)
    # test words with frequency range [2, inf); only the last word will be filtered out
    test2_res = test_config((2, None), None)
    assert test2_res == [[4, 4, 4, 4], [3, 3, 3, 3], [2, 2, 2, 2], [6, 6, 6], [5, 5, 5], [7, 7], [1]], str(test2_res)
    # test filtering by top_k only
    test3_res = test_config(None, 4)
    assert test3_res == [[4, 4, 4, 4], [3, 3, 3, 3], [2, 2, 2, 2], [1, 1, 1], [5, 5, 5], [1, 1], [1]], str(test3_res)
    # test filtering out the most frequent words
    test4_res = test_config((None, 3), 100)
    assert test4_res == [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [3, 3, 3], [2, 2, 2], [4, 4], [5]], str(test4_res)
    # test top_k == 1
    test5_res = test_config(None, 1)
    assert test5_res == [[1, 1, 1, 1], [1, 1, 1, 1], [2, 2, 2, 2], [1, 1, 1], [1, 1, 1], [1, 1], [1]], str(test5_res)
    # test min_frequency == max_frequency
    test6_res = test_config((4, 4), None)
    assert test6_res == [[4, 4, 4, 4], [3, 3, 3, 3], [2, 2, 2, 2], [1, 1, 1], [1, 1, 1], [1, 1], [1]], str(test6_res)
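

# The behaviour the asserts above pin down can be mimicked in pure Python. A
# minimal sketch of the assumed from_dataset semantics: count word frequencies,
# keep words whose frequency falls inside freq_range, order by descending
# frequency with alphabetical order breaking ties, then truncate to top_k.
# Illustration only; judging by the asserts, the real vocab also reserves low
# ids (lookups start at 2 and out-of-vocabulary words map to 1):
def build_vocab_sketch(words, freq_range=None, top_k=None):
    from collections import Counter
    low, high = freq_range if freq_range is not None else (None, None)
    counts = Counter(words)
    kept = [(w, c) for w, c in counts.items()
            if (low is None or c >= low) and (high is None or c <= high)]
    kept.sort(key=lambda wc: (-wc[1], wc[0]))  # most frequent first, ties alphabetical
    if top_k is not None:
        kept = kept[:top_k]
    return [w for w, _ in kept]  # position in this list determines the word id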


def test_from_dataset_exceptions():
    """ test various exceptions that are checked in the validator """

    def test_config(columns, freq_range, top_k, s):
        try:
            data = ds.TextFileDataset("../data/dataset/testVocab/words.txt", shuffle=False)
            vocab = text.Vocab.from_dataset(data, columns, freq_range, top_k)
            assert isinstance(vocab, text.Vocab)
        except ValueError as e:
            assert s in str(e), str(e)

    test_config("text", (), 1, "freq_range needs to be either None or a tuple of 2 integers")
    test_config("text", (2, 3), 1.2345, "top_k needs to be a positive integer")
    test_config(23, (2, 3), 1.2345, "columns need to be a list of strings")
    test_config("text", (100, 1), 12, "frequency range [a,b] should be 0 <= a <= b")
    test_config("text", (2, 3), 0, "top_k needs to be a positive integer")
    test_config([123], (2, 3), 0, "columns need to be a list of strings")
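

# The messages asserted above outline what the validator checks; a minimal
# sketch of equivalent checks, reconstructed from those error strings as an
# assumption, not the actual MindSpore validator:
def validate_from_dataset_args_sketch(columns, freq_range, top_k):
    if freq_range is not None:
        if not isinstance(freq_range, tuple) or len(freq_range) != 2:
            raise ValueError("freq_range needs to be either None or a tuple of 2 integers")
        low, high = freq_range
        if low is not None and high is not None and not 0 <= low <= high:
            raise ValueError("frequency range [a,b] should be 0 <= a <= b")
    if columns is not None:
        col_list = [columns] if isinstance(columns, str) else columns
        if not isinstance(col_list, list) or not all(isinstance(c, str) for c in col_list):
            raise ValueError("columns need to be a list of strings")
    if top_k is not None and (not isinstance(top_k, int) or top_k <= 0):
        raise ValueError("top_k needs to be a positive integer")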


if __name__ == '__main__':
    test_demo_basic_from_dataset()
    test_from_dataset()
    test_from_dataset_exceptions()
    test_demo_basic_from_dataset_with_tokenizer()