test_datasets_textfileop.py

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import pytest

import mindspore.dataset as ds
from mindspore import log as logger

from util import config_get_set_num_parallel_workers, config_get_set_seed

DATA_FILE = "../data/dataset/testTextFileDataset/1.txt"
DATA_ALL_FILE = "../data/dataset/testTextFileDataset/*"
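
# Test fixture layout, inferred from the assertions below: the directory holds
# two text files; 1.txt contributes 3 lines and the second file 2 lines, so the
# glob pattern in DATA_ALL_FILE yields 5 rows in total.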


def test_textline_dataset_one_file():
    data = ds.TextFileDataset(DATA_FILE)
    count = 0
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        logger.info("{}".format(i["text"]))
        count += 1
    assert count == 3


def test_textline_dataset_all_file():
    data = ds.TextFileDataset(DATA_ALL_FILE)
    count = 0
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        logger.info("{}".format(i["text"]))
        count += 1
    assert count == 5


def test_textline_dataset_num_samples_none():
    # Do not provide a num_samples argument, so it would be None by default
    data = ds.TextFileDataset(DATA_FILE)
    count = 0
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        logger.info("{}".format(i["text"]))
        count += 1
    assert count == 3
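
# The shuffle=False tests below pin num_parallel_workers because, with several
# workers, rows from the two files are read concurrently and interleave in the
# output, while a single worker reads the files one after the other. The
# expected orders encode this observed behaviour of the fixtures rather than a
# documented API guarantee.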


def test_textline_dataset_shuffle_false4():
    original_num_parallel_workers = config_get_set_num_parallel_workers(4)
    original_seed = config_get_set_seed(987)
    data = ds.TextFileDataset(DATA_ALL_FILE, shuffle=False)
    count = 0
    line = ["This is a text file.", "Another file.",
            "Be happy every day.", "End of file.", "Good luck to everyone."]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        strs = i["text"].item().decode("utf8")
        assert strs == line[count]
        count += 1
    assert count == 5
    # Restore configuration
    ds.config.set_num_parallel_workers(original_num_parallel_workers)
    ds.config.set_seed(original_seed)


def test_textline_dataset_shuffle_false1():
    original_num_parallel_workers = config_get_set_num_parallel_workers(1)
    original_seed = config_get_set_seed(987)
    data = ds.TextFileDataset(DATA_ALL_FILE, shuffle=False)
    count = 0
    line = ["This is a text file.", "Be happy every day.", "Good luck to everyone.",
            "Another file.", "End of file."]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        strs = i["text"].item().decode("utf8")
        assert strs == line[count]
        count += 1
    assert count == 5
    # Restore configuration
    ds.config.set_num_parallel_workers(original_num_parallel_workers)
    ds.config.set_seed(original_seed)
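
# Shuffle.FILES randomizes only the order in which files are read; rows inside
# each file keep their original order, which is why every expected list below
# preserves the per-file line order.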


def test_textline_dataset_shuffle_files4():
    original_num_parallel_workers = config_get_set_num_parallel_workers(4)
    original_seed = config_get_set_seed(135)
    data = ds.TextFileDataset(DATA_ALL_FILE, shuffle=ds.Shuffle.FILES)
    count = 0
    line = ["This is a text file.", "Another file.",
            "Be happy every day.", "End of file.", "Good luck to everyone."]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        strs = i["text"].item().decode("utf8")
        assert strs == line[count]
        count += 1
    assert count == 5
    # Restore configuration
    ds.config.set_num_parallel_workers(original_num_parallel_workers)
    ds.config.set_seed(original_seed)


def test_textline_dataset_shuffle_files1():
    original_num_parallel_workers = config_get_set_num_parallel_workers(1)
    original_seed = config_get_set_seed(135)
    data = ds.TextFileDataset(DATA_ALL_FILE, shuffle=ds.Shuffle.FILES)
    count = 0
    line = ["This is a text file.", "Be happy every day.", "Good luck to everyone.",
            "Another file.", "End of file."]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        strs = i["text"].item().decode("utf8")
        assert strs == line[count]
        count += 1
    assert count == 5
    # Restore configuration
    ds.config.set_num_parallel_workers(original_num_parallel_workers)
    ds.config.set_seed(original_seed)
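
# Shuffle.GLOBAL shuffles individual rows across all files. Fixing the seed via
# config_get_set_seed makes the shuffled order reproducible, so the tests can
# assert the exact sequence.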


def test_textline_dataset_shuffle_global4():
    original_num_parallel_workers = config_get_set_num_parallel_workers(4)
    original_seed = config_get_set_seed(246)
    data = ds.TextFileDataset(DATA_ALL_FILE, shuffle=ds.Shuffle.GLOBAL)
    count = 0
    line = ["Another file.", "Good luck to everyone.", "End of file.",
            "This is a text file.", "Be happy every day."]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        strs = i["text"].item().decode("utf8")
        assert strs == line[count]
        count += 1
    assert count == 5
    # Restore configuration
    ds.config.set_num_parallel_workers(original_num_parallel_workers)
    ds.config.set_seed(original_seed)


def test_textline_dataset_shuffle_global1():
    original_num_parallel_workers = config_get_set_num_parallel_workers(1)
    original_seed = config_get_set_seed(246)
    data = ds.TextFileDataset(DATA_ALL_FILE, shuffle=ds.Shuffle.GLOBAL)
    count = 0
    line = ["Another file.", "Good luck to everyone.", "This is a text file.",
            "End of file.", "Be happy every day."]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        strs = i["text"].item().decode("utf8")
        assert strs == line[count]
        count += 1
    assert count == 5
    # Restore configuration
    ds.config.set_num_parallel_workers(original_num_parallel_workers)
    ds.config.set_seed(original_seed)


def test_textline_dataset_num_samples():
    data = ds.TextFileDataset(DATA_FILE, num_samples=2)
    count = 0
    for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        count += 1
    assert count == 2
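
# With num_shards=2 the 5 rows are split between two shards; per the assertion
# below, shard_id=1 receives 3 of them.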


def test_textline_dataset_distribution():
    data = ds.TextFileDataset(DATA_ALL_FILE, num_shards=2, shard_id=1)
    count = 0
    for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        count += 1
    assert count == 3


def test_textline_dataset_repeat():
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.repeat(3)
    count = 0
    line = ["This is a text file.", "Be happy every day.", "Good luck to everyone.",
            "This is a text file.", "Be happy every day.", "Good luck to everyone.",
            "This is a text file.", "Be happy every day.", "Good luck to everyone."]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        strs = i["text"].item().decode("utf8")
        assert strs == line[count]
        count += 1
    assert count == 9


def test_textline_dataset_get_datasetsize():
    data = ds.TextFileDataset(DATA_FILE)
    size = data.get_dataset_size()
    assert size == 3


def test_textline_dataset_to_device():
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.to_device()
    data.send()
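
# to_device()/send() push the pipeline output to the device queue instead of
# iterating on the host, so the test above only verifies that the transfer
# launches without raising.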


def test_textline_dataset_exceptions():
    with pytest.raises(ValueError) as error_info:
        _ = ds.TextFileDataset(DATA_FILE, num_samples=-1)
    assert "num_samples exceeds the boundary" in str(error_info.value)

    with pytest.raises(ValueError) as error_info:
        _ = ds.TextFileDataset("does/not/exist/no.txt")
    assert "The following patterns did not match any files" in str(error_info.value)

    with pytest.raises(ValueError) as error_info:
        _ = ds.TextFileDataset("")
    assert "The following patterns did not match any files" in str(error_info.value)

    def exception_func(item):
        raise Exception("Error occurred!")

    with pytest.raises(RuntimeError) as error_info:
        data = ds.TextFileDataset(DATA_FILE)
        data = data.map(operations=exception_func, input_columns=["text"], num_parallel_workers=1)
        for _ in data.__iter__():
            pass
    assert "map operation: [PyFunc] failed. The corresponding data files" in str(error_info.value)
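
# Running this file directly executes every test in sequence; under pytest each
# function is collected as an individual test case.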


if __name__ == "__main__":
    test_textline_dataset_one_file()
    test_textline_dataset_all_file()
    test_textline_dataset_num_samples_none()
    test_textline_dataset_shuffle_false4()
    test_textline_dataset_shuffle_false1()
    test_textline_dataset_shuffle_files4()
    test_textline_dataset_shuffle_files1()
    test_textline_dataset_shuffle_global4()
    test_textline_dataset_shuffle_global1()
    test_textline_dataset_num_samples()
    test_textline_dataset_distribution()
    test_textline_dataset_repeat()
    test_textline_dataset_get_datasetsize()
    test_textline_dataset_to_device()
    test_textline_dataset_exceptions()