You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

test_datasets_generator.py 34 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999
  1. # Copyright 2019 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. import copy
  16. import numpy as np
  17. import pytest
  18. import mindspore.common.dtype as mstype
  19. import mindspore.dataset as ds
  20. import mindspore.dataset.engine.iterators as it
  21. from mindspore import log as logger
  22. from mindspore import Tensor
  23. # Generate 1d int numpy array from 0 - 63
  24. def generator_1d():
  25. for i in range(64):
  26. yield (np.array([i]),)
  27. class DatasetGenerator:
  28. def __init__(self):
  29. pass
  30. def __getitem__(self, item):
  31. return (np.array([item]),)
  32. def __len__(self):
  33. return 10
  34. class DatasetGeneratorLarge:
  35. def __init__(self):
  36. self.data = np.array(range(4000))
  37. def __getitem__(self, item):
  38. return (self.data + item, self.data *10)
  39. def __len__(self):
  40. return 10
  41. def test_generator_0():
  42. """
  43. Test 1D Generator
  44. """
  45. logger.info("Test 1D Generator : 0 - 63")
  46. # apply dataset operations
  47. data1 = ds.GeneratorDataset(generator_1d, ["data"])
  48. i = 0
  49. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  50. golden = np.array([i])
  51. np.testing.assert_array_equal(item["data"], golden)
  52. i = i + 1
  53. # Generate md int numpy array from [[0, 1], [2, 3]] to [[63, 64], [65, 66]]
  54. def generator_md():
  55. for i in range(64):
  56. yield (np.array([[i, i + 1], [i + 2, i + 3]]),)
  57. def test_generator_1():
  58. """
  59. Test MD Generator
  60. """
  61. logger.info("Test MD Generator : 0 - 63, with shape [2, 2]")
  62. # apply dataset operations
  63. data1 = ds.GeneratorDataset(generator_md, ["data"])
  64. i = 0
  65. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  66. golden = np.array([[i, i + 1], [i + 2, i + 3]])
  67. np.testing.assert_array_equal(item["data"], golden)
  68. i = i + 1
  69. # Generate two columns, the first column is from Generator1D, the second column is from GeneratorMD
  70. def generator_mc(maxid=64):
  71. for i in range(maxid):
  72. yield (np.array([i]), np.array([[i, i + 1], [i + 2, i + 3]]))
  73. def test_generator_2():
  74. """
  75. Test multi column generator
  76. """
  77. logger.info("Test multi column generator")
  78. # apply dataset operations
  79. data1 = ds.GeneratorDataset(generator_mc, ["col0", "col1"])
  80. i = 0
  81. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  82. golden = np.array([i])
  83. np.testing.assert_array_equal(item["col0"], golden)
  84. golden = np.array([[i, i + 1], [i + 2, i + 3]])
  85. np.testing.assert_array_equal(item["col1"], golden)
  86. i = i + 1
  87. def test_generator_3():
  88. """
  89. Test 1D Generator + repeat(4)
  90. """
  91. logger.info("Test 1D Generator : 0 - 63 + Repeat(4)")
  92. # apply dataset operations
  93. data1 = ds.GeneratorDataset(generator_1d, ["data"])
  94. data1 = data1.repeat(4)
  95. i = 0
  96. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  97. golden = np.array([i])
  98. np.testing.assert_array_equal(item["data"], golden)
  99. i = i + 1
  100. if i == 64:
  101. i = 0
  102. def test_generator_4():
  103. """
  104. Test fixed size 1D Generator + batch
  105. """
  106. logger.info("Test 1D Generator : 0 - 63 + batch(4)")
  107. # apply dataset operations
  108. data1 = ds.GeneratorDataset(generator_1d, ["data"])
  109. data1 = data1.batch(4)
  110. i = 0
  111. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  112. golden = np.array([[i], [i + 1], [i + 2], [i + 3]])
  113. np.testing.assert_array_equal(item["data"], golden)
  114. i = i + 4
  115. def generator_with_type(t):
  116. for i in range(64):
  117. yield (np.array([i], dtype=t),)
  118. def type_tester(t):
  119. logger.info("Test with Type {}".format(t.__name__))
  120. # apply dataset operations
  121. data1 = ds.GeneratorDataset((lambda: generator_with_type(t)), ["data"])
  122. data1 = data1.batch(4)
  123. i = 0
  124. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  125. golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
  126. np.testing.assert_array_equal(item["data"], golden)
  127. i = i + 4
  128. def test_generator_5():
  129. """
  130. Test 1D Generator on different data type
  131. """
  132. logger.info("Test 1D Generator on all data types")
  133. types = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32, np.float64]
  134. for t in types:
  135. type_tester(t)
  136. def type_tester_with_type_check(t, c):
  137. logger.info("Test with Type {}".format(t.__name__))
  138. # apply dataset operations
  139. data1 = ds.GeneratorDataset((lambda: generator_with_type(t)), ["data"], column_types=[c])
  140. data1 = data1.batch(4)
  141. i = 0
  142. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  143. golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
  144. np.testing.assert_array_equal(item["data"], golden)
  145. i = i + 4
  146. def test_generator_6():
  147. """
  148. Test 1D Generator on different data type with type check
  149. """
  150. logger.info("Test 1D Generator on all data types with type check")
  151. np_types = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32,
  152. np.float64]
  153. de_types = [mstype.int8, mstype.int16, mstype.int32, mstype.int64, mstype.uint8, mstype.uint16, mstype.uint32,
  154. mstype.uint64, mstype.float32, mstype.float64]
  155. for i, _ in enumerate(np_types):
  156. type_tester_with_type_check(np_types[i], de_types[i])
  157. def generator_with_type_2c(t):
  158. for i in range(64):
  159. yield (np.array([i], dtype=t), np.array([i], dtype=t))
  160. def type_tester_with_type_check_2c(t, c):
  161. logger.info("Test with Type {}".format(t.__name__))
  162. # apply dataset operations
  163. data1 = ds.GeneratorDataset((lambda: generator_with_type_2c(t)), ["data0", "data1"], column_types=c)
  164. data1 = data1.batch(4)
  165. i = 0
  166. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  167. golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
  168. np.testing.assert_array_equal(item["data0"], golden)
  169. i = i + 4
  170. def test_generator_7():
  171. """
  172. Test 2 column Generator on different data type with type check
  173. """
  174. logger.info("Test 2 column Generator on all data types with type check")
  175. np_types = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32,
  176. np.float64]
  177. de_types = [mstype.int8, mstype.int16, mstype.int32, mstype.int64, mstype.uint8, mstype.uint16, mstype.uint32,
  178. mstype.uint64, mstype.float32, mstype.float64]
  179. for i, _ in enumerate(np_types):
  180. type_tester_with_type_check_2c(np_types[i], [None, de_types[i]])
  181. def test_generator_8():
  182. """
  183. Test multi column generator with few mapops
  184. """
  185. logger.info("Test multi column generator with mapops to check the order too")
  186. # apply dataset operations
  187. data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
  188. data1 = data1.map(operations=(lambda x: x * 3), input_columns="col0", output_columns="out0",
  189. num_parallel_workers=2)
  190. data1 = data1.map(operations=(lambda x: (x * 7, x)), input_columns="col1", output_columns=["out1", "out2"],
  191. num_parallel_workers=2, column_order=["out0", "out1", "out2"])
  192. data1 = data1.map(operations=(lambda x: x + 1), input_columns="out2", output_columns="out2",
  193. num_parallel_workers=2)
  194. i = 0
  195. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  196. golden = np.array([i * 3])
  197. np.testing.assert_array_equal(item["out0"], golden)
  198. golden = np.array([[i * 7, (i + 1) * 7], [(i + 2) * 7, (i + 3) * 7]])
  199. np.testing.assert_array_equal(item["out1"], golden)
  200. golden = np.array([[i + 1, i + 2], [i + 3, i + 4]])
  201. np.testing.assert_array_equal(item["out2"], golden)
  202. i = i + 1
  203. def test_generator_9():
  204. """
  205. Test map column order when len(input_columns) == len(output_columns).
  206. """
  207. logger.info("Test map column order when len(input_columns) == len(output_columns).")
  208. # apply dataset operations
  209. data1 = ds.GeneratorDataset(generator_mc(2048), ["image", "label"])
  210. data2 = ds.GeneratorDataset(generator_mc(2048), ["label", "image"])
  211. data1 = data1.map(operations=(lambda x: x * 3), input_columns="label",
  212. num_parallel_workers=4)
  213. data2 = data2.map(operations=(lambda x: x * 3), input_columns="label",
  214. num_parallel_workers=4)
  215. # Expected column order is not changed.
  216. # data1 = data[0] is "image" and data[1] is "label"
  217. # data2 = data[0] is "label" and data[1] is "image"
  218. i = 0
  219. for data1, data2 in zip(data1, data2): # each data is a dictionary
  220. golden = np.array([i])
  221. np.testing.assert_array_equal(data1[0].asnumpy(), golden)
  222. golden = np.array([[i * 3, (i + 1) * 3], [(i + 2) * 3, (i + 3) * 3]])
  223. np.testing.assert_array_equal(data1[1].asnumpy(), golden)
  224. golden = np.array([i * 3])
  225. np.testing.assert_array_equal(data2[0].asnumpy(), golden)
  226. golden = np.array([[i, i + 1], [i + 2, i + 3]])
  227. np.testing.assert_array_equal(data2[1].asnumpy(), golden)
  228. i = i + 1
  229. def test_generator_10():
  230. """
  231. Test map column order when len(input_columns) != len(output_columns).
  232. """
  233. logger.info("Test map column order when len(input_columns) != len(output_columns).")
  234. # apply dataset operations
  235. data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
  236. data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns="col1", output_columns=["out1", "out2"],
  237. column_order=['col0', 'out1', 'out2'], num_parallel_workers=2)
  238. # Expected column order is |col0|out1|out2|
  239. i = 0
  240. for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
  241. golden = np.array([i])
  242. np.testing.assert_array_equal(item[0], golden)
  243. golden = np.array([[i, i + 1], [i + 2, i + 3]])
  244. np.testing.assert_array_equal(item[1], golden)
  245. golden = np.array([[i * 5, (i + 1) * 5], [(i + 2) * 5, (i + 3) * 5]])
  246. np.testing.assert_array_equal(item[2], golden)
  247. i = i + 1
  248. def test_generator_11():
  249. """
  250. Test map column order when len(input_columns) != len(output_columns).
  251. """
  252. logger.info("Test map column order when len(input_columns) != len(output_columns), "
  253. "and column_order drops some columns.")
  254. # apply dataset operations
  255. data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
  256. data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns="col1", output_columns=["out1", "out2"],
  257. column_order=['out1', 'out2'], num_parallel_workers=2)
  258. # Expected column order is |out1|out2|
  259. i = 0
  260. for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
  261. # len should be 2 because col0 is dropped (not included in column_order)
  262. assert len(item) == 2
  263. golden = np.array([[i, i + 1], [i + 2, i + 3]])
  264. np.testing.assert_array_equal(item[0], golden)
  265. golden = np.array([[i * 5, (i + 1) * 5], [(i + 2) * 5, (i + 3) * 5]])
  266. np.testing.assert_array_equal(item[1], golden)
  267. i = i + 1
  268. def test_generator_12():
  269. """
  270. Test map column order when input_columns and output_columns are None.
  271. """
  272. logger.info("Test map column order when input_columns and output_columns are None.")
  273. # apply dataset operations
  274. data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
  275. data1 = data1.map(operations=(lambda x: (x * 5)), num_parallel_workers=2)
  276. # Expected column order is |col0|col1|
  277. i = 0
  278. for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
  279. assert len(item) == 2
  280. golden = np.array([i * 5])
  281. np.testing.assert_array_equal(item[0], golden)
  282. golden = np.array([[i, i + 1], [i + 2, i + 3]])
  283. np.testing.assert_array_equal(item[1], golden)
  284. i = i + 1
  285. data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
  286. data1 = data1.map(operations=(lambda x: (x * 5)), column_order=["col1", "col0"], num_parallel_workers=2)
  287. # Expected column order is |col0|col1|
  288. i = 0
  289. for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
  290. assert len(item) == 2
  291. golden = np.array([i * 5])
  292. np.testing.assert_array_equal(item[1], golden)
  293. golden = np.array([[i, i + 1], [i + 2, i + 3]])
  294. np.testing.assert_array_equal(item[0], golden)
  295. i = i + 1
  296. def test_generator_13():
  297. """
  298. Test map column order when input_columns is None.
  299. """
  300. logger.info("Test map column order when input_columns is None.")
  301. # apply dataset operations
  302. data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
  303. data1 = data1.map(operations=(lambda x: (x * 5)), output_columns=["out0"], num_parallel_workers=2)
  304. # Expected column order is |out0|col1|
  305. i = 0
  306. for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
  307. assert len(item) == 2
  308. golden = np.array([i * 5])
  309. np.testing.assert_array_equal(item[0], golden)
  310. golden = np.array([[i, i + 1], [i + 2, i + 3]])
  311. np.testing.assert_array_equal(item[1], golden)
  312. i = i + 1
  313. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  314. # len should be 2 because col0 is dropped (not included in column_order)
  315. assert len(item) == 2
  316. golden = np.array([i * 5])
  317. np.testing.assert_array_equal(item["out0"], golden)
  318. golden = np.array([[i, i + 1], [i + 2, i + 3]])
  319. np.testing.assert_array_equal(item["col1"], golden)
  320. i = i + 1
  321. def test_generator_14():
  322. """
  323. Test 1D Generator MP + CPP sampler
  324. """
  325. logger.info("Test 1D Generator MP : 0 - 63")
  326. # Sometimes there are some ITERATORS left in ITERATORS_LIST when run all UTs together,
  327. # and cause core dump and blocking in this UT. Add cleanup() here to fix it.
  328. it._cleanup() # pylint: disable=W0212
  329. # Reduce memory needed by reducing queue size
  330. prefetch_original = ds.config.get_prefetch_size()
  331. ds.config.set_prefetch_size(1)
  332. source = [(np.array([x]),) for x in range(256)]
  333. ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler(),
  334. num_parallel_workers=4, max_rowsize=1).repeat(2)
  335. i = 0
  336. for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  337. golden = np.array([i])
  338. np.testing.assert_array_equal(data["data"], golden)
  339. i = i + 1
  340. if i == 256:
  341. i = 0
  342. ds.config.set_prefetch_size(prefetch_original)
  343. def test_generator_15():
  344. """
  345. Test 1D Generator MP + Python sampler
  346. """
  347. logger.info("Test 1D Generator MP : 0 - 63")
  348. ## Reduce memory needed by reducing queue size
  349. prefetch_original = ds.config.get_prefetch_size()
  350. ds.config.set_prefetch_size(1)
  351. sampler = [x for x in range(256)]
  352. source = [(np.array([x]),) for x in range(256)]
  353. ds1 = ds.GeneratorDataset(source, ["data"], sampler=sampler,
  354. num_parallel_workers=4, max_rowsize=1).repeat(1)
  355. i = 0
  356. for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  357. golden = np.array([i])
  358. np.testing.assert_array_equal(data["data"], golden)
  359. i = i + 1
  360. if i == 256:
  361. i = 0
  362. ds.config.set_prefetch_size(prefetch_original)
  363. def test_generator_16():
  364. """
  365. Test multi column generator Mp + CPP sampler
  366. """
  367. logger.info("Test multi column generator")
  368. source = [(np.array([x]), np.array([x + 1])) for x in range(256)]
  369. # apply dataset operations
  370. data1 = ds.GeneratorDataset(source, ["col0", "col1"], sampler=ds.SequentialSampler())
  371. i = 0
  372. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  373. golden = np.array([i])
  374. np.testing.assert_array_equal(item["col0"], golden)
  375. golden = np.array([i + 1])
  376. np.testing.assert_array_equal(item["col1"], golden)
  377. i = i + 1
  378. def test_generator_17():
  379. """
  380. Test multi column generator Mp + Python sampler
  381. """
  382. logger.info("Test multi column generator")
  383. sampler = [x for x in range(256)]
  384. source = [(np.array([x]), np.array([x + 1])) for x in range(256)]
  385. # apply dataset operations
  386. data1 = ds.GeneratorDataset(source, ["col0", "col1"], sampler=sampler)
  387. i = 0
  388. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  389. golden = np.array([i])
  390. np.testing.assert_array_equal(item["col0"], golden)
  391. golden = np.array([i + 1])
  392. np.testing.assert_array_equal(item["col1"], golden)
  393. i = i + 1
  394. def test_generator_18():
  395. """
  396. Test multiprocessing flag (same as test 13 with python_multiprocessing=True flag)
  397. """
  398. logger.info("Test map column order when input_columns is None.")
  399. # Reduce shm usage by disabling this optimization
  400. mem_original = ds.config.get_enable_shared_mem()
  401. ds.config.set_enable_shared_mem(False)
  402. # apply dataset operations
  403. data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"], python_multiprocessing=True)
  404. data1 = data1.map(operations=(lambda x: (x * 5)), output_columns=["out0"], num_parallel_workers=2,
  405. python_multiprocessing=True)
  406. # Expected column order is |out0|col1|
  407. i = 0
  408. for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
  409. assert len(item) == 2
  410. golden = np.array([i * 5])
  411. np.testing.assert_array_equal(item[0], golden)
  412. golden = np.array([[i, i + 1], [i + 2, i + 3]])
  413. np.testing.assert_array_equal(item[1], golden)
  414. i = i + 1
  415. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  416. # len should be 2 because col0 is dropped (not included in column_order)
  417. assert len(item) == 2
  418. golden = np.array([i * 5])
  419. np.testing.assert_array_equal(item["out0"], golden)
  420. ds.config.set_enable_shared_mem(mem_original)
  421. def test_generator_19():
  422. """
  423. Test multiprocessing flag with 2 different large columns
  424. """
  425. logger.info("Test map column order when input_columns is None.")
  426. # apply dataset operations
  427. data1 = ds.GeneratorDataset(DatasetGeneratorLarge(), ["col0", "col1"], python_multiprocessing=True, shuffle=False)
  428. # Expected column order is |out0|col1|
  429. i = 0
  430. for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
  431. assert len(item) == 2
  432. golden = np.array(range(4000)) + i
  433. np.testing.assert_array_equal(item[0], golden)
  434. golden = np.array(range(4000)) * 10
  435. np.testing.assert_array_equal(item[1], golden)
  436. i = i + 1
  437. class RandomAccessDataset:
  438. def __init__(self):
  439. self.__data = np.random.sample((5, 1))
  440. def __getitem__(self, item):
  441. return self.__data[item]
  442. def __len__(self):
  443. return 5
  444. class RandomAccessDatasetWithoutLen:
  445. def __init__(self):
  446. self.__data = np.random.sample((5, 1))
  447. def __getitem__(self, item):
  448. return self.__data[item]
  449. class IterableDataset:
  450. def __init__(self):
  451. self.count = 0
  452. self.max = 10
  453. def __iter__(self):
  454. return self
  455. def __next__(self):
  456. if self.count >= self.max:
  457. raise StopIteration
  458. self.count += 1
  459. return (np.array(self.count),)
  460. def test_generator_20():
  461. """
  462. Test mappable and unmappable dataset as source for GeneratorDataset.
  463. """
  464. logger.info("Test mappable and unmappable dataset as source for GeneratorDataset.")
  465. # Mappable dataset
  466. data1 = ds.GeneratorDataset(RandomAccessDataset(), ["col0"])
  467. dataset_size1 = data1.get_dataset_size()
  468. assert dataset_size1 == 5
  469. # Mappable dataset without __len__
  470. data2 = ds.GeneratorDataset(RandomAccessDatasetWithoutLen(), ["col0"])
  471. try:
  472. data2.get_dataset_size()
  473. except RuntimeError as e:
  474. assert "'__len__' method is required" in str(e)
  475. # Unmappable dataset
  476. data3 = ds.GeneratorDataset(IterableDataset(), ["col0"])
  477. dataset_size3 = data3.get_dataset_size()
  478. assert dataset_size3 == 10
  479. def test_generator_error_1():
  480. def generator_np():
  481. for i in range(64):
  482. yield (np.array([{i}]),)
  483. with pytest.raises(RuntimeError) as info:
  484. data1 = ds.GeneratorDataset(generator_np, ["data"])
  485. for _ in data1:
  486. pass
  487. assert "Invalid data type" in str(info.value)
  488. def test_generator_error_2():
  489. def generator_np():
  490. for i in range(64):
  491. yield ({i},)
  492. with pytest.raises(RuntimeError) as info:
  493. data1 = ds.GeneratorDataset(generator_np, ["data"])
  494. for _ in data1:
  495. pass
  496. print("========", str(info.value))
  497. assert "'GeneratorDataset' should return a tuple of NumPy arrays" in str(info.value)
  498. def test_generator_error_3():
  499. with pytest.raises(ValueError) as info:
  500. # apply dataset operations
  501. data1 = ds.GeneratorDataset(generator_mc(2048), ["label", "image"])
  502. data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns=["label"], output_columns=["out1", "out2"],
  503. num_parallel_workers=2)
  504. for _ in data1:
  505. pass
  506. assert "When length of input_columns and output_columns are not equal, column_order must be specified." in \
  507. str(info.value)
  508. def test_generator_error_4():
  509. with pytest.raises(RuntimeError) as info:
  510. # apply dataset operations
  511. data1 = ds.GeneratorDataset(generator_mc(2048), ["label", "image"])
  512. data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns=["label"],
  513. num_parallel_workers=2)
  514. for _ in data1:
  515. pass
  516. assert "the number of columns returned in 'map' operations should match the number of 'output_columns'"\
  517. in str(info.value)
  518. def test_generator_sequential_sampler():
  519. source = [(np.array([x]),) for x in range(64)]
  520. ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler())
  521. i = 0
  522. for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  523. golden = np.array([i])
  524. np.testing.assert_array_equal(data["data"], golden)
  525. i = i + 1
  526. def test_generator_random_sampler():
  527. source = [(np.array([x]),) for x in range(64)]
  528. ds1 = ds.GeneratorDataset(source, ["data"], shuffle=True)
  529. for _ in ds1.create_dict_iterator(num_epochs=1): # each data is a dictionary
  530. pass
  531. def test_generator_distributed_sampler():
  532. source = [(np.array([x]),) for x in range(64)]
  533. for sid in range(8):
  534. ds1 = ds.GeneratorDataset(source, ["data"], shuffle=False, num_shards=8, shard_id=sid)
  535. i = sid
  536. for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  537. golden = np.array([i])
  538. np.testing.assert_array_equal(data["data"], golden)
  539. i = i + 8
  540. def test_generator_num_samples():
  541. source = [(np.array([x]),) for x in range(64)]
  542. num_samples = 32
  543. ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler(num_samples=num_samples))
  544. ds2 = ds.GeneratorDataset(source, ["data"], sampler=[i for i in range(32)], num_samples=num_samples)
  545. ds3 = ds.GeneratorDataset(generator_1d, ["data"], num_samples=num_samples)
  546. count = 0
  547. for _ in ds1.create_dict_iterator(num_epochs=1):
  548. count = count + 1
  549. assert count == num_samples
  550. count = 0
  551. for _ in ds2.create_dict_iterator(num_epochs=1):
  552. count = count + 1
  553. assert count == num_samples
  554. count = 0
  555. for _ in ds3.create_dict_iterator(num_epochs=1):
  556. count = count + 1
  557. assert count == num_samples
  558. def test_generator_num_samples_underflow():
  559. source = [(np.array([x]),) for x in range(64)]
  560. num_samples = 256
  561. ds2 = ds.GeneratorDataset(source, ["data"], sampler=[i for i in range(64)], num_samples=num_samples)
  562. ds3 = ds.GeneratorDataset(generator_1d, ["data"], num_samples=num_samples)
  563. count = 0
  564. for _ in ds2.create_dict_iterator(num_epochs=1):
  565. count = count + 1
  566. assert count == 64
  567. count = 0
  568. for _ in ds3.create_dict_iterator(num_epochs=1):
  569. count = count + 1
  570. assert count == 64
  571. def type_tester_with_type_check_2c_schema(t, c):
  572. logger.info("Test with Type {}".format(t.__name__))
  573. schema = ds.Schema()
  574. schema.add_column("data0", c[0])
  575. schema.add_column("data1", c[1])
  576. # apply dataset operations
  577. data1 = ds.GeneratorDataset((lambda: generator_with_type_2c(t)), schema=schema)
  578. data1 = data1.batch(4)
  579. i = 0
  580. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  581. golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
  582. np.testing.assert_array_equal(item["data0"], golden)
  583. i = i + 4
  584. def test_generator_schema():
  585. """
  586. Test 2 column Generator on different data type with type check with schema input
  587. """
  588. logger.info("Test 2 column Generator on all data types with type check")
  589. np_types = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32,
  590. np.float64]
  591. de_types = [mstype.int8, mstype.int16, mstype.int32, mstype.int64, mstype.uint8, mstype.uint16, mstype.uint32,
  592. mstype.uint64, mstype.float32, mstype.float64]
  593. for i, _ in enumerate(np_types):
  594. type_tester_with_type_check_2c_schema(np_types[i], [de_types[i], de_types[i]])
  595. def test_generator_dataset_size_0():
  596. """
  597. Test GeneratorDataset get_dataset_size by iterator method.
  598. """
  599. logger.info("Test 1D Generator : 0 - 63 get_dataset_size")
  600. data1 = ds.GeneratorDataset(generator_1d, ["data"])
  601. data_size = data1.get_dataset_size()
  602. num_rows = 0
  603. for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  604. num_rows = num_rows + 1
  605. assert data_size == num_rows
  606. def test_generator_dataset_size_1():
  607. """
  608. Test GeneratorDataset get_dataset_size by __len__ method.
  609. """
  610. logger.info("Test DatasetGenerator get_dataset_size")
  611. dataset_generator = DatasetGenerator()
  612. data1 = ds.GeneratorDataset(dataset_generator, ["data"])
  613. data_size = data1.get_dataset_size()
  614. num_rows = 0
  615. for _ in data1.create_dict_iterator(num_epochs=1):
  616. num_rows = num_rows + 1
  617. assert data_size == num_rows
  618. def test_generator_dataset_size_2():
  619. """
  620. Test GeneratorDataset + repeat get_dataset_size
  621. """
  622. logger.info("Test 1D Generator + repeat get_dataset_size")
  623. data1 = ds.GeneratorDataset(generator_1d, ["data"])
  624. data1 = data1.repeat(2)
  625. data_size = data1.get_dataset_size()
  626. num_rows = 0
  627. for _ in data1.create_dict_iterator(num_epochs=1):
  628. num_rows = num_rows + 1
  629. assert data_size == num_rows
  630. def test_generator_dataset_size_3():
  631. """
  632. Test GeneratorDataset + batch get_dataset_size
  633. """
  634. logger.info("Test 1D Generator + batch get_dataset_size")
  635. data1 = ds.GeneratorDataset(generator_1d, ["data"])
  636. data1 = data1.batch(4)
  637. data_size = data1.get_dataset_size()
  638. num_rows = 0
  639. for _ in data1.create_dict_iterator(num_epochs=1):
  640. num_rows += 1
  641. assert data_size == num_rows
  642. def test_generator_dataset_size_4():
  643. """
  644. Test GeneratorDataset + num_shards
  645. """
  646. logger.info("Test 1D Generator : 0 - 63 + num_shards get_dataset_size")
  647. dataset_generator = DatasetGenerator()
  648. data1 = ds.GeneratorDataset(dataset_generator, ["data"], num_shards=3, shard_id=0)
  649. data_size = data1.get_dataset_size()
  650. num_rows = 0
  651. for _ in data1.create_dict_iterator(num_epochs=1): # each data is a dictionary
  652. num_rows = num_rows + 1
  653. assert data_size == num_rows
  654. def test_generator_dataset_size_5():
  655. """
  656. Test get_dataset_size after create_dict_iterator
  657. """
  658. logger.info("Test get_dataset_size after create_dict_iterator")
  659. dataset_generator = DatasetGenerator()
  660. data1 = ds.GeneratorDataset(dataset_generator, ["data"], num_shards=3, shard_id=0)
  661. num_rows = 0
  662. for _ in data1.create_dict_iterator(num_epochs=1): # each data is a dictionary
  663. num_rows = num_rows + 1
  664. data_size = data1.get_dataset_size()
  665. assert data_size == num_rows
  666. def manual_test_generator_keyboard_interrupt():
  667. """
  668. Test keyboard_interrupt
  669. """
  670. logger.info("Test 1D Generator MP : 0 - 63")
  671. class MyDS():
  672. def __getitem__(self, item):
  673. while True:
  674. pass
  675. def __len__(self):
  676. return 1024
  677. ds1 = ds.GeneratorDataset(MyDS(), ["data"], num_parallel_workers=4).repeat(2)
  678. for _ in ds1.create_dict_iterator(num_epochs=1): # each data is a dictionary
  679. pass
  680. def test_explicit_deepcopy():
  681. """
  682. Test explicit_deepcopy
  683. """
  684. logger.info("Test explicit_deepcopy")
  685. ds1 = ds.NumpySlicesDataset([1, 2], shuffle=False)
  686. ds2 = copy.deepcopy(ds1)
  687. for d1, d2 in zip(ds1, ds2):
  688. assert d1 == d2
  689. def test_func_generator_dataset_005():
  690. """
  691. generator: class __getitem__
  692. """
  693. result = [np.random.randn(242, 242, 242), np.random.randn(42, 24, 442)]
  694. class MyData():
  695. def __init__(self, input_para):
  696. self.data = input_para
  697. def __getitem__(self, item):
  698. return (Tensor(self.data[0]), Tensor(self.data[1]))
  699. def __len__(self):
  700. return 2
  701. column_names = ["col1", "col2"]
  702. dataset = ds.GeneratorDataset(MyData(result), column_names)
  703. i = 0
  704. for data in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
  705. assert "col1" in str(data.keys())
  706. assert (data["col1"] == result[0]).all()
  707. assert (data["col2"] == result[1]).all()
  708. i += 1
  709. assert i == 2
  710. def test_func_generator_dataset_with_zip_source():
  711. """
  712. Feature: verify the source is zip
  713. Description: the source input is zip
  714. Expectation: success
  715. """
  716. def synthetic_data(w, b, num_examples):
  717. """生成 y = Xw + b + 噪声。"""
  718. X = np.random.normal(0, 1, (num_examples, len(w)))
  719. y = np.matmul(X, w) + b
  720. y += np.random.normal(0, 0.01, y.shape)
  721. return X.astype(np.float32), y.reshape((-1, 1)).astype(np.float32)
  722. true_w = np.array([2, -3.4])
  723. true_b = 4.2
  724. features, labels = synthetic_data(true_w, true_b, 10)
  725. def load_array(data_arrays, column_names, batch_size, is_train=True):
  726. """构造一个MindSpore数据迭代器。"""
  727. dataset = ds.GeneratorDataset(data_arrays, column_names, shuffle=is_train)
  728. dataset = dataset.batch(batch_size)
  729. return dataset
  730. batch_size = 2
  731. dataset = load_array(zip(features, labels), ['features', 'labels'], batch_size)
  732. count = 0
  733. epochs = 10
  734. dataset_iter = dataset.create_dict_iterator(num_epochs=epochs, output_numpy=True)
  735. for _ in range(epochs):
  736. for _ in dataset_iter:
  737. count += 1
  738. assert count == 50
  739. if __name__ == "__main__":
  740. test_generator_0()
  741. test_generator_1()
  742. test_generator_2()
  743. test_generator_3()
  744. test_generator_4()
  745. test_generator_5()
  746. test_generator_6()
  747. test_generator_7()
  748. test_generator_8()
  749. test_generator_9()
  750. test_generator_10()
  751. test_generator_11()
  752. test_generator_12()
  753. test_generator_13()
  754. test_generator_14()
  755. test_generator_15()
  756. test_generator_16()
  757. test_generator_17()
  758. test_generator_18()
  759. test_generator_19()
  760. test_generator_error_1()
  761. test_generator_error_2()
  762. test_generator_error_3()
  763. test_generator_error_4()
  764. test_generator_sequential_sampler()
  765. test_generator_distributed_sampler()
  766. test_generator_random_sampler()
  767. test_generator_num_samples()
  768. test_generator_num_samples_underflow()
  769. test_generator_schema()
  770. test_generator_dataset_size_0()
  771. test_generator_dataset_size_1()
  772. test_generator_dataset_size_2()
  773. test_generator_dataset_size_3()
  774. test_generator_dataset_size_4()
  775. test_generator_dataset_size_5()
  776. test_explicit_deepcopy()
  777. test_func_generator_dataset_005()
  778. test_func_generator_dataset_with_zip_source()