You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

test_datasets_generator.py 29 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853
  1. # Copyright 2019 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. import copy
  16. import numpy as np
  17. import pytest
  18. import mindspore.common.dtype as mstype
  19. import mindspore.dataset as ds
  20. from mindspore import log as logger
  21. # Generate 1d int numpy array from 0 - 63
  22. def generator_1d():
  23. for i in range(64):
  24. yield (np.array([i]),)
  25. class DatasetGenerator:
  26. def __init__(self):
  27. pass
  28. def __getitem__(self, item):
  29. return (np.array([item]),)
  30. def __len__(self):
  31. return 10
  32. class DatasetGeneratorLarge:
  33. def __init__(self):
  34. self.data = np.array(range(4000))
  35. def __getitem__(self, item):
  36. return (self.data + item, self.data *10)
  37. def __len__(self):
  38. return 10
def test_generator_0():
    """
    Test 1D Generator

    Builds a GeneratorDataset from generator_1d and verifies each row
    equals np.array([i]) for i = 0..63, in order.
    """
    logger.info("Test 1D Generator : 0 - 63")
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_1d, ["data"])
    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([i])
        np.testing.assert_array_equal(item["data"], golden)
        i = i + 1
  51. # Generate md int numpy array from [[0, 1], [2, 3]] to [[63, 64], [65, 66]]
  52. def generator_md():
  53. for i in range(64):
  54. yield (np.array([[i, i + 1], [i + 2, i + 3]]),)
def test_generator_1():
    """
    Test MD Generator

    Verifies a GeneratorDataset over generator_md yields the expected
    2x2 array for each index i = 0..63.
    """
    logger.info("Test MD Generator : 0 - 63, with shape [2, 2]")
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_md, ["data"])
    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item["data"], golden)
        i = i + 1
  67. # Generate two columns, the first column is from Generator1D, the second column is from GeneratorMD
  68. def generator_mc(maxid=64):
  69. for i in range(maxid):
  70. yield (np.array([i]), np.array([[i, i + 1], [i + 2, i + 3]]))
def test_generator_2():
    """
    Test multi column generator

    Checks both columns produced by generator_mc: col0 is [i] and
    col1 is the matching 2x2 array, for i = 0..63.
    """
    logger.info("Test multi column generator")
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc, ["col0", "col1"])
    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([i])
        np.testing.assert_array_equal(item["col0"], golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item["col1"], golden)
        i = i + 1
def test_generator_3():
    """
    Test 1D Generator + repeat(4)

    The 64-row source is repeated 4 times; the counter wraps back to 0
    at each epoch boundary so every repetition is verified.
    """
    logger.info("Test 1D Generator : 0 - 63 + Repeat(4)")
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_1d, ["data"])
    data1 = data1.repeat(4)
    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([i])
        np.testing.assert_array_equal(item["data"], golden)
        i = i + 1
        # wrap at the end of each repeated pass over the 64-row source
        if i == 64:
            i = 0
def test_generator_4():
    """
    Test fixed size 1D Generator + batch

    batch(4) groups consecutive rows, so each item holds four
    consecutive values stacked along a new leading axis.
    """
    logger.info("Test 1D Generator : 0 - 63 + batch(4)")
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_1d, ["data"])
    data1 = data1.batch(4)
    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([[i], [i + 1], [i + 2], [i + 3]])
        np.testing.assert_array_equal(item["data"], golden)
        i = i + 4
  113. def generator_with_type(t):
  114. for i in range(64):
  115. yield (np.array([i], dtype=t),)
def type_tester(t):
    """Run the batched 1-D generator pipeline for one numpy dtype *t*
    and verify values and dtype survive the round trip."""
    logger.info("Test with Type {}".format(t.__name__))
    # apply dataset operations
    # lambda wrapper makes the source re-invocable (a fresh generator per epoch)
    data1 = ds.GeneratorDataset((lambda: generator_with_type(t)), ["data"])
    data1 = data1.batch(4)
    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
        np.testing.assert_array_equal(item["data"], golden)
        i = i + 4
def test_generator_5():
    """
    Test 1D Generator on different data type

    Runs type_tester over every supported integer and float numpy dtype.
    """
    logger.info("Test 1D Generator on all data types")
    types = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32, np.float64]
    for t in types:
        type_tester(t)
def type_tester_with_type_check(t, c):
    """Like type_tester, but also declares the expected mindspore column
    type *c* so the dataset performs type checking on column "data"."""
    logger.info("Test with Type {}".format(t.__name__))
    # apply dataset operations
    data1 = ds.GeneratorDataset((lambda: generator_with_type(t)), ["data"], column_types=[c])
    data1 = data1.batch(4)
    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
        np.testing.assert_array_equal(item["data"], golden)
        i = i + 4
  144. def test_generator_6():
  145. """
  146. Test 1D Generator on different data type with type check
  147. """
  148. logger.info("Test 1D Generator on all data types with type check")
  149. np_types = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32,
  150. np.float64]
  151. de_types = [mstype.int8, mstype.int16, mstype.int32, mstype.int64, mstype.uint8, mstype.uint16, mstype.uint32,
  152. mstype.uint64, mstype.float32, mstype.float64]
  153. for i, _ in enumerate(np_types):
  154. type_tester_with_type_check(np_types[i], de_types[i])
  155. def generator_with_type_2c(t):
  156. for i in range(64):
  157. yield (np.array([i], dtype=t), np.array([i], dtype=t))
def type_tester_with_type_check_2c(t, c):
    """Two-column variant of type_tester_with_type_check; *c* is the list of
    per-column mindspore types (entries may be None to skip checking).
    Only column "data0" values are asserted."""
    logger.info("Test with Type {}".format(t.__name__))
    # apply dataset operations
    data1 = ds.GeneratorDataset((lambda: generator_with_type_2c(t)), ["data0", "data1"], column_types=c)
    data1 = data1.batch(4)
    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
        np.testing.assert_array_equal(item["data0"], golden)
        i = i + 4
  168. def test_generator_7():
  169. """
  170. Test 2 column Generator on different data type with type check
  171. """
  172. logger.info("Test 2 column Generator on all data types with type check")
  173. np_types = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32,
  174. np.float64]
  175. de_types = [mstype.int8, mstype.int16, mstype.int32, mstype.int64, mstype.uint8, mstype.uint16, mstype.uint32,
  176. mstype.uint64, mstype.float32, mstype.float64]
  177. for i, _ in enumerate(np_types):
  178. type_tester_with_type_check_2c(np_types[i], [None, de_types[i]])
def test_generator_8():
    """
    Test multi column generator with few mapops

    Chains three map operations (1->1, 1->2, 1->1 columns) and verifies
    both values and the resulting column order.
    """
    logger.info("Test multi column generator with mapops to check the order too")
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
    data1 = data1.map(operations=(lambda x: x * 3), input_columns="col0", output_columns="out0",
                      num_parallel_workers=2)
    # splits col1 into two output columns; column_order fixes the final layout
    data1 = data1.map(operations=(lambda x: (x * 7, x)), input_columns="col1", output_columns=["out1", "out2"],
                      num_parallel_workers=2, column_order=["out0", "out1", "out2"])
    data1 = data1.map(operations=(lambda x: x + 1), input_columns="out2", output_columns="out2",
                      num_parallel_workers=2)
    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([i * 3])
        np.testing.assert_array_equal(item["out0"], golden)
        golden = np.array([[i * 7, (i + 1) * 7], [(i + 2) * 7, (i + 3) * 7]])
        np.testing.assert_array_equal(item["out1"], golden)
        golden = np.array([[i + 1, i + 2], [i + 3, i + 4]])
        np.testing.assert_array_equal(item["out2"], golden)
        i = i + 1
  201. def test_generator_9():
  202. """
  203. Test map column order when len(input_columns) == len(output_columns).
  204. """
  205. logger.info("Test map column order when len(input_columns) == len(output_columns).")
  206. # apply dataset operations
  207. data1 = ds.GeneratorDataset(generator_mc(2048), ["image", "label"])
  208. data2 = ds.GeneratorDataset(generator_mc(2048), ["label", "image"])
  209. data1 = data1.map(operations=(lambda x: x * 3), input_columns="label",
  210. num_parallel_workers=4)
  211. data2 = data2.map(operations=(lambda x: x * 3), input_columns="label",
  212. num_parallel_workers=4)
  213. # Expected column order is not changed.
  214. # data1 = data[0] is "image" and data[1] is "label"
  215. # data2 = data[0] is "label" and data[1] is "image"
  216. i = 0
  217. for data1, data2 in zip(data1, data2): # each data is a dictionary
  218. golden = np.array([i])
  219. np.testing.assert_array_equal(data1[0].asnumpy(), golden)
  220. golden = np.array([[i * 3, (i + 1) * 3], [(i + 2) * 3, (i + 3) * 3]])
  221. np.testing.assert_array_equal(data1[1].asnumpy(), golden)
  222. golden = np.array([i * 3])
  223. np.testing.assert_array_equal(data2[0].asnumpy(), golden)
  224. golden = np.array([[i, i + 1], [i + 2, i + 3]])
  225. np.testing.assert_array_equal(data2[1].asnumpy(), golden)
  226. i = i + 1
def test_generator_10():
    """
    Test map column order when len(input_columns) != len(output_columns).

    col1 is split into out1/out2; column_order keeps col0 first, so the
    tuple iterator yields |col0|out1|out2|.
    """
    logger.info("Test map column order when len(input_columns) != len(output_columns).")
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
    data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns="col1", output_columns=["out1", "out2"],
                      column_order=['col0', 'out1', 'out2'], num_parallel_workers=2)
    # Expected column order is |col0|out1|out2|
    i = 0
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        golden = np.array([i])
        np.testing.assert_array_equal(item[0], golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item[1], golden)
        golden = np.array([[i * 5, (i + 1) * 5], [(i + 2) * 5, (i + 3) * 5]])
        np.testing.assert_array_equal(item[2], golden)
        i = i + 1
def test_generator_11():
    """
    Test map column order when len(input_columns) != len(output_columns).

    column_order omits col0, so the pipeline drops it and each item has
    exactly two columns: |out1|out2|.
    """
    logger.info("Test map column order when len(input_columns) != len(output_columns), "
                "and column_order drops some columns.")
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
    data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns="col1", output_columns=["out1", "out2"],
                      column_order=['out1', 'out2'], num_parallel_workers=2)
    # Expected column order is |out1|out2|
    i = 0
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        # len should be 2 because col0 is dropped (not included in column_order)
        assert len(item) == 2
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item[0], golden)
        golden = np.array([[i * 5, (i + 1) * 5], [(i + 2) * 5, (i + 3) * 5]])
        np.testing.assert_array_equal(item[1], golden)
        i = i + 1
def test_generator_12():
    """
    Test map column order when input_columns and output_columns are None.

    First pass: default map applies the op to the first column and keeps
    the |col0|col1| order. Second pass: column_order=["col1", "col0"]
    swaps the tuple positions.
    """
    logger.info("Test map column order when input_columns and output_columns are None.")
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
    data1 = data1.map(operations=(lambda x: (x * 5)), num_parallel_workers=2)
    # Expected column order is |col0|col1|
    i = 0
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        assert len(item) == 2
        golden = np.array([i * 5])
        np.testing.assert_array_equal(item[0], golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item[1], golden)
        i = i + 1
    data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
    data1 = data1.map(operations=(lambda x: (x * 5)), column_order=["col1", "col0"], num_parallel_workers=2)
    # Expected column order is |col1|col0| (swapped by column_order)
    i = 0
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        assert len(item) == 2
        golden = np.array([i * 5])
        np.testing.assert_array_equal(item[1], golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item[0], golden)
        i = i + 1
def test_generator_13():
    """
    Test map column order when input_columns is None.

    The mapped column is renamed to out0; expected layout is |out0|col1|.
    """
    logger.info("Test map column order when input_columns is None.")
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
    data1 = data1.map(operations=(lambda x: (x * 5)), output_columns=["out0"], num_parallel_workers=2)
    # Expected column order is |out0|col1|
    i = 0
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        assert len(item) == 2
        golden = np.array([i * 5])
        np.testing.assert_array_equal(item[0], golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item[1], golden)
        i = i + 1
    # NOTE(review): generator_mc(2048) is a generator instance; after the first
    # full pass it may be exhausted, in which case this second loop iterates
    # zero rows (and i is never reset) — confirm intended.
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        # len should be 2 because col0 is dropped (not included in column_order)
        assert len(item) == 2
        golden = np.array([i * 5])
        np.testing.assert_array_equal(item["out0"], golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item["col1"], golden)
        i = i + 1
def test_generator_14():
    """
    Test 1D Generator MP + CPP sampler

    A 256-row list source with a C++ SequentialSampler, 4 parallel
    workers, repeated twice; the counter wraps at each epoch boundary.
    """
    logger.info("Test 1D Generator MP : 0 - 63")
    source = [(np.array([x]),) for x in range(256)]
    ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler(), num_parallel_workers=4).repeat(2)
    i = 0
    for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([i])
        np.testing.assert_array_equal(data["data"], golden)
        i = i + 1
        # wrap at the end of each repeated pass over the 256-row source
        if i == 256:
            i = 0
  333. def test_generator_15():
  334. """
  335. Test 1D Generator MP + Python sampler
  336. """
  337. logger.info("Test 1D Generator MP : 0 - 63")
  338. sampler = [x for x in range(256)]
  339. source = [(np.array([x]),) for x in range(256)]
  340. ds1 = ds.GeneratorDataset(source, ["data"], sampler=sampler, num_parallel_workers=4).repeat(2)
  341. i = 0
  342. for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  343. golden = np.array([i])
  344. np.testing.assert_array_equal(data["data"], golden)
  345. i = i + 1
  346. if i == 256:
  347. i = 0
def test_generator_16():
    """
    Test multi column generator Mp + CPP sampler

    Two-column list source with a C++ SequentialSampler; col1 is always
    col0 + 1.
    """
    logger.info("Test multi column generator")
    source = [(np.array([x]), np.array([x + 1])) for x in range(256)]
    # apply dataset operations
    data1 = ds.GeneratorDataset(source, ["col0", "col1"], sampler=ds.SequentialSampler())
    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([i])
        np.testing.assert_array_equal(item["col0"], golden)
        golden = np.array([i + 1])
        np.testing.assert_array_equal(item["col1"], golden)
        i = i + 1
  363. def test_generator_17():
  364. """
  365. Test multi column generator Mp + Python sampler
  366. """
  367. logger.info("Test multi column generator")
  368. sampler = [x for x in range(256)]
  369. source = [(np.array([x]), np.array([x + 1])) for x in range(256)]
  370. # apply dataset operations
  371. data1 = ds.GeneratorDataset(source, ["col0", "col1"], sampler=sampler)
  372. i = 0
  373. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  374. golden = np.array([i])
  375. np.testing.assert_array_equal(item["col0"], golden)
  376. golden = np.array([i + 1])
  377. np.testing.assert_array_equal(item["col1"], golden)
  378. i = i + 1
  379. def test_generator_18():
  380. """
  381. Test multiprocessing flag (same as test 13 with python_multiprocessing=True flag)
  382. """
  383. logger.info("Test map column order when input_columns is None.")
  384. # apply dataset operations
  385. data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"], python_multiprocessing=True)
  386. data1 = data1.map(operations=(lambda x: (x * 5)), output_columns=["out0"], num_parallel_workers=2,
  387. python_multiprocessing=True)
  388. # Expected column order is |out0|col1|
  389. i = 0
  390. for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
  391. assert len(item) == 2
  392. golden = np.array([i * 5])
  393. np.testing.assert_array_equal(item[0], golden)
  394. golden = np.array([[i, i + 1], [i + 2, i + 3]])
  395. np.testing.assert_array_equal(item[1], golden)
  396. i = i + 1
  397. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  398. # len should be 2 because col0 is dropped (not included in column_order)
  399. assert len(item) == 2
  400. golden = np.array([i * 5])
  401. np.testing.assert_array_equal(item["out0"], golden)
def test_generator_19():
    """
    Test multiprocessing flag with 2 different large columns

    DatasetGeneratorLarge yields two 4000-element columns per row; this
    checks both survive the multiprocessing transfer intact.
    """
    logger.info("Test map column order when input_columns is None.")
    # apply dataset operations
    data1 = ds.GeneratorDataset(DatasetGeneratorLarge(), ["col0", "col1"], python_multiprocessing=True, shuffle=False)
    # Expected column order is |col0|col1|
    i = 0
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        assert len(item) == 2
        golden = np.array(range(4000)) + i
        np.testing.assert_array_equal(item[0], golden)
        golden = np.array(range(4000)) * 10
        np.testing.assert_array_equal(item[1], golden)
        i = i + 1
def test_generator_error_1():
    """A generator yielding object-dtype arrays (array of a set) must fail
    with a RuntimeError mentioning the invalid data type."""
    def generator_np():
        for i in range(64):
            # np.array of a set produces an object-dtype array, which is rejected
            yield (np.array([{i}]),)

    with pytest.raises(RuntimeError) as info:
        data1 = ds.GeneratorDataset(generator_np, ["data"])
        for _ in data1:
            pass
    assert "Invalid data type" in str(info.value)
  427. def test_generator_error_2():
  428. def generator_np():
  429. for i in range(64):
  430. yield ({i},)
  431. with pytest.raises(RuntimeError) as info:
  432. data1 = ds.GeneratorDataset(generator_np, ["data"])
  433. for _ in data1:
  434. pass
  435. print("========", str(info.value))
  436. assert "Generator should return a tuple of numpy arrays" in str(info.value)
def test_generator_error_3():
    """Mapping 1 input column to 2 output columns without column_order
    must raise a ValueError explaining column_order is required."""
    with pytest.raises(ValueError) as info:
        # apply dataset operations
        data1 = ds.GeneratorDataset(generator_mc(2048), ["label", "image"])
        data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns=["label"], output_columns=["out1", "out2"],
                          num_parallel_workers=2)
        for _ in data1:
            pass
    assert "When length of input_columns and output_columns are not equal, column_order must be specified." in \
        str(info.value)
def test_generator_error_4():
    """An op returning 2 tensors for a single declared output column must
    raise a RuntimeError about the tensor/column-name mismatch."""
    with pytest.raises(RuntimeError) as info:
        # apply dataset operations
        data1 = ds.GeneratorDataset(generator_mc(2048), ["label", "image"])
        data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns=["label"],
                          num_parallel_workers=2)
        for _ in data1:
            pass
    assert "Unexpected error. Result of a tensorOp doesn't match output column names" in str(info.value)
def test_generator_sequential_sampler():
    """SequentialSampler over a 64-row list source must yield rows in
    index order 0..63."""
    source = [(np.array([x]),) for x in range(64)]
    ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler())
    i = 0
    for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([i])
        np.testing.assert_array_equal(data["data"], golden)
        i = i + 1
def test_generator_random_sampler():
    """Smoke test: shuffle=True over a list source must iterate without
    error (order is random, so no values are asserted)."""
    source = [(np.array([x]),) for x in range(64)]
    ds1 = ds.GeneratorDataset(source, ["data"], shuffle=True)
    for _ in ds1.create_dict_iterator(num_epochs=1):  # each data is a dictionary
        pass
def test_generator_distributed_sampler():
    """With num_shards=8, shard sid must receive exactly the rows
    sid, sid+8, sid+16, ... from the 64-row source."""
    source = [(np.array([x]),) for x in range(64)]
    for sid in range(8):
        ds1 = ds.GeneratorDataset(source, ["data"], shuffle=False, num_shards=8, shard_id=sid)
        i = sid
        for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
            golden = np.array([i])
            np.testing.assert_array_equal(data["data"], golden)
            # stride equals the shard count
            i = i + 8
  478. def test_generator_num_samples():
  479. source = [(np.array([x]),) for x in range(64)]
  480. num_samples = 32
  481. ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler(num_samples=num_samples))
  482. ds2 = ds.GeneratorDataset(source, ["data"], sampler=[i for i in range(32)], num_samples=num_samples)
  483. ds3 = ds.GeneratorDataset(generator_1d, ["data"], num_samples=num_samples)
  484. count = 0
  485. for _ in ds1.create_dict_iterator(num_epochs=1):
  486. count = count + 1
  487. assert count == num_samples
  488. count = 0
  489. for _ in ds2.create_dict_iterator(num_epochs=1):
  490. count = count + 1
  491. assert count == num_samples
  492. count = 0
  493. for _ in ds3.create_dict_iterator(num_epochs=1):
  494. count = count + 1
  495. assert count == num_samples
  496. def test_generator_num_samples_underflow():
  497. source = [(np.array([x]),) for x in range(64)]
  498. num_samples = 256
  499. ds2 = ds.GeneratorDataset(source, ["data"], sampler=[i for i in range(64)], num_samples=num_samples)
  500. ds3 = ds.GeneratorDataset(generator_1d, ["data"], num_samples=num_samples)
  501. count = 0
  502. for _ in ds2.create_dict_iterator(num_epochs=1):
  503. count = count + 1
  504. assert count == 64
  505. count = 0
  506. for _ in ds3.create_dict_iterator(num_epochs=1):
  507. count = count + 1
  508. assert count == 64
def type_tester_with_type_check_2c_schema(t, c):
    """Like type_tester_with_type_check_2c, but declares the two column
    types through a ds.Schema object instead of column_types."""
    logger.info("Test with Type {}".format(t.__name__))
    schema = ds.Schema()
    schema.add_column("data0", c[0])
    schema.add_column("data1", c[1])
    # apply dataset operations
    data1 = ds.GeneratorDataset((lambda: generator_with_type_2c(t)), schema=schema)
    data1 = data1.batch(4)
    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
        np.testing.assert_array_equal(item["data0"], golden)
        i = i + 4
  522. def test_generator_schema():
  523. """
  524. Test 2 column Generator on different data type with type check with schema input
  525. """
  526. logger.info("Test 2 column Generator on all data types with type check")
  527. np_types = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32,
  528. np.float64]
  529. de_types = [mstype.int8, mstype.int16, mstype.int32, mstype.int64, mstype.uint8, mstype.uint16, mstype.uint32,
  530. mstype.uint64, mstype.float32, mstype.float64]
  531. for i, _ in enumerate(np_types):
  532. type_tester_with_type_check_2c_schema(np_types[i], [de_types[i], de_types[i]])
def test_generator_dataset_size_0():
    """
    Test GeneratorDataset get_dataset_size by iterator method.

    The size reported before iterating must equal the number of rows
    actually produced.
    """
    logger.info("Test 1D Generator : 0 - 63 get_dataset_size")
    data1 = ds.GeneratorDataset(generator_1d, ["data"])
    data_size = data1.get_dataset_size()
    num_rows = 0
    for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        num_rows = num_rows + 1
    assert data_size == num_rows
def test_generator_dataset_size_1():
    """
    Test GeneratorDataset get_dataset_size by __len__ method.

    The source defines __len__ (10 rows); get_dataset_size must agree
    with the iterated row count.
    """
    logger.info("Test DatasetGenerator get_dataset_size")
    dataset_generator = DatasetGenerator()
    data1 = ds.GeneratorDataset(dataset_generator, ["data"])
    data_size = data1.get_dataset_size()
    num_rows = 0
    for _ in data1.create_dict_iterator(num_epochs=1):
        num_rows = num_rows + 1
    assert data_size == num_rows
def test_generator_dataset_size_2():
    """
    Test GeneratorDataset + repeat get_dataset_size

    repeat(2) must double the reported size, matching the iterated count.
    """
    logger.info("Test 1D Generator + repeat get_dataset_size")
    data1 = ds.GeneratorDataset(generator_1d, ["data"])
    data1 = data1.repeat(2)
    data_size = data1.get_dataset_size()
    num_rows = 0
    for _ in data1.create_dict_iterator(num_epochs=1):
        num_rows = num_rows + 1
    assert data_size == num_rows
def test_generator_dataset_size_3():
    """
    Test GeneratorDataset + batch get_dataset_size

    batch(4) must report the batched size, matching the iterated count.
    """
    logger.info("Test 1D Generator + batch get_dataset_size")
    data1 = ds.GeneratorDataset(generator_1d, ["data"])
    data1 = data1.batch(4)
    data_size = data1.get_dataset_size()
    num_rows = 0
    for _ in data1.create_dict_iterator(num_epochs=1):
        num_rows += 1
    assert data_size == num_rows
def test_generator_dataset_size_4():
    """
    Test GeneratorDataset + num_shards

    With num_shards=3 the reported size must be the per-shard size and
    must match the iterated row count for shard 0.
    """
    logger.info("Test 1D Generator : 0 - 63 + num_shards get_dataset_size")
    dataset_generator = DatasetGenerator()
    data1 = ds.GeneratorDataset(dataset_generator, ["data"], num_shards=3, shard_id=0)
    data_size = data1.get_dataset_size()
    num_rows = 0
    for _ in data1.create_dict_iterator(num_epochs=1):  # each data is a dictionary
        num_rows = num_rows + 1
    assert data_size == num_rows
def test_generator_dataset_size_5():
    """
    Test get_dataset_size after create_dict_iterator

    Querying the size after a full iteration must still match the number
    of rows that were produced.
    """
    logger.info("Test get_dataset_size after create_dict_iterator")
    dataset_generator = DatasetGenerator()
    data1 = ds.GeneratorDataset(dataset_generator, ["data"], num_shards=3, shard_id=0)
    num_rows = 0
    for _ in data1.create_dict_iterator(num_epochs=1):  # each data is a dictionary
        num_rows = num_rows + 1
    data_size = data1.get_dataset_size()
    assert data_size == num_rows
def manual_test_generator_keyboard_interrupt():
    """
    Test keyboard_interrupt

    Manual-only test (note the name: not collected by pytest). The source
    blocks forever in __getitem__, so the operator must interrupt with
    Ctrl-C; this verifies the pipeline shuts down cleanly on interrupt.
    """
    logger.info("Test 1D Generator MP : 0 - 63")

    class MyDS():
        def __getitem__(self, item):
            # deliberately never returns — the test relies on a manual interrupt
            while True:
                pass

        def __len__(self):
            return 1024

    ds1 = ds.GeneratorDataset(MyDS(), ["data"], num_parallel_workers=4).repeat(2)
    for _ in ds1.create_dict_iterator(num_epochs=1):  # each data is a dictionary
        pass
def test_explicit_deepcopy():
    """
    Test explicit_deepcopy

    copy.deepcopy of a dataset must produce an independent dataset that
    yields identical rows.
    """
    logger.info("Test explicit_deepcopy")
    ds1 = ds.NumpySlicesDataset([1, 2], shuffle=False)
    ds2 = copy.deepcopy(ds1)
    for d1, d2 in zip(ds1, ds2):
        assert d1 == d2
if __name__ == "__main__":
    # Run every automated test in file order when executed as a script.
    # manual_test_generator_keyboard_interrupt is intentionally excluded:
    # it blocks forever and requires a manual Ctrl-C.
    test_generator_0()
    test_generator_1()
    test_generator_2()
    test_generator_3()
    test_generator_4()
    test_generator_5()
    test_generator_6()
    test_generator_7()
    test_generator_8()
    test_generator_9()
    test_generator_10()
    test_generator_11()
    test_generator_12()
    test_generator_13()
    test_generator_14()
    test_generator_15()
    test_generator_16()
    test_generator_17()
    test_generator_18()
    test_generator_19()
    test_generator_error_1()
    test_generator_error_2()
    test_generator_error_3()
    test_generator_error_4()
    test_generator_sequential_sampler()
    test_generator_distributed_sampler()
    test_generator_random_sampler()
    test_generator_num_samples()
    test_generator_num_samples_underflow()
    test_generator_schema()
    test_generator_dataset_size_0()
    test_generator_dataset_size_1()
    test_generator_dataset_size_2()
    test_generator_dataset_size_3()
    test_generator_dataset_size_4()
    test_generator_dataset_size_5()
    test_explicit_deepcopy()