You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

test_datasets_generator.py 75 kB

4 years ago
4 years ago
5 years ago
6 years ago
6 years ago
6 years ago
6 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066
  1. # Copyright 2019 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. import copy
  16. import numpy as np
  17. import pytest
  18. import mindspore
  19. import mindspore.common.dtype as mstype
  20. import mindspore.dataset as ds
  21. import mindspore.dataset.engine.iterators as it
  22. from mindspore import log as logger
  23. from mindspore import Tensor
  24. import mindspore.ops as ops
  25. # Generate 1d int numpy array from 0 - 63
  26. def generator_1d():
  27. for i in range(64):
  28. yield (np.array([i]),)
  29. class DatasetGenerator:
  30. def __init__(self):
  31. pass
  32. def __getitem__(self, item):
  33. return (np.array([item]),)
  34. def __len__(self):
  35. return 10
  36. class DatasetGeneratorLarge:
  37. def __init__(self):
  38. self.data = np.array(range(4000))
  39. def __getitem__(self, item):
  40. return (self.data + item, self.data *10)
  41. def __len__(self):
  42. return 10
  43. class DatasetGeneratorMixed:
  44. def __init__(self):
  45. pass
  46. def __getitem__(self, item):
  47. flatten = ops.Flatten()
  48. x = Tensor(np.ones(shape=[2, 3]), mindspore.float32)
  49. output = flatten(x)
  50. return (output.asnumpy(),)
  51. def __len__(self):
  52. return 10
  53. def test_generator_0():
  54. """
  55. Test 1D Generator
  56. """
  57. logger.info("Test 1D Generator : 0 - 63")
  58. # apply dataset operations
  59. data1 = ds.GeneratorDataset(generator_1d, ["data"])
  60. i = 0
  61. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  62. golden = np.array([i])
  63. np.testing.assert_array_equal(item["data"], golden)
  64. i = i + 1
  65. # Generate md int numpy array from [[0, 1], [2, 3]] to [[63, 64], [65, 66]]
  66. def generator_md():
  67. for i in range(64):
  68. yield (np.array([[i, i + 1], [i + 2, i + 3]]),)
  69. def test_generator_1():
  70. """
  71. Test MD Generator
  72. """
  73. logger.info("Test MD Generator : 0 - 63, with shape [2, 2]")
  74. # apply dataset operations
  75. data1 = ds.GeneratorDataset(generator_md, ["data"])
  76. i = 0
  77. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  78. golden = np.array([[i, i + 1], [i + 2, i + 3]])
  79. np.testing.assert_array_equal(item["data"], golden)
  80. i = i + 1
  81. # Generate two columns, the first column is from Generator1D, the second column is from GeneratorMD
  82. def generator_mc(maxid=64):
  83. for i in range(maxid):
  84. yield (np.array([i]), np.array([[i, i + 1], [i + 2, i + 3]]))
  85. def test_generator_2():
  86. """
  87. Test multi column generator
  88. """
  89. logger.info("Test multi column generator")
  90. # apply dataset operations
  91. data1 = ds.GeneratorDataset(generator_mc, ["col0", "col1"])
  92. i = 0
  93. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  94. golden = np.array([i])
  95. np.testing.assert_array_equal(item["col0"], golden)
  96. golden = np.array([[i, i + 1], [i + 2, i + 3]])
  97. np.testing.assert_array_equal(item["col1"], golden)
  98. i = i + 1
  99. def test_generator_3():
  100. """
  101. Test 1D Generator + repeat(4)
  102. """
  103. logger.info("Test 1D Generator : 0 - 63 + Repeat(4)")
  104. # apply dataset operations
  105. data1 = ds.GeneratorDataset(generator_1d, ["data"])
  106. data1 = data1.repeat(4)
  107. i = 0
  108. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  109. golden = np.array([i])
  110. np.testing.assert_array_equal(item["data"], golden)
  111. i = i + 1
  112. if i == 64:
  113. i = 0
  114. def test_generator_4():
  115. """
  116. Test fixed size 1D Generator + batch
  117. """
  118. logger.info("Test 1D Generator : 0 - 63 + batch(4)")
  119. # apply dataset operations
  120. data1 = ds.GeneratorDataset(generator_1d, ["data"])
  121. data1 = data1.batch(4)
  122. i = 0
  123. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  124. golden = np.array([[i], [i + 1], [i + 2], [i + 3]])
  125. np.testing.assert_array_equal(item["data"], golden)
  126. i = i + 4
  127. def generator_with_type(t):
  128. for i in range(64):
  129. yield (np.array([i], dtype=t),)
  130. def type_tester(t):
  131. logger.info("Test with Type {}".format(t.__name__))
  132. # apply dataset operations
  133. data1 = ds.GeneratorDataset((lambda: generator_with_type(t)), ["data"])
  134. data1 = data1.batch(4)
  135. i = 0
  136. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  137. golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
  138. np.testing.assert_array_equal(item["data"], golden)
  139. i = i + 4
  140. def test_generator_5():
  141. """
  142. Test 1D Generator on different data type
  143. """
  144. logger.info("Test 1D Generator on all data types")
  145. types = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32, np.float64]
  146. for t in types:
  147. type_tester(t)
  148. def type_tester_with_type_check(t, c):
  149. logger.info("Test with Type {}".format(t.__name__))
  150. # apply dataset operations
  151. data1 = ds.GeneratorDataset((lambda: generator_with_type(t)), ["data"], column_types=[c])
  152. data1 = data1.batch(4)
  153. i = 0
  154. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  155. golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
  156. np.testing.assert_array_equal(item["data"], golden)
  157. i = i + 4
  158. def test_generator_6():
  159. """
  160. Test 1D Generator on different data type with type check
  161. """
  162. logger.info("Test 1D Generator on all data types with type check")
  163. np_types = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32,
  164. np.float64]
  165. de_types = [mstype.int8, mstype.int16, mstype.int32, mstype.int64, mstype.uint8, mstype.uint16, mstype.uint32,
  166. mstype.uint64, mstype.float32, mstype.float64]
  167. for i, _ in enumerate(np_types):
  168. type_tester_with_type_check(np_types[i], de_types[i])
  169. def generator_with_type_2c(t):
  170. for i in range(64):
  171. yield (np.array([i], dtype=t), np.array([i], dtype=t))
  172. def type_tester_with_type_check_2c(t, c):
  173. logger.info("Test with Type {}".format(t.__name__))
  174. # apply dataset operations
  175. data1 = ds.GeneratorDataset((lambda: generator_with_type_2c(t)), ["data0", "data1"], column_types=c)
  176. data1 = data1.batch(4)
  177. i = 0
  178. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  179. golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
  180. np.testing.assert_array_equal(item["data0"], golden)
  181. i = i + 4
  182. def test_generator_7():
  183. """
  184. Test 2 column Generator on different data type with type check
  185. """
  186. logger.info("Test 2 column Generator on all data types with type check")
  187. np_types = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32,
  188. np.float64]
  189. de_types = [mstype.int8, mstype.int16, mstype.int32, mstype.int64, mstype.uint8, mstype.uint16, mstype.uint32,
  190. mstype.uint64, mstype.float32, mstype.float64]
  191. for i, _ in enumerate(np_types):
  192. type_tester_with_type_check_2c(np_types[i], [None, de_types[i]])
def test_generator_8():
    """
    Test multi column generator with few mapops
    """
    logger.info("Test multi column generator with mapops to check the order too")
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
    # col0 -> out0: triple each value.
    data1 = data1.map(operations=(lambda x: x * 3), input_columns="col0", output_columns="out0",
                      num_parallel_workers=2)
    # col1 -> (out1, out2): out1 is col1 * 7, out2 is an untouched copy of col1.
    data1 = data1.map(operations=(lambda x: (x * 7, x)), input_columns="col1", output_columns=["out1", "out2"],
                      num_parallel_workers=2, column_order=["out0", "out1", "out2"])
    # out2 -> out2: add one to the copied column in place.
    data1 = data1.map(operations=(lambda x: x + 1), input_columns="out2", output_columns="out2",
                      num_parallel_workers=2)
    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([i * 3])
        np.testing.assert_array_equal(item["out0"], golden)
        golden = np.array([[i * 7, (i + 1) * 7], [(i + 2) * 7, (i + 3) * 7]])
        np.testing.assert_array_equal(item["out1"], golden)
        golden = np.array([[i + 1, i + 2], [i + 3, i + 4]])
        np.testing.assert_array_equal(item["out2"], golden)
        i = i + 1
def test_generator_9():
    """
    Test map column order when len(input_columns) == len(output_columns).
    """
    logger.info("Test map column order when len(input_columns) == len(output_columns).")
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc(2048), ["image", "label"])
    data2 = ds.GeneratorDataset(generator_mc(2048), ["label", "image"])
    # Triple the "label" column in both datasets; the op positions differ because
    # "label" is column 1 in data1 but column 0 in data2.
    data1 = data1.map(operations=(lambda x: x * 3), input_columns="label",
                      num_parallel_workers=4)
    data2 = data2.map(operations=(lambda x: x * 3), input_columns="label",
                      num_parallel_workers=4)
    # Expected column order is not changed.
    # data1 = data[0] is "image" and data[1] is "label"
    # data2 = data[0] is "label" and data[1] is "image"
    i = 0
    # Iterating the dataset objects directly yields tuples of Tensors (hence .asnumpy()).
    for data1, data2 in zip(data1, data2):  # each data is a dictionary
        golden = np.array([i])
        np.testing.assert_array_equal(data1[0].asnumpy(), golden)
        golden = np.array([[i * 3, (i + 1) * 3], [(i + 2) * 3, (i + 3) * 3]])
        np.testing.assert_array_equal(data1[1].asnumpy(), golden)
        golden = np.array([i * 3])
        np.testing.assert_array_equal(data2[0].asnumpy(), golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(data2[1].asnumpy(), golden)
        i = i + 1
def test_generator_10():
    """
    Test map column order when len(input_columns) != len(output_columns).
    """
    logger.info("Test map column order when len(input_columns) != len(output_columns).")
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
    # col1 splits into two outputs: out1 is the untouched col1, out2 is col1 * 5.
    data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns="col1", output_columns=["out1", "out2"],
                      column_order=['col0', 'out1', 'out2'], num_parallel_workers=2)
    # Expected column order is |col0|out1|out2|
    i = 0
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        golden = np.array([i])
        np.testing.assert_array_equal(item[0], golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item[1], golden)
        golden = np.array([[i * 5, (i + 1) * 5], [(i + 2) * 5, (i + 3) * 5]])
        np.testing.assert_array_equal(item[2], golden)
        i = i + 1
def test_generator_11():
    """
    Test map column order when len(input_columns) != len(output_columns).
    """
    logger.info("Test map column order when len(input_columns) != len(output_columns), "
                "and column_order drops some columns.")
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
    # column_order omits col0, so it is dropped from the output rows entirely.
    data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns="col1", output_columns=["out1", "out2"],
                      column_order=['out1', 'out2'], num_parallel_workers=2)
    # Expected column order is |out1|out2|
    i = 0
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        # len should be 2 because col0 is dropped (not included in column_order)
        assert len(item) == 2
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item[0], golden)
        golden = np.array([[i * 5, (i + 1) * 5], [(i + 2) * 5, (i + 3) * 5]])
        np.testing.assert_array_equal(item[1], golden)
        i = i + 1
def test_generator_12():
    """
    Test map column order when input_columns and output_columns are None.
    """
    logger.info("Test map column order when input_columns and output_columns are None.")
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
    # With no input_columns the op is applied to col0 only (per the assertions
    # below, col1 passes through unchanged).
    data1 = data1.map(operations=(lambda x: (x * 5)), num_parallel_workers=2)
    # Expected column order is |col0|col1|
    i = 0
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        assert len(item) == 2
        golden = np.array([i * 5])
        np.testing.assert_array_equal(item[0], golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item[1], golden)
        i = i + 1
    data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
    data1 = data1.map(operations=(lambda x: (x * 5)), column_order=["col1", "col0"], num_parallel_workers=2)
    # Expected column order is |col1|col0| (swapped by column_order).
    i = 0
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        assert len(item) == 2
        golden = np.array([i * 5])
        np.testing.assert_array_equal(item[1], golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item[0], golden)
        i = i + 1
def test_generator_13():
    """
    Test map column order when input_columns is None.
    """
    logger.info("Test map column order when input_columns is None.")
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
    # col0 is renamed to out0 and multiplied by 5; col1 is untouched.
    data1 = data1.map(operations=(lambda x: (x * 5)), output_columns=["out0"], num_parallel_workers=2)
    # Expected column order is |out0|col1|
    i = 0
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        assert len(item) == 2
        golden = np.array([i * 5])
        np.testing.assert_array_equal(item[0], golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item[1], golden)
        i = i + 1
    # NOTE(review): i is not reset to 0 before this second pass. generator_mc(2048)
    # is a one-shot generator, so this second iteration presumably yields no rows
    # and the body never runs -- confirm this is the intended behavior.
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        # len should be 2 because col0 is dropped (not included in column_order)
        assert len(item) == 2
        golden = np.array([i * 5])
        np.testing.assert_array_equal(item["out0"], golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item["col1"], golden)
        i = i + 1
def test_generator_14():
    """
    Test 1D Generator MP + CPP sampler
    """
    logger.info("Test 1D Generator MP : 0 - 63")
    # Sometimes there are some ITERATORS left in ITERATORS_LIST when run all UTs together,
    # and cause core dump and blocking in this UT. Add cleanup() here to fix it.
    it._cleanup()  # pylint: disable=W0212
    # Reduce memory needed by reducing queue size
    prefetch_original = ds.config.get_prefetch_size()
    ds.config.set_prefetch_size(1)
    source = [(np.array([x]),) for x in range(256)]
    # Multiprocess read (4 workers) of a 256-row list source, repeated twice.
    ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler(),
                              num_parallel_workers=4, max_rowsize=1).repeat(2)
    i = 0
    for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([i])
        np.testing.assert_array_equal(data["data"], golden)
        i = i + 1
        # repeat(2) replays the 256-row sequence, so wrap the expected index.
        if i == 256:
            i = 0
    # Restore the global prefetch size so later tests are unaffected.
    ds.config.set_prefetch_size(prefetch_original)
def test_generator_15():
    """
    Test 1D Generator MP + Python sampler
    """
    logger.info("Test 1D Generator MP : 0 - 63")
    # Reduce memory needed by reducing queue size
    prefetch_original = ds.config.get_prefetch_size()
    ds.config.set_prefetch_size(1)
    # A plain list of indices acts as a user-defined (Python) sampler.
    sampler = [x for x in range(256)]
    source = [(np.array([x]),) for x in range(256)]
    ds1 = ds.GeneratorDataset(source, ["data"], sampler=sampler,
                              num_parallel_workers=4, max_rowsize=1).repeat(1)
    i = 0
    for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([i])
        np.testing.assert_array_equal(data["data"], golden)
        i = i + 1
        if i == 256:
            i = 0
    # Restore the global prefetch size so later tests are unaffected.
    ds.config.set_prefetch_size(prefetch_original)
  375. def test_generator_16():
  376. """
  377. Test multi column generator Mp + CPP sampler
  378. """
  379. logger.info("Test multi column generator")
  380. source = [(np.array([x]), np.array([x + 1])) for x in range(256)]
  381. # apply dataset operations
  382. data1 = ds.GeneratorDataset(source, ["col0", "col1"], sampler=ds.SequentialSampler())
  383. i = 0
  384. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  385. golden = np.array([i])
  386. np.testing.assert_array_equal(item["col0"], golden)
  387. golden = np.array([i + 1])
  388. np.testing.assert_array_equal(item["col1"], golden)
  389. i = i + 1
  390. def test_generator_17():
  391. """
  392. Test multi column generator Mp + Python sampler
  393. """
  394. logger.info("Test multi column generator")
  395. sampler = [x for x in range(256)]
  396. source = [(np.array([x]), np.array([x + 1])) for x in range(256)]
  397. # apply dataset operations
  398. data1 = ds.GeneratorDataset(source, ["col0", "col1"], sampler=sampler)
  399. i = 0
  400. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  401. golden = np.array([i])
  402. np.testing.assert_array_equal(item["col0"], golden)
  403. golden = np.array([i + 1])
  404. np.testing.assert_array_equal(item["col1"], golden)
  405. i = i + 1
def test_generator_18():
    """
    Test multiprocessing flag (same as test 13 with python_multiprocessing=True flag)
    """
    logger.info("Test map column order when input_columns is None.")
    # Reduce shm usage by disabling this optimization
    mem_original = ds.config.get_enable_shared_mem()
    ds.config.set_enable_shared_mem(False)
    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"], python_multiprocessing=True)
    data1 = data1.map(operations=(lambda x: (x * 5)), output_columns=["out0"], num_parallel_workers=2,
                      python_multiprocessing=True)
    # Expected column order is |out0|col1|
    i = 0
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        assert len(item) == 2
        golden = np.array([i * 5])
        np.testing.assert_array_equal(item[0], golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item[1], golden)
        i = i + 1
    # NOTE(review): i is neither reset nor incremented in this second loop, and only
    # out0 is checked. generator_mc(2048) is a one-shot generator, so this pass
    # presumably yields no rows and the body never runs -- confirm intent.
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        # len should be 2 because col0 is dropped (not included in column_order)
        assert len(item) == 2
        golden = np.array([i * 5])
        np.testing.assert_array_equal(item["out0"], golden)
    # Restore the shared-memory setting so later tests are unaffected.
    ds.config.set_enable_shared_mem(mem_original)
def test_generator_19():
    """
    Test multiprocessing flag with 2 different large columns
    """
    # NOTE(review): this log message was copied from test 18 and does not describe
    # this test; runtime strings are left unchanged here.
    logger.info("Test map column order when input_columns is None.")
    # apply dataset operations
    data1 = ds.GeneratorDataset(DatasetGeneratorLarge(), ["col0", "col1"], python_multiprocessing=True, shuffle=False)
    # Expected column order is |col0|col1|
    i = 0
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        assert len(item) == 2
        # col0 is the base 0..3999 vector shifted by the row index.
        golden = np.array(range(4000)) + i
        np.testing.assert_array_equal(item[0], golden)
        # col1 is the base vector scaled by 10 (independent of the row index).
        golden = np.array(range(4000)) * 10
        np.testing.assert_array_equal(item[1], golden)
        i = i + 1
  449. class RandomAccessDataset:
  450. def __init__(self):
  451. self.__data = np.random.sample((5, 1))
  452. def __getitem__(self, item):
  453. return self.__data[item]
  454. def __len__(self):
  455. return 5
  456. class RandomAccessDatasetWithoutLen:
  457. def __init__(self):
  458. self.__data = np.random.sample((5, 1))
  459. def __getitem__(self, item):
  460. return self.__data[item]
  461. class IterableDataset:
  462. def __init__(self):
  463. self.count = 0
  464. self.max = 10
  465. def __iter__(self):
  466. return self
  467. def __next__(self):
  468. if self.count >= self.max:
  469. raise StopIteration
  470. self.count += 1
  471. return (np.array(self.count),)
def test_generator_20():
    """
    Test mappable and unmappable dataset as source for GeneratorDataset.
    """
    logger.info("Test mappable and unmappable dataset as source for GeneratorDataset.")
    # Mappable dataset: size comes from __len__.
    data1 = ds.GeneratorDataset(RandomAccessDataset(), ["col0"])
    dataset_size1 = data1.get_dataset_size()
    assert dataset_size1 == 5
    # Mappable dataset without __len__: get_dataset_size should raise.
    data2 = ds.GeneratorDataset(RandomAccessDatasetWithoutLen(), ["col0"])
    # NOTE(review): if no RuntimeError is raised this try/except passes silently;
    # pytest.raises would make the expectation explicit -- confirm before changing.
    try:
        data2.get_dataset_size()
    except RuntimeError as e:
        assert "'__len__' method is required" in str(e)
    # Unmappable dataset: size comes from iterating the source.
    data3 = ds.GeneratorDataset(IterableDataset(), ["col0"])
    dataset_size3 = data3.get_dataset_size()
    assert dataset_size3 == 10
  491. def test_generator_error_1():
  492. def generator_np():
  493. for i in range(64):
  494. yield (np.array([{i}]),)
  495. with pytest.raises(RuntimeError) as info:
  496. data1 = ds.GeneratorDataset(generator_np, ["data"])
  497. for _ in data1:
  498. pass
  499. assert "Data type of 1th item of the input or its converted Numpy array is expected" in str(info.value)
  500. def test_generator_error_2():
  501. def generator_np():
  502. for i in range(64):
  503. yield ({i},)
  504. with pytest.raises(RuntimeError) as info:
  505. data1 = ds.GeneratorDataset(generator_np, ["data"])
  506. for _ in data1:
  507. pass
  508. assert "Data type of 1th item of the input or its converted Numpy array is expected" in str(info.value)
def test_generator_error_3():
    # Mapping one input column to two output columns without column_order must fail.
    with pytest.raises(ValueError) as info:
        # apply dataset operations
        data1 = ds.GeneratorDataset(generator_mc(2048), ["label", "image"])
        data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns=["label"], output_columns=["out1", "out2"],
                          num_parallel_workers=2)
        for _ in data1:
            pass
    assert "When length of input_columns and output_columns are not equal, column_order must be specified." in \
           str(info.value)
def test_generator_error_4():
    # Returning two columns from map while output_columns defaults to one must
    # fail when the pipeline is executed.
    with pytest.raises(RuntimeError) as info:
        # apply dataset operations
        data1 = ds.GeneratorDataset(generator_mc(2048), ["label", "image"])
        data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns=["label"],
                          num_parallel_workers=2)
        for _ in data1:
            pass
    assert "the number of columns returned in 'map' operations should match the number of 'output_columns'"\
        in str(info.value)
  529. def test_generator_sequential_sampler():
  530. source = [(np.array([x]),) for x in range(64)]
  531. ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler())
  532. i = 0
  533. for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  534. golden = np.array([i])
  535. np.testing.assert_array_equal(data["data"], golden)
  536. i = i + 1
  537. def test_generator_random_sampler():
  538. source = [(np.array([x]),) for x in range(64)]
  539. ds1 = ds.GeneratorDataset(source, ["data"], shuffle=True)
  540. for _ in ds1.create_dict_iterator(num_epochs=1): # each data is a dictionary
  541. pass
  542. def test_generator_distributed_sampler():
  543. source = [(np.array([x]),) for x in range(64)]
  544. for sid in range(8):
  545. ds1 = ds.GeneratorDataset(source, ["data"], shuffle=False, num_shards=8, shard_id=sid)
  546. i = sid
  547. for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  548. golden = np.array([i])
  549. np.testing.assert_array_equal(data["data"], golden)
  550. i = i + 8
  551. def test_generator_num_samples():
  552. source = [(np.array([x]),) for x in range(64)]
  553. num_samples = 32
  554. ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler(num_samples=num_samples))
  555. ds2 = ds.GeneratorDataset(source, ["data"], sampler=[i for i in range(32)], num_samples=num_samples)
  556. ds3 = ds.GeneratorDataset(generator_1d, ["data"], num_samples=num_samples)
  557. count = 0
  558. for _ in ds1.create_dict_iterator(num_epochs=1):
  559. count = count + 1
  560. assert count == num_samples
  561. count = 0
  562. for _ in ds2.create_dict_iterator(num_epochs=1):
  563. count = count + 1
  564. assert count == num_samples
  565. count = 0
  566. for _ in ds3.create_dict_iterator(num_epochs=1):
  567. count = count + 1
  568. assert count == num_samples
  569. def test_generator_num_samples_underflow():
  570. source = [(np.array([x]),) for x in range(64)]
  571. num_samples = 256
  572. ds2 = ds.GeneratorDataset(source, ["data"], sampler=[i for i in range(64)], num_samples=num_samples)
  573. ds3 = ds.GeneratorDataset(generator_1d, ["data"], num_samples=num_samples)
  574. count = 0
  575. for _ in ds2.create_dict_iterator(num_epochs=1):
  576. count = count + 1
  577. assert count == 64
  578. count = 0
  579. for _ in ds3.create_dict_iterator(num_epochs=1):
  580. count = count + 1
  581. assert count == 64
  582. def type_tester_with_type_check_2c_schema(t, c):
  583. logger.info("Test with Type {}".format(t.__name__))
  584. schema = ds.Schema()
  585. schema.add_column("data0", c[0])
  586. schema.add_column("data1", c[1])
  587. # apply dataset operations
  588. data1 = ds.GeneratorDataset((lambda: generator_with_type_2c(t)), schema=schema)
  589. data1 = data1.batch(4)
  590. i = 0
  591. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  592. golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
  593. np.testing.assert_array_equal(item["data0"], golden)
  594. i = i + 4
  595. def test_generator_schema():
  596. """
  597. Test 2 column Generator on different data type with type check with schema input
  598. """
  599. logger.info("Test 2 column Generator on all data types with type check")
  600. np_types = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32,
  601. np.float64]
  602. de_types = [mstype.int8, mstype.int16, mstype.int32, mstype.int64, mstype.uint8, mstype.uint16, mstype.uint32,
  603. mstype.uint64, mstype.float32, mstype.float64]
  604. for i, _ in enumerate(np_types):
  605. type_tester_with_type_check_2c_schema(np_types[i], [de_types[i], de_types[i]])
  606. def test_generator_dataset_size_0():
  607. """
  608. Test GeneratorDataset get_dataset_size by iterator method.
  609. """
  610. logger.info("Test 1D Generator : 0 - 63 get_dataset_size")
  611. data1 = ds.GeneratorDataset(generator_1d, ["data"])
  612. data_size = data1.get_dataset_size()
  613. num_rows = 0
  614. for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  615. num_rows = num_rows + 1
  616. assert data_size == num_rows
  617. def test_generator_dataset_size_1():
  618. """
  619. Test GeneratorDataset get_dataset_size by __len__ method.
  620. """
  621. logger.info("Test DatasetGenerator get_dataset_size")
  622. dataset_generator = DatasetGenerator()
  623. data1 = ds.GeneratorDataset(dataset_generator, ["data"])
  624. data_size = data1.get_dataset_size()
  625. num_rows = 0
  626. for _ in data1.create_dict_iterator(num_epochs=1):
  627. num_rows = num_rows + 1
  628. assert data_size == num_rows
  629. def test_generator_dataset_size_2():
  630. """
  631. Test GeneratorDataset + repeat get_dataset_size
  632. """
  633. logger.info("Test 1D Generator + repeat get_dataset_size")
  634. data1 = ds.GeneratorDataset(generator_1d, ["data"])
  635. data1 = data1.repeat(2)
  636. data_size = data1.get_dataset_size()
  637. num_rows = 0
  638. for _ in data1.create_dict_iterator(num_epochs=1):
  639. num_rows = num_rows + 1
  640. assert data_size == num_rows
  641. def test_generator_dataset_size_3():
  642. """
  643. Test GeneratorDataset + batch get_dataset_size
  644. """
  645. logger.info("Test 1D Generator + batch get_dataset_size")
  646. data1 = ds.GeneratorDataset(generator_1d, ["data"])
  647. data1 = data1.batch(4)
  648. data_size = data1.get_dataset_size()
  649. num_rows = 0
  650. for _ in data1.create_dict_iterator(num_epochs=1):
  651. num_rows += 1
  652. assert data_size == num_rows
  653. def test_generator_dataset_size_4():
  654. """
  655. Test GeneratorDataset + num_shards
  656. """
  657. logger.info("Test 1D Generator : 0 - 63 + num_shards get_dataset_size")
  658. dataset_generator = DatasetGenerator()
  659. data1 = ds.GeneratorDataset(dataset_generator, ["data"], num_shards=3, shard_id=0)
  660. data_size = data1.get_dataset_size()
  661. num_rows = 0
  662. for _ in data1.create_dict_iterator(num_epochs=1): # each data is a dictionary
  663. num_rows = num_rows + 1
  664. assert data_size == num_rows
  665. def test_generator_dataset_size_5():
  666. """
  667. Test get_dataset_size after create_dict_iterator
  668. """
  669. logger.info("Test get_dataset_size after create_dict_iterator")
  670. dataset_generator = DatasetGenerator()
  671. data1 = ds.GeneratorDataset(dataset_generator, ["data"], num_shards=3, shard_id=0)
  672. num_rows = 0
  673. for _ in data1.create_dict_iterator(num_epochs=1): # each data is a dictionary
  674. num_rows = num_rows + 1
  675. data_size = data1.get_dataset_size()
  676. assert data_size == num_rows
  677. def manual_test_generator_keyboard_interrupt():
  678. """
  679. Test keyboard_interrupt
  680. """
  681. logger.info("Test 1D Generator MP : 0 - 63")
  682. class MyDS():
  683. def __getitem__(self, item):
  684. while True:
  685. pass
  686. def __len__(self):
  687. return 1024
  688. ds1 = ds.GeneratorDataset(MyDS(), ["data"], num_parallel_workers=4).repeat(2)
  689. for _ in ds1.create_dict_iterator(num_epochs=1): # each data is a dictionary
  690. pass
  691. def test_explicit_deepcopy():
  692. """
  693. Test explicit_deepcopy
  694. """
  695. logger.info("Test explicit_deepcopy")
  696. ds1 = ds.NumpySlicesDataset([1, 2], shuffle=False)
  697. ds2 = copy.deepcopy(ds1)
  698. for d1, d2 in zip(ds1, ds2):
  699. assert d1 == d2
  700. def test_func_generator_dataset_005():
  701. """
  702. generator: class __getitem__
  703. """
  704. result = [np.random.randn(242, 242, 242), np.random.randn(42, 24, 442)]
  705. class MyData():
  706. def __init__(self, input_para):
  707. self.data = input_para
  708. def __getitem__(self, item):
  709. return (Tensor(self.data[0]), Tensor(self.data[1]))
  710. def __len__(self):
  711. return 2
  712. column_names = ["col1", "col2"]
  713. dataset = ds.GeneratorDataset(MyData(result), column_names)
  714. i = 0
  715. for data in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
  716. assert "col1" in str(data.keys())
  717. assert (data["col1"] == result[0]).all()
  718. assert (data["col2"] == result[1]).all()
  719. i += 1
  720. assert i == 2
  721. def test_func_generator_dataset_with_zip_source():
  722. """
  723. Feature: verify the source is zip
  724. Description: the source input is zip
  725. Expectation: success
  726. """
  727. def synthetic_data(w, b, num_examples):
  728. """生成 y = Xw + b + 噪声。"""
  729. X = np.random.normal(0, 1, (num_examples, len(w)))
  730. y = np.matmul(X, w) + b
  731. y += np.random.normal(0, 0.01, y.shape)
  732. return X.astype(np.float32), y.reshape((-1, 1)).astype(np.float32)
  733. true_w = np.array([2, -3.4])
  734. true_b = 4.2
  735. features, labels = synthetic_data(true_w, true_b, 10)
  736. def load_array(data_arrays, column_names, batch_size, is_train=True):
  737. """构造一个MindSpore数据迭代器。"""
  738. dataset = ds.GeneratorDataset(data_arrays, column_names, shuffle=is_train)
  739. dataset = dataset.batch(batch_size)
  740. return dataset
  741. batch_size = 2
  742. dataset = load_array(zip(features, labels), ['features', 'labels'], batch_size)
  743. count = 0
  744. epochs = 10
  745. dataset_iter = dataset.create_dict_iterator(num_epochs=epochs, output_numpy=True)
  746. for _ in range(epochs):
  747. for _ in dataset_iter:
  748. count += 1
  749. assert count == 50
  750. def test_generator_mixed_operator():
  751. """
  752. Feature: Test adding computing operator into user defined dataset
  753. Description: will decrease num_parallel_worker into 1
  754. Expectation: success
  755. """
  756. logger.info("Test adding computing operator into user defined dataset.")
  757. # create dataset
  758. data1 = ds.GeneratorDataset(DatasetGeneratorMixed(), ["col0"], shuffle=False, python_multiprocessing=False)
  759. assert data1.num_parallel_workers == 1
  760. for _ in data1.create_tuple_iterator(num_epochs=1):
  761. pass
  762. def test_generator_single_input_0():
  763. """
  764. Feature: Test single int input
  765. Description: input int
  766. Expectation: success
  767. """
  768. def generator_int():
  769. for i in range(64):
  770. yield i
  771. class RandomAccessDatasetInner:
  772. def __init__(self):
  773. self.__data = [i for i in range(64)]
  774. def __getitem__(self, item):
  775. return self.__data[item]
  776. def __len__(self):
  777. return 64
  778. class SequentialAccessDataset:
  779. def __init__(self):
  780. self.__data = [i for i in range(64)]
  781. self.__index = 0
  782. def __next__(self):
  783. if self.__index >= 64:
  784. raise StopIteration
  785. item = self.__data[self.__index]
  786. self.__index += 1
  787. return item
  788. def __iter__(self):
  789. self.__index = 0
  790. return self
  791. def __len__(self):
  792. return 64
  793. def assert_generator_single_input_0(data):
  794. # apply dataset operations
  795. data1 = ds.GeneratorDataset(data, ["data"], shuffle=False)
  796. i = 0
  797. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  798. golden = np.array(i)
  799. np.testing.assert_equal(item["data"], golden)
  800. i = i + 1
  801. assert_generator_single_input_0(generator_int)
  802. assert_generator_single_input_0(RandomAccessDatasetInner())
  803. assert_generator_single_input_0(SequentialAccessDataset())
  804. def test_generator_single_input_1():
  805. """
  806. Feature: Test single float input
  807. Description: input float
  808. Expectation: success
  809. """
  810. def generator_float():
  811. for i in range(64):
  812. yield i * 0.1
  813. class RandomAccessDatasetInner:
  814. def __init__(self):
  815. self.__data = [i for i in range(64)]
  816. def __getitem__(self, item):
  817. return self.__data[item] * 0.1
  818. def __len__(self):
  819. return 64
  820. class SequentialAccessDataset:
  821. def __init__(self):
  822. self.__data = [i for i in range(64)]
  823. self.__index = 0
  824. def __next__(self):
  825. if self.__index >= 64:
  826. raise StopIteration
  827. item = self.__data[self.__index] * 0.1
  828. self.__index += 1
  829. return item
  830. def __iter__(self):
  831. self.__index = 0
  832. return self
  833. def __len__(self):
  834. return 64
  835. def assert_generator_single_input_1(data):
  836. # apply dataset operations
  837. data1 = ds.GeneratorDataset(data, ["data"], shuffle=False)
  838. i = 0.0
  839. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  840. golden = np.array(i)
  841. np.testing.assert_almost_equal(item["data"], golden)
  842. i = i + 0.1
  843. assert_generator_single_input_1(generator_float)
  844. assert_generator_single_input_1(RandomAccessDatasetInner())
  845. assert_generator_single_input_1(SequentialAccessDataset())
  846. def test_generator_single_input_2():
  847. """
  848. Feature: Test single str input
  849. Description: input str
  850. Expectation: success
  851. """
  852. def generator_str():
  853. for i in range(64):
  854. yield chr(ord('a') + i)
  855. class RandomAccessDatasetInner:
  856. def __init__(self):
  857. self.__data = [i for i in range(64)]
  858. def __getitem__(self, item):
  859. return chr(ord('a') + self.__data[item])
  860. def __len__(self):
  861. return 64
  862. class SequentialAccessDataset:
  863. def __init__(self):
  864. self.__data = [i for i in range(64)]
  865. self.__index = 0
  866. def __next__(self):
  867. if self.__index >= 64:
  868. raise StopIteration
  869. item = chr(ord('a') + self.__data[self.__index])
  870. self.__index += 1
  871. return item
  872. def __iter__(self):
  873. self.__index = 0
  874. return self
  875. def __len__(self):
  876. return 64
  877. def assert_generator_single_input_2(data):
  878. # apply dataset operations
  879. data1 = ds.GeneratorDataset(data, ["data"], shuffle=False)
  880. i = 0
  881. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  882. s = chr(ord('a') + i)
  883. golden = np.array(bytes(s, encoding='utf8'))
  884. np.testing.assert_array_equal(item["data"], golden)
  885. i = i + 1
  886. assert_generator_single_input_2(generator_str)
  887. assert_generator_single_input_2(RandomAccessDatasetInner())
  888. assert_generator_single_input_2(SequentialAccessDataset())
  889. def test_generator_single_input_3():
  890. """
  891. Feature: Test single bytes input
  892. Description: input bytes
  893. Expectation: success
  894. """
  895. def generator_bytes():
  896. for i in range(64):
  897. yield bytes('a' * i, encoding='UTF-8')
  898. class RandomAccessDatasetInner:
  899. def __init__(self):
  900. self.__data = [bytes('a' * i, encoding='UTF-8') for i in range(64)]
  901. def __getitem__(self, item):
  902. return self.__data[item]
  903. def __len__(self):
  904. return 64
  905. class SequentialAccessDataset:
  906. def __init__(self):
  907. self.__data = [bytes('a' * i, encoding='UTF-8') for i in range(64)]
  908. self.__index = 0
  909. def __next__(self):
  910. if self.__index >= 64:
  911. raise StopIteration
  912. item = self.__data[self.__index]
  913. self.__index += 1
  914. return item
  915. def __iter__(self):
  916. self.__index = 0
  917. return self
  918. def __len__(self):
  919. return 64
  920. def assert_generator_single_input_3(data):
  921. # apply dataset operations
  922. data1 = ds.GeneratorDataset(data, ["data"], shuffle=False)
  923. i = 0
  924. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  925. b = bytes('a' * i, encoding='UTF-8')
  926. golden = np.frombuffer(b, dtype=np.uint8)
  927. np.testing.assert_array_equal(item["data"], golden)
  928. i = i + 1
  929. assert_generator_single_input_3(generator_bytes)
  930. assert_generator_single_input_3(RandomAccessDatasetInner())
  931. assert_generator_single_input_3(SequentialAccessDataset())
  932. def test_generator_single_input_4():
  933. """
  934. Feature: Test single Tensor input
  935. Description: input Tensor
  936. Expectation: success
  937. """
  938. def generator_tensor():
  939. for i in range(64):
  940. yield Tensor(i)
  941. class RandomAccessDatasetInner:
  942. def __init__(self):
  943. self.__data = [Tensor(i) for i in range(64)]
  944. def __getitem__(self, item):
  945. return self.__data[item]
  946. def __len__(self):
  947. return 64
  948. class SequentialAccessDataset:
  949. def __init__(self):
  950. self.__data = [Tensor(i) for i in range(64)]
  951. self.__index = 0
  952. def __next__(self):
  953. if self.__index >= 64:
  954. raise StopIteration
  955. item = self.__data[self.__index]
  956. self.__index += 1
  957. return item
  958. def __iter__(self):
  959. self.__index = 0
  960. return self
  961. def __len__(self):
  962. return 64
  963. def assert_generator_single_input_4(data):
  964. # apply dataset operations
  965. data1 = ds.GeneratorDataset(data, ["data"], shuffle=False)
  966. i = 0
  967. for item in data1.create_dict_iterator(num_epochs=1): # each data is a dictionary
  968. golden = Tensor(i)
  969. assert item["data"] == golden
  970. i = i + 1
  971. assert_generator_single_input_4(generator_tensor)
  972. assert_generator_single_input_4(RandomAccessDatasetInner())
  973. assert_generator_single_input_4(SequentialAccessDataset())
  974. def test_generator_single_input_5():
  975. """
  976. Feature: Test single np.array input
  977. Description: input np.array
  978. Expectation: success
  979. """
  980. def generator_np():
  981. for i in range(64):
  982. yield np.ones(i)
  983. class RandomAccessDatasetInner:
  984. def __init__(self):
  985. self.__data = [np.ones(i) for i in range(64)]
  986. def __getitem__(self, item):
  987. return self.__data[item]
  988. def __len__(self):
  989. return 64
  990. class SequentialAccessDataset:
  991. def __init__(self):
  992. self.__data = [np.ones(i) for i in range(64)]
  993. self.__index = 0
  994. def __next__(self):
  995. if self.__index >= 64:
  996. raise StopIteration
  997. item = self.__data[self.__index]
  998. self.__index += 1
  999. return item
  1000. def __iter__(self):
  1001. self.__index = 0
  1002. return self
  1003. def __len__(self):
  1004. return 64
  1005. def assert_generator_single_input_5(data):
  1006. # apply dataset operations
  1007. data1 = ds.GeneratorDataset(data, ["data"], shuffle=False)
  1008. i = 0
  1009. for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  1010. golden = np.ones(i)
  1011. np.testing.assert_array_equal(item["data"], golden)
  1012. i = i + 1
  1013. assert_generator_single_input_5(generator_np)
  1014. assert_generator_single_input_5(RandomAccessDatasetInner())
  1015. assert_generator_single_input_5(SequentialAccessDataset())
  1016. def test_generator_single_input_6():
  1017. """
  1018. Feature: Test single np.array input whose dtype is object
  1019. Description: input np.array
  1020. Expectation: throw exception
  1021. """
  1022. def generator_nested_np():
  1023. for i in range(64):
  1024. yield np.array([[i, i + 1], [i, i + 1, i + 2]])
  1025. class RandomAccessDatasetInner:
  1026. def __init__(self):
  1027. self.__data = [np.array([[i, i + 1], [i, i + 1, i + 2]]) for i in range(64)]
  1028. def __getitem__(self, item):
  1029. return self.__data[item]
  1030. def __len__(self):
  1031. return 64
  1032. class SequentialAccessDatasetInner:
  1033. def __init__(self):
  1034. self.__data = [np.array([[i, i + 1], [i, i + 1, i + 2]]) for i in range(64)]
  1035. self.__index = 0
  1036. def __next__(self):
  1037. if self.__index >= 64:
  1038. raise StopIteration
  1039. item = self.__data[self.__index]
  1040. self.__index += 1
  1041. return item
  1042. def __iter__(self):
  1043. self.__index = 0
  1044. return self
  1045. def __len__(self):
  1046. return 64
  1047. def assert_generator_single_input_6(data):
  1048. # apply dataset operations
  1049. with pytest.raises(RuntimeError) as info:
  1050. data1 = ds.GeneratorDataset(data, ["data"], shuffle=False)
  1051. for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary
  1052. pass
  1053. assert " Data type of the input or its converted Numpy array is expected" in str(info.value)
  1054. assert_generator_single_input_6(generator_nested_np)
  1055. assert_generator_single_input_6(RandomAccessDatasetInner())
  1056. assert_generator_single_input_6(SequentialAccessDatasetInner())
  1057. def test_generator_with_single_numpy():
  1058. """
  1059. Feature: Test GeneratorDataset with single numpy and multi columns when use __getitem__
  1060. Description: single numpy, tuple numpy with single columns and multi columns
  1061. Expectation: success
  1062. """
  1063. class get_dataset_generator:
  1064. def __init__(self, value):
  1065. np.random.seed(58)
  1066. self.__value = value
  1067. def __getitem__(self, index):
  1068. return self.__value
  1069. def __len__(self):
  1070. return 20
  1071. def test_generator_one_column(value):
  1072. number = np.random.randint(1, 4)
  1073. process_flag = False
  1074. if number > 1 and number % 2 == 0:
  1075. process_flag = True
  1076. dataset_generator = get_dataset_generator(value)
  1077. dataset = ds.GeneratorDataset(dataset_generator, ["data"], shuffle=False, num_parallel_workers=number,
  1078. python_multiprocessing=process_flag)
  1079. count = 0
  1080. for data in dataset.create_dict_iterator(output_numpy=True):
  1081. assert (data["data"] == value).all()
  1082. count += 1
  1083. assert count == 20
  1084. # test user define one column
  1085. numpy_1 = np.array(1)
  1086. numpy_2 = np.array([1])
  1087. numpy_3 = np.array([1, 2])
  1088. numpy_4 = np.array([1, 2, 3])
  1089. numpy_5 = np.array([[1], [2]])
  1090. numpy_6 = np.array([[1, 2], [2, 3]])
  1091. numpy_7 = np.array([[1, 2, 3], [2, 3, 4]])
  1092. numpy_8 = np.array([[1], [2], [3]])
  1093. numpy_9 = np.array([[1, 2], [2, 3], [3, 4]])
  1094. numpy_10 = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]])
  1095. test_generator_one_column(numpy_1)
  1096. test_generator_one_column(numpy_2)
  1097. test_generator_one_column(numpy_3)
  1098. test_generator_one_column(numpy_4)
  1099. test_generator_one_column(numpy_5)
  1100. test_generator_one_column(numpy_6)
  1101. test_generator_one_column(numpy_7)
  1102. test_generator_one_column(numpy_8)
  1103. test_generator_one_column(numpy_9)
  1104. test_generator_one_column(numpy_10)
  1105. tuple_1 = (numpy_7,)
  1106. dataset_generator = get_dataset_generator(tuple_1)
  1107. dataset = ds.GeneratorDataset(dataset_generator, ["data"], shuffle=False)
  1108. count = 0
  1109. for data in dataset.create_dict_iterator(output_numpy=True):
  1110. assert (data["data"] == tuple_1[0]).all()
  1111. count += 1
  1112. assert count == 20
  1113. tuple_2 = (numpy_6, numpy_7)
  1114. with pytest.raises(RuntimeError) as info:
  1115. dataset_generator = get_dataset_generator(tuple_2)
  1116. dataset = ds.GeneratorDataset(dataset_generator, ["data"], shuffle=False)
  1117. for data in dataset.create_dict_iterator(output_numpy=True):
  1118. print(data["data"])
  1119. assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
  1120. "column_names," in str(info.value)
  1121. assert "the size of column_names is:1 and number of returned NumPy array is:2" in str(info.value)
  1122. tuple_4 = (numpy_4, numpy_5, numpy_6, numpy_7)
  1123. with pytest.raises(RuntimeError) as info:
  1124. dataset_generator = get_dataset_generator(tuple_4)
  1125. dataset = ds.GeneratorDataset(dataset_generator, ["data"], shuffle=False)
  1126. for data in dataset.create_dict_iterator(output_numpy=True):
  1127. print(data["data"])
  1128. assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
  1129. "column_names," in str(info.value)
  1130. assert "the size of column_names is:1 and number of returned NumPy array is:4" in str(info.value)
  1131. # test user define two column
  1132. def test_generator_two_column(value):
  1133. number = np.random.randint(1, 4)
  1134. process_flag = False
  1135. if number > 1 and number % 2 == 0:
  1136. process_flag = True
  1137. dataset_generator = get_dataset_generator(value)
  1138. dataset = ds.GeneratorDataset(dataset_generator, ["data", "label"], shuffle=False, num_parallel_workers=number,
  1139. python_multiprocessing=process_flag)
  1140. count = 0
  1141. with pytest.raises(RuntimeError) as info:
  1142. for data in dataset.create_dict_iterator(output_numpy=True):
  1143. print(data)
  1144. count += 1
  1145. assert count == 20
  1146. assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
  1147. "column_names," in str(info.value)
  1148. assert "the size of column_names is:2 and number of returned NumPy array is:1" in str(info.value)
  1149. numpy_1 = np.array(1)
  1150. numpy_2 = np.array([1])
  1151. numpy_3 = np.array([1, 2])
  1152. numpy_4 = np.array([1, 2, 3])
  1153. numpy_5 = np.array([[1], [2]])
  1154. numpy_6 = np.array([[1, 2], [2, 3]])
  1155. numpy_7 = np.array([[1, 2, 3], [2, 3, 4]])
  1156. numpy_8 = np.array([[1], [2], [3]])
  1157. numpy_9 = np.array([[1, 2], [2, 3], [3, 4]])
  1158. numpy_10 = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]])
  1159. test_generator_two_column(numpy_1)
  1160. test_generator_two_column(numpy_2)
  1161. test_generator_two_column(numpy_3)
  1162. test_generator_two_column(numpy_4)
  1163. test_generator_two_column(numpy_5)
  1164. test_generator_two_column(numpy_6)
  1165. test_generator_two_column(numpy_7)
  1166. test_generator_two_column(numpy_8)
  1167. test_generator_two_column(numpy_9)
  1168. test_generator_two_column(numpy_10)
  1169. tuple_1 = (numpy_7,)
  1170. test_generator_two_column(tuple_1)
  1171. tuple_2 = (numpy_2, numpy_3)
  1172. dataset_generator = get_dataset_generator(tuple_2)
  1173. dataset = ds.GeneratorDataset(dataset_generator, ["data", "label"], shuffle=False)
  1174. count = 0
  1175. for data in dataset.create_dict_iterator(output_numpy=True):
  1176. assert (data["data"] == numpy_2).all()
  1177. assert (data["label"] == numpy_3).all()
  1178. count += 1
  1179. assert count == 20
  1180. tuple_3 = (numpy_4, numpy_5, numpy_6)
  1181. with pytest.raises(RuntimeError) as info:
  1182. dataset_generator = get_dataset_generator(tuple_3)
  1183. dataset = ds.GeneratorDataset(dataset_generator, ["data", "label"], shuffle=False)
  1184. for data in dataset.create_dict_iterator(output_numpy=True):
  1185. print(data["data"])
  1186. assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
  1187. "column_names," in str(info.value)
  1188. assert "the size of column_names is:2 and number of returned NumPy array is:3" in str(info.value)
  1189. # test user define three column
  1190. def test_generator_three_column(value):
  1191. number = np.random.randint(1, 4)
  1192. process_flag = False
  1193. if number > 1 and number % 2 == 0:
  1194. process_flag = True
  1195. dataset_generator = get_dataset_generator(value)
  1196. dataset = ds.GeneratorDataset(dataset_generator, ["data", "label", "label2"], shuffle=False,
  1197. num_parallel_workers=number, python_multiprocessing=process_flag)
  1198. count = 0
  1199. with pytest.raises(RuntimeError) as info:
  1200. for data in dataset.create_dict_iterator(output_numpy=True):
  1201. print(data)
  1202. count += 1
  1203. assert count == 20
  1204. assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
  1205. "column_names," in str(info.value)
  1206. assert "the size of column_names is:3 and number of returned NumPy array is:1" in str(info.value)
  1207. numpy_1 = np.array(1)
  1208. numpy_2 = np.array([1])
  1209. numpy_3 = np.array([1, 2])
  1210. numpy_4 = np.array([1, 2, 3])
  1211. numpy_5 = np.array([[1], [2]])
  1212. numpy_6 = np.array([[1, 2], [2, 3]])
  1213. numpy_7 = np.array([[1, 2, 3], [2, 3, 4]])
  1214. numpy_8 = np.array([[1], [2], [3]])
  1215. numpy_9 = np.array([[1, 2], [2, 3], [3, 4]])
  1216. numpy_10 = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]])
  1217. test_generator_three_column(numpy_1)
  1218. test_generator_three_column(numpy_2)
  1219. test_generator_three_column(numpy_3)
  1220. test_generator_three_column(numpy_4)
  1221. test_generator_three_column(numpy_5)
  1222. test_generator_three_column(numpy_6)
  1223. test_generator_three_column(numpy_7)
  1224. test_generator_three_column(numpy_8)
  1225. test_generator_three_column(numpy_9)
  1226. test_generator_three_column(numpy_10)
  1227. tuple_1 = (numpy_7,)
  1228. test_generator_three_column(tuple_1)
  1229. tuple_2 = (numpy_2, numpy_3)
  1230. with pytest.raises(RuntimeError) as info:
  1231. dataset_generator = get_dataset_generator(tuple_2)
  1232. dataset = ds.GeneratorDataset(dataset_generator, ["data", "label", "label2"], shuffle=False)
  1233. for data in dataset.create_dict_iterator(output_numpy=True):
  1234. print(data["data"])
  1235. assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
  1236. "column_names," in str(info.value)
  1237. assert "the size of column_names is:3 and number of returned NumPy array is:2" in str(info.value)
  1238. tuple_3 = (numpy_4, numpy_5, numpy_6)
  1239. dataset_generator = get_dataset_generator(tuple_3)
  1240. dataset = ds.GeneratorDataset(dataset_generator, ["data", "label", "label2"], shuffle=False)
  1241. count = 0
  1242. for data in dataset.create_dict_iterator(output_numpy=True):
  1243. assert (data["data"] == numpy_4).all()
  1244. assert (data["label"] == numpy_5).all()
  1245. assert (data["label2"] == numpy_6).all()
  1246. count += 1
  1247. assert count == 20
  1248. def test_generator_with_single_numpy_with_next():
  1249. """
  1250. Feature: Test GeneratorDataset with single numpy and multi columns when use __next__
  1251. Description: single numpy, tuple numpy with single columns and multi columns
  1252. Expectation: success
  1253. """
  1254. class get_dataset_generator:
  1255. def __init__(self, value):
  1256. np.random.seed(58)
  1257. self.__value = value
  1258. self.__index = 0
  1259. def __next__(self):
  1260. if self.__index >= 20:
  1261. raise StopIteration
  1262. self.__index += 1
  1263. return self.__value
  1264. def __iter__(self):
  1265. self.__index = 0
  1266. return self
  1267. def __len__(self):
  1268. return 20
  1269. def test_generator_one_column(value):
  1270. number = np.random.randint(1, 4)
  1271. process_flag = False
  1272. if number > 1 and number % 2 == 0:
  1273. process_flag = True
  1274. dataset_generator = get_dataset_generator(value)
  1275. dataset = ds.GeneratorDataset(dataset_generator, ["data"], shuffle=False, num_parallel_workers=number,
  1276. python_multiprocessing=process_flag)
  1277. count = 0
  1278. for data in dataset.create_dict_iterator(output_numpy=True):
  1279. assert (data["data"] == value).all()
  1280. count += 1
  1281. assert count == 20
# test user define one column
# Fixture arrays covering a 0-d scalar plus 1-D and 2-D shapes of varying size.
numpy_1 = np.array(1)
numpy_2 = np.array([1])
numpy_3 = np.array([1, 2])
numpy_4 = np.array([1, 2, 3])
numpy_5 = np.array([[1], [2]])
numpy_6 = np.array([[1, 2], [2, 3]])
numpy_7 = np.array([[1, 2, 3], [2, 3, 4]])
numpy_8 = np.array([[1], [2], [3]])
numpy_9 = np.array([[1, 2], [2, 3], [3, 4]])
numpy_10 = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]])
# Any bare ndarray fills the single "data" column and round-trips intact.
test_generator_one_column(numpy_1)
test_generator_one_column(numpy_2)
test_generator_one_column(numpy_3)
test_generator_one_column(numpy_4)
test_generator_one_column(numpy_5)
test_generator_one_column(numpy_6)
test_generator_one_column(numpy_7)
test_generator_one_column(numpy_8)
test_generator_one_column(numpy_9)
test_generator_one_column(numpy_10)
# A 1-tuple is unpacked into the single declared column and must also succeed.
tuple_1 = (numpy_7,)
dataset_generator = get_dataset_generator(tuple_1)
dataset = ds.GeneratorDataset(dataset_generator, ["data"], shuffle=False)
count = 0
for data in dataset.create_dict_iterator(output_numpy=True):
    assert (data["data"] == tuple_1[0]).all()
    count += 1
assert count == 20
# Two returned arrays against one declared column: iteration must raise.
tuple_2 = (numpy_6, numpy_7)
with pytest.raises(RuntimeError) as info:
    dataset_generator = get_dataset_generator(tuple_2)
    dataset = ds.GeneratorDataset(dataset_generator, ["data"], shuffle=False)
    for data in dataset.create_dict_iterator(output_numpy=True):
        print(data["data"])
assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
       "column_names," in str(info.value)
assert "the size of column_names is:1 and number of returned NumPy array is:2" in str(info.value)
# Same mismatch, this time with a 0-d and a 1-D array.
tuple_3 = (numpy_1, numpy_2)
with pytest.raises(RuntimeError) as info:
    dataset_generator = get_dataset_generator(tuple_3)
    dataset = ds.GeneratorDataset(dataset_generator, ["data"], shuffle=False)
    for data in dataset.create_dict_iterator(output_numpy=True):
        print(data["data"])
assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
       "column_names," in str(info.value)
assert "the size of column_names is:1 and number of returned NumPy array is:2" in str(info.value)
# Four returned arrays against one declared column: also a RuntimeError.
tuple_4 = (numpy_4, numpy_5, numpy_6, numpy_7)
with pytest.raises(RuntimeError) as info:
    dataset_generator = get_dataset_generator(tuple_4)
    dataset = ds.GeneratorDataset(dataset_generator, ["data"], shuffle=False)
    for data in dataset.create_dict_iterator(output_numpy=True):
        print(data["data"])
assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
       "column_names," in str(info.value)
assert "the size of column_names is:1 and number of returned NumPy array is:4" in str(info.value)
# test user define two column
  1339. def test_generator_two_column(value):
  1340. number = np.random.randint(1, 4)
  1341. process_flag = False
  1342. if number > 1 and number % 2 == 0:
  1343. process_flag = True
  1344. dataset_generator = get_dataset_generator(value)
  1345. dataset = ds.GeneratorDataset(dataset_generator, ["data", "label"], shuffle=False, num_parallel_workers=number,
  1346. python_multiprocessing=process_flag)
  1347. count = 0
  1348. with pytest.raises(RuntimeError) as info:
  1349. for data in dataset.create_dict_iterator(output_numpy=True):
  1350. print(data)
  1351. count += 1
  1352. assert count == 20
  1353. assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
  1354. "column_names," in str(info.value)
  1355. assert "the size of column_names is:2 and number of returned NumPy array is:1" in str(info.value)
# Fixture arrays (same shapes as the one-column section, re-created here).
numpy_1 = np.array(1)
numpy_2 = np.array([1])
numpy_3 = np.array([1, 2])
numpy_4 = np.array([1, 2, 3])
numpy_5 = np.array([[1], [2]])
numpy_6 = np.array([[1, 2], [2, 3]])
numpy_7 = np.array([[1, 2, 3], [2, 3, 4]])
numpy_8 = np.array([[1], [2], [3]])
numpy_9 = np.array([[1, 2], [2, 3], [3, 4]])
numpy_10 = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]])
# One returned array vs two declared columns: every call expects the
# size-mismatch RuntimeError asserted inside test_generator_two_column.
test_generator_two_column(numpy_1)
test_generator_two_column(numpy_2)
test_generator_two_column(numpy_3)
test_generator_two_column(numpy_4)
test_generator_two_column(numpy_5)
test_generator_two_column(numpy_6)
test_generator_two_column(numpy_7)
test_generator_two_column(numpy_8)
test_generator_two_column(numpy_9)
test_generator_two_column(numpy_10)
# A 1-tuple unpacks to a single item, so it fails the same way.
tuple_1 = (numpy_7,)
test_generator_two_column(tuple_1)
# A 2-tuple matches the two declared columns and must iterate cleanly.
tuple_2 = (numpy_2, numpy_3)
dataset_generator = get_dataset_generator(tuple_2)
dataset = ds.GeneratorDataset(dataset_generator, ["data", "label"], shuffle=False)
count = 0
for data in dataset.create_dict_iterator(output_numpy=True):
    assert (data["data"] == numpy_2).all()
    assert (data["label"] == numpy_3).all()
    count += 1
assert count == 20
# Three returned arrays against two declared columns: must raise.
tuple_3 = (numpy_4, numpy_5, numpy_6)
with pytest.raises(RuntimeError) as info:
    dataset_generator = get_dataset_generator(tuple_3)
    dataset = ds.GeneratorDataset(dataset_generator, ["data", "label"], shuffle=False)
    for data in dataset.create_dict_iterator(output_numpy=True):
        print(data["data"])
assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
       "column_names," in str(info.value)
assert "the size of column_names is:2 and number of returned NumPy array is:3" in str(info.value)
# test user define three column
  1397. def test_generator_three_column(value):
  1398. number = np.random.randint(1, 4)
  1399. process_flag = False
  1400. if number > 1 and number % 2 == 0:
  1401. process_flag = True
  1402. dataset_generator = get_dataset_generator(value)
  1403. dataset = ds.GeneratorDataset(dataset_generator, ["data", "label", "label2"], shuffle=False,
  1404. num_parallel_workers=number, python_multiprocessing=process_flag)
  1405. count = 0
  1406. with pytest.raises(RuntimeError) as info:
  1407. for data in dataset.create_dict_iterator(output_numpy=True):
  1408. print(data)
  1409. count += 1
  1410. assert count == 20
  1411. assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
  1412. "column_names," in str(info.value)
  1413. assert "the size of column_names is:3 and number of returned NumPy array is:1" in str(info.value)
# Fixture arrays (same shapes as the earlier sections, re-created here).
numpy_1 = np.array(1)
numpy_2 = np.array([1])
numpy_3 = np.array([1, 2])
numpy_4 = np.array([1, 2, 3])
numpy_5 = np.array([[1], [2]])
numpy_6 = np.array([[1, 2], [2, 3]])
numpy_7 = np.array([[1, 2, 3], [2, 3, 4]])
numpy_8 = np.array([[1], [2], [3]])
numpy_9 = np.array([[1, 2], [2, 3], [3, 4]])
numpy_10 = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]])
# One returned array vs three declared columns: every call expects the
# size-mismatch RuntimeError asserted inside test_generator_three_column.
test_generator_three_column(numpy_1)
test_generator_three_column(numpy_2)
test_generator_three_column(numpy_3)
test_generator_three_column(numpy_4)
test_generator_three_column(numpy_5)
test_generator_three_column(numpy_6)
test_generator_three_column(numpy_7)
test_generator_three_column(numpy_8)
test_generator_three_column(numpy_9)
test_generator_three_column(numpy_10)
# A 1-tuple unpacks to a single item, so it fails the same way.
tuple_1 = (numpy_7,)
test_generator_three_column(tuple_1)
# Two returned arrays against three declared columns: must raise.
tuple_2 = (numpy_2, numpy_3)
with pytest.raises(RuntimeError) as info:
    dataset_generator = get_dataset_generator(tuple_2)
    dataset = ds.GeneratorDataset(dataset_generator, ["data", "label", "label2"], shuffle=False)
    for data in dataset.create_dict_iterator(output_numpy=True):
        print(data["data"])
assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
       "column_names," in str(info.value)
assert "the size of column_names is:3 and number of returned NumPy array is:2" in str(info.value)
# A 3-tuple matches the three declared columns and must iterate cleanly.
tuple_3 = (numpy_4, numpy_5, numpy_6)
dataset_generator = get_dataset_generator(tuple_3)
dataset = ds.GeneratorDataset(dataset_generator, ["data", "label", "label2"], shuffle=False)
count = 0
for data in dataset.create_dict_iterator(output_numpy=True):
    assert (data["data"] == numpy_4).all()
    assert (data["label"] == numpy_5).all()
    assert (data["label2"] == numpy_6).all()
    count += 1
assert count == 20
  1455. def test_generator_with_single_numpy_with_yield():
  1456. """
  1457. Feature: Test GeneratorDataset with single numpy and multi columns when use yield
  1458. Description: single numpy, tuple numpy with single columns and multi columns
  1459. Expectation: success
  1460. """
  1461. def get_dataset_generator(value):
  1462. for _ in range(20):
  1463. yield value
  1464. def test_generator_one_column(value):
  1465. number = np.random.randint(1, 4)
  1466. process_flag = False
  1467. if number > 1 and number % 2 == 0:
  1468. process_flag = True
  1469. dataset_generator = get_dataset_generator(value)
  1470. dataset = ds.GeneratorDataset(dataset_generator, ["data"], shuffle=False, num_parallel_workers=number,
  1471. python_multiprocessing=process_flag)
  1472. count = 0
  1473. for data in dataset.create_dict_iterator(output_numpy=True):
  1474. assert (data["data"] == value).all()
  1475. count += 1
  1476. assert count == 20
  1477. # test user define one column
  1478. numpy_1 = np.array(1)
  1479. numpy_2 = np.array([1])
  1480. numpy_3 = np.array([1, 2])
  1481. numpy_4 = np.array([1, 2, 3])
  1482. numpy_5 = np.array([[1], [2]])
  1483. numpy_6 = np.array([[1, 2], [2, 3]])
  1484. numpy_7 = np.array([[1, 2, 3], [2, 3, 4]])
  1485. numpy_8 = np.array([[1], [2], [3]])
  1486. numpy_9 = np.array([[1, 2], [2, 3], [3, 4]])
  1487. numpy_10 = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]])
  1488. test_generator_one_column(numpy_1)
  1489. test_generator_one_column(numpy_2)
  1490. test_generator_one_column(numpy_3)
  1491. test_generator_one_column(numpy_4)
  1492. test_generator_one_column(numpy_5)
  1493. test_generator_one_column(numpy_6)
  1494. test_generator_one_column(numpy_7)
  1495. test_generator_one_column(numpy_8)
  1496. test_generator_one_column(numpy_9)
  1497. test_generator_one_column(numpy_10)
  1498. tuple_1 = (numpy_7,)
  1499. dataset_generator = get_dataset_generator(tuple_1)
  1500. dataset = ds.GeneratorDataset(dataset_generator, ["data"], shuffle=False)
  1501. count = 0
  1502. for data in dataset.create_dict_iterator(output_numpy=True):
  1503. assert (data["data"] == tuple_1[0]).all()
  1504. count += 1
  1505. assert count == 20
  1506. tuple_2 = (numpy_6, numpy_7)
  1507. with pytest.raises(RuntimeError) as info:
  1508. dataset_generator = get_dataset_generator(tuple_2)
  1509. dataset = ds.GeneratorDataset(dataset_generator, ["data"], shuffle=False)
  1510. for data in dataset.create_dict_iterator(output_numpy=True):
  1511. print(data["data"])
  1512. assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
  1513. "column_names," in str(info.value)
  1514. assert "the size of column_names is:1 and number of returned NumPy array is:2" in str(info.value)
  1515. tuple_3 = (numpy_1, numpy_2)
  1516. with pytest.raises(RuntimeError) as info:
  1517. dataset_generator = get_dataset_generator(tuple_3)
  1518. dataset = ds.GeneratorDataset(dataset_generator, ["data"], shuffle=False)
  1519. for data in dataset.create_dict_iterator(output_numpy=True):
  1520. print(data["data"])
  1521. assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
  1522. "column_names," in str(info.value)
  1523. assert "the size of column_names is:1 and number of returned NumPy array is:2" in str(info.value)
  1524. tuple_4 = (numpy_4, numpy_5, numpy_6, numpy_7)
  1525. with pytest.raises(RuntimeError) as info:
  1526. dataset_generator = get_dataset_generator(tuple_4)
  1527. dataset = ds.GeneratorDataset(dataset_generator, ["data"], shuffle=False)
  1528. for data in dataset.create_dict_iterator(output_numpy=True):
  1529. print(data["data"])
  1530. assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
  1531. "column_names," in str(info.value)
  1532. assert "the size of column_names is:1 and number of returned NumPy array is:4" in str(info.value)
  1533. # test user define two column
  1534. def test_generator_two_column(value):
  1535. number = np.random.randint(1, 4)
  1536. process_flag = False
  1537. if number > 1 and number % 2 == 0:
  1538. process_flag = True
  1539. dataset_generator = get_dataset_generator(value)
  1540. dataset = ds.GeneratorDataset(dataset_generator, ["data", "label"], shuffle=False, num_parallel_workers=number,
  1541. python_multiprocessing=process_flag)
  1542. count = 0
  1543. with pytest.raises(RuntimeError) as info:
  1544. for data in dataset.create_dict_iterator(output_numpy=True):
  1545. print(data)
  1546. count += 1
  1547. assert count == 20
  1548. assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
  1549. "column_names," in str(info.value)
  1550. assert "the size of column_names is:2 and number of returned NumPy array is:1" in str(info.value)
  1551. numpy_1 = np.array(1)
  1552. numpy_2 = np.array([1])
  1553. numpy_3 = np.array([1, 2])
  1554. numpy_4 = np.array([1, 2, 3])
  1555. numpy_5 = np.array([[1], [2]])
  1556. numpy_6 = np.array([[1, 2], [2, 3]])
  1557. numpy_7 = np.array([[1, 2, 3], [2, 3, 4]])
  1558. numpy_8 = np.array([[1], [2], [3]])
  1559. numpy_9 = np.array([[1, 2], [2, 3], [3, 4]])
  1560. numpy_10 = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]])
  1561. test_generator_two_column(numpy_1)
  1562. test_generator_two_column(numpy_2)
  1563. test_generator_two_column(numpy_3)
  1564. test_generator_two_column(numpy_4)
  1565. test_generator_two_column(numpy_5)
  1566. test_generator_two_column(numpy_6)
  1567. test_generator_two_column(numpy_7)
  1568. test_generator_two_column(numpy_8)
  1569. test_generator_two_column(numpy_9)
  1570. test_generator_two_column(numpy_10)
  1571. tuple_1 = (numpy_7,)
  1572. test_generator_two_column(tuple_1)
  1573. tuple_2 = (numpy_2, numpy_3)
  1574. dataset_generator = get_dataset_generator(tuple_2)
  1575. dataset = ds.GeneratorDataset(dataset_generator, ["data", "label"], shuffle=False)
  1576. count = 0
  1577. for data in dataset.create_dict_iterator(output_numpy=True):
  1578. assert (data["data"] == numpy_2).all()
  1579. assert (data["label"] == numpy_3).all()
  1580. count += 1
  1581. assert count == 20
  1582. tuple_3 = (numpy_4, numpy_5, numpy_6)
  1583. with pytest.raises(RuntimeError) as info:
  1584. dataset_generator = get_dataset_generator(tuple_3)
  1585. dataset = ds.GeneratorDataset(dataset_generator, ["data", "label"], shuffle=False)
  1586. for data in dataset.create_dict_iterator(output_numpy=True):
  1587. print(data["data"])
  1588. assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
  1589. "column_names," in str(info.value)
  1590. assert "the size of column_names is:2 and number of returned NumPy array is:3" in str(info.value)
  1591. # test user define three column
  1592. def test_generator_three_column(value):
  1593. number = np.random.randint(1, 4)
  1594. process_flag = False
  1595. if number > 1 and number % 2 == 0:
  1596. process_flag = True
  1597. dataset_generator = get_dataset_generator(value)
  1598. dataset = ds.GeneratorDataset(dataset_generator, ["data", "label", "label2"], shuffle=False,
  1599. num_parallel_workers=number, python_multiprocessing=process_flag)
  1600. count = 0
  1601. with pytest.raises(RuntimeError) as info:
  1602. for data in dataset.create_dict_iterator(output_numpy=True):
  1603. print(data)
  1604. count += 1
  1605. assert count == 20
  1606. assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
  1607. "column_names," in str(info.value)
  1608. assert "the size of column_names is:3 and number of returned NumPy array is:1" in str(info.value)
  1609. numpy_1 = np.array(1)
  1610. numpy_2 = np.array([1])
  1611. numpy_3 = np.array([1, 2])
  1612. numpy_4 = np.array([1, 2, 3])
  1613. numpy_5 = np.array([[1], [2]])
  1614. numpy_6 = np.array([[1, 2], [2, 3]])
  1615. numpy_7 = np.array([[1, 2, 3], [2, 3, 4]])
  1616. numpy_8 = np.array([[1], [2], [3]])
  1617. numpy_9 = np.array([[1, 2], [2, 3], [3, 4]])
  1618. numpy_10 = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]])
  1619. test_generator_three_column(numpy_1)
  1620. test_generator_three_column(numpy_2)
  1621. test_generator_three_column(numpy_3)
  1622. test_generator_three_column(numpy_4)
  1623. test_generator_three_column(numpy_5)
  1624. test_generator_three_column(numpy_6)
  1625. test_generator_three_column(numpy_7)
  1626. test_generator_three_column(numpy_8)
  1627. test_generator_three_column(numpy_9)
  1628. test_generator_three_column(numpy_10)
  1629. tuple_1 = (numpy_7,)
  1630. test_generator_three_column(tuple_1)
  1631. tuple_2 = (numpy_2, numpy_3)
  1632. with pytest.raises(RuntimeError) as info:
  1633. dataset_generator = get_dataset_generator(tuple_2)
  1634. dataset = ds.GeneratorDataset(dataset_generator, ["data", "label", "label2"], shuffle=False)
  1635. for data in dataset.create_dict_iterator(output_numpy=True):
  1636. print(data["data"])
  1637. assert "the 'source' of 'GeneratorDataset' should return same number of NumPy arrays as specified in " \
  1638. "column_names," in str(info.value)
  1639. assert "the size of column_names is:3 and number of returned NumPy array is:2" in str(info.value)
  1640. tuple_3 = (numpy_4, numpy_5, numpy_6)
  1641. dataset_generator = get_dataset_generator(tuple_3)
  1642. dataset = ds.GeneratorDataset(dataset_generator, ["data", "label", "label2"], shuffle=False)
  1643. count = 0
  1644. for data in dataset.create_dict_iterator(output_numpy=True):
  1645. assert (data["data"] == numpy_4).all()
  1646. assert (data["label"] == numpy_5).all()
  1647. assert (data["label2"] == numpy_6).all()
  1648. count += 1
  1649. assert count == 20
  1650. if __name__ == "__main__":
  1651. test_generator_0()
  1652. test_generator_1()
  1653. test_generator_2()
  1654. test_generator_3()
  1655. test_generator_4()
  1656. test_generator_5()
  1657. test_generator_6()
  1658. test_generator_7()
  1659. test_generator_8()
  1660. test_generator_9()
  1661. test_generator_10()
  1662. test_generator_11()
  1663. test_generator_12()
  1664. test_generator_13()
  1665. test_generator_14()
  1666. test_generator_15()
  1667. test_generator_16()
  1668. test_generator_17()
  1669. test_generator_18()
  1670. test_generator_19()
  1671. test_generator_error_1()
  1672. test_generator_error_2()
  1673. test_generator_error_3()
  1674. test_generator_error_4()
  1675. test_generator_sequential_sampler()
  1676. test_generator_distributed_sampler()
  1677. test_generator_random_sampler()
  1678. test_generator_num_samples()
  1679. test_generator_num_samples_underflow()
  1680. test_generator_schema()
  1681. test_generator_dataset_size_0()
  1682. test_generator_dataset_size_1()
  1683. test_generator_dataset_size_2()
  1684. test_generator_dataset_size_3()
  1685. test_generator_dataset_size_4()
  1686. test_generator_dataset_size_5()
  1687. test_explicit_deepcopy()
  1688. test_func_generator_dataset_005()
  1689. test_func_generator_dataset_with_zip_source()
  1690. test_generator_mixed_operator()
  1691. test_generator_single_input_0()
  1692. test_generator_single_input_1()
  1693. test_generator_single_input_2()
  1694. test_generator_single_input_3()
  1695. test_generator_single_input_4()
  1696. test_generator_single_input_5()
  1697. test_generator_single_input_6()
  1698. test_generator_with_single_numpy()
  1699. test_generator_with_single_numpy_with_next()
  1700. test_generator_with_single_numpy_with_yield()