You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they may include dashes ('-') and can be up to 35 characters long.

test_profiling.py 17 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. """
  16. Testing profiling support in DE
  17. """
  18. import json
  19. import os
  20. import numpy as np
  21. import mindspore.common.dtype as mstype
  22. import mindspore.dataset as ds
  23. import mindspore.dataset.transforms.c_transforms as C
  24. import mindspore.dataset.vision.c_transforms as vision
# Input data for the TFRecord-based tests
FILES = ["../data/dataset/testTFTestAllTypes/test.data"]
DATASET_ROOT = "../data/dataset/testTFTestAllTypes/"
SCHEMA_FILE = "../data/dataset/testTFTestAllTypes/datasetSchema.json"
# Profiling output files; the "_1" suffix comes from DEVICE_ID=1 (see set_profiling_env_var)
PIPELINE_FILE = "./pipeline_profiling_1.json"
CPU_UTIL_FILE = "./minddata_cpu_utilization_1.json"
DATASET_ITERATOR_FILE = "./dataset_iterator_profiling_1.txt"
  31. def set_profiling_env_var():
  32. """
  33. Set the MindData Profiling environment variables
  34. """
  35. os.environ['PROFILING_MODE'] = 'true'
  36. os.environ['MINDDATA_PROFILING_DIR'] = '.'
  37. os.environ['DEVICE_ID'] = '1'
  38. def delete_profiling_files():
  39. """
  40. Delete the MindData profiling files generated from the test.
  41. Also disable the MindData Profiling environment variables.
  42. """
  43. # Delete MindData profiling files
  44. os.remove(PIPELINE_FILE)
  45. os.remove(CPU_UTIL_FILE)
  46. os.remove(DATASET_ITERATOR_FILE)
  47. # Disable MindData Profiling environment variables
  48. del os.environ['PROFILING_MODE']
  49. del os.environ['MINDDATA_PROFILING_DIR']
  50. del os.environ['DEVICE_ID']
  51. def confirm_cpuutil(num_pipeline_ops):
  52. """
  53. Confirm CPU utilization JSON file when <num_pipeline_ops> are in the pipeline
  54. """
  55. with open(CPU_UTIL_FILE) as file1:
  56. data = json.load(file1)
  57. op_info = data["op_info"]
  58. # Confirm <num_pipeline_ops>+1 ops in CPU util file (including op_id=-1 for monitor thread)
  59. assert len(op_info) == num_pipeline_ops + 1
  60. def test_profiling_simple_pipeline():
  61. """
  62. Generator -> Shuffle -> Batch
  63. """
  64. set_profiling_env_var()
  65. source = [(np.array([x]),) for x in range(1024)]
  66. data1 = ds.GeneratorDataset(source, ["data"])
  67. data1 = data1.shuffle(64)
  68. data1 = data1.batch(32)
  69. # try output shape type and dataset size and make sure no profiling file is generated
  70. assert data1.output_shapes() == [[32, 1]]
  71. assert [str(tp) for tp in data1.output_types()] == ["int64"]
  72. assert data1.get_dataset_size() == 32
  73. # Confirm profiling files do not (yet) exist
  74. assert os.path.exists(PIPELINE_FILE) is False
  75. assert os.path.exists(CPU_UTIL_FILE) is False
  76. assert os.path.exists(DATASET_ITERATOR_FILE) is False
  77. try:
  78. for _ in data1:
  79. pass
  80. # Confirm profiling files now exist
  81. assert os.path.exists(PIPELINE_FILE) is True
  82. assert os.path.exists(CPU_UTIL_FILE) is True
  83. assert os.path.exists(DATASET_ITERATOR_FILE) is True
  84. except Exception as error:
  85. delete_profiling_files()
  86. raise error
  87. else:
  88. delete_profiling_files()
  89. def test_profiling_complex_pipeline():
  90. """
  91. Generator -> Map ->
  92. -> Zip
  93. TFReader -> Shuffle ->
  94. """
  95. set_profiling_env_var()
  96. source = [(np.array([x]),) for x in range(1024)]
  97. data1 = ds.GeneratorDataset(source, ["gen"])
  98. data1 = data1.map(operations=[(lambda x: x + 1)], input_columns=["gen"])
  99. pattern = DATASET_ROOT + "/test.data"
  100. data2 = ds.TFRecordDataset(pattern, SCHEMA_FILE, shuffle=ds.Shuffle.FILES)
  101. data2 = data2.shuffle(4)
  102. data3 = ds.zip((data1, data2))
  103. try:
  104. for _ in data3:
  105. pass
  106. with open(PIPELINE_FILE) as f:
  107. data = json.load(f)
  108. op_info = data["op_info"]
  109. assert len(op_info) == 5
  110. for i in range(5):
  111. if op_info[i]["op_type"] != "ZipOp":
  112. assert "size" in op_info[i]["metrics"]["output_queue"]
  113. assert "length" in op_info[i]["metrics"]["output_queue"]
  114. assert "throughput" in op_info[i]["metrics"]["output_queue"]
  115. else:
  116. # Note: Zip is an inline op and hence does not have metrics information
  117. assert op_info[i]["metrics"] is None
  118. # Confirm CPU util JSON file content, when 5 ops are in the pipeline JSON file
  119. confirm_cpuutil(5)
  120. except Exception as error:
  121. delete_profiling_files()
  122. raise error
  123. else:
  124. delete_profiling_files()
  125. def test_profiling_inline_ops_pipeline1():
  126. """
  127. Test pipeline with inline ops: Concat and EpochCtrl
  128. Generator ->
  129. Concat -> EpochCtrl
  130. Generator ->
  131. """
  132. set_profiling_env_var()
  133. # In source1 dataset: Number of rows is 3; its values are 0, 1, 2
  134. def source1():
  135. for i in range(3):
  136. yield (np.array([i]),)
  137. # In source2 dataset: Number of rows is 7; its values are 3, 4, 5 ... 9
  138. def source2():
  139. for i in range(3, 10):
  140. yield (np.array([i]),)
  141. data1 = ds.GeneratorDataset(source1, ["col1"])
  142. data2 = ds.GeneratorDataset(source2, ["col1"])
  143. data3 = data1.concat(data2)
  144. try:
  145. num_iter = 0
  146. # Note: Do not explicitly set num_epochs argument in create_tuple_iterator() call
  147. # Here i refers to index, d refers to data element
  148. for i, d in enumerate(data3.create_tuple_iterator(output_numpy=True)):
  149. num_iter += 1
  150. t = d
  151. assert i == t[0][0]
  152. assert num_iter == 10
  153. # Confirm pipeline is created with EpochCtrl op
  154. with open(PIPELINE_FILE) as f:
  155. data = json.load(f)
  156. op_info = data["op_info"]
  157. assert len(op_info) == 4
  158. for i in range(4):
  159. # Note: The following ops are inline ops: Concat, EpochCtrl
  160. if op_info[i]["op_type"] in ("ConcatOp", "EpochCtrlOp"):
  161. # Confirm these inline ops do not have metrics information
  162. assert op_info[i]["metrics"] is None
  163. else:
  164. assert "size" in op_info[i]["metrics"]["output_queue"]
  165. assert "length" in op_info[i]["metrics"]["output_queue"]
  166. assert "throughput" in op_info[i]["metrics"]["output_queue"]
  167. # Confirm CPU util JSON file content, when 4 ops are in the pipeline JSON file
  168. confirm_cpuutil(4)
  169. except Exception as error:
  170. delete_profiling_files()
  171. raise error
  172. else:
  173. delete_profiling_files()
  174. def test_profiling_inline_ops_pipeline2():
  175. """
  176. Test pipeline with many inline ops
  177. Generator -> Rename -> Skip -> Repeat -> Take
  178. """
  179. set_profiling_env_var()
  180. # In source1 dataset: Number of rows is 10; its values are 0, 1, 2, 3, 4, 5 ... 9
  181. def source1():
  182. for i in range(10):
  183. yield (np.array([i]),)
  184. data1 = ds.GeneratorDataset(source1, ["col1"])
  185. data1 = data1.rename(input_columns=["col1"], output_columns=["newcol1"])
  186. data1 = data1.skip(2)
  187. data1 = data1.repeat(2)
  188. data1 = data1.take(12)
  189. try:
  190. for _ in data1:
  191. pass
  192. with open(PIPELINE_FILE) as f:
  193. data = json.load(f)
  194. op_info = data["op_info"]
  195. assert len(op_info) == 5
  196. for i in range(5):
  197. # Check for these inline ops
  198. if op_info[i]["op_type"] in ("RenameOp", "RepeatOp", "SkipOp", "TakeOp"):
  199. # Confirm these inline ops do not have metrics information
  200. assert op_info[i]["metrics"] is None
  201. else:
  202. assert "size" in op_info[i]["metrics"]["output_queue"]
  203. assert "length" in op_info[i]["metrics"]["output_queue"]
  204. assert "throughput" in op_info[i]["metrics"]["output_queue"]
  205. # Confirm CPU util JSON file content, when 5 ops are in the pipeline JSON file
  206. confirm_cpuutil(5)
  207. except Exception as error:
  208. delete_profiling_files()
  209. raise error
  210. else:
  211. delete_profiling_files()
  212. def test_profiling_sampling_interval():
  213. """
  214. Test non-default monitor sampling interval
  215. """
  216. set_profiling_env_var()
  217. interval_origin = ds.config.get_monitor_sampling_interval()
  218. ds.config.set_monitor_sampling_interval(30)
  219. interval = ds.config.get_monitor_sampling_interval()
  220. assert interval == 30
  221. source = [(np.array([x]),) for x in range(1024)]
  222. data1 = ds.GeneratorDataset(source, ["data"])
  223. data1 = data1.shuffle(64)
  224. data1 = data1.batch(32)
  225. try:
  226. for _ in data1:
  227. pass
  228. except Exception as error:
  229. ds.config.set_monitor_sampling_interval(interval_origin)
  230. delete_profiling_files()
  231. raise error
  232. else:
  233. ds.config.set_monitor_sampling_interval(interval_origin)
  234. delete_profiling_files()
  235. def test_profiling_basic_pipeline():
  236. """
  237. Test with this basic pipeline
  238. Generator -> Map -> Batch -> Repeat -> EpochCtrl
  239. """
  240. set_profiling_env_var()
  241. def source1():
  242. for i in range(8000):
  243. yield (np.array([i]),)
  244. # Create this basic and common pipeline
  245. # Leaf/Source-Op -> Map -> Batch -> Repeat
  246. data1 = ds.GeneratorDataset(source1, ["col1"])
  247. type_cast_op = C.TypeCast(mstype.int32)
  248. data1 = data1.map(operations=type_cast_op, input_columns="col1")
  249. data1 = data1.batch(16)
  250. data1 = data1.repeat(2)
  251. try:
  252. num_iter = 0
  253. # Note: If create_tuple_iterator() is called with num_epochs>1, then EpochCtrlOp is added to the pipeline
  254. for _ in data1.create_dict_iterator(num_epochs=2):
  255. num_iter += 1
  256. assert num_iter == 1000
  257. with open(PIPELINE_FILE) as f:
  258. data = json.load(f)
  259. op_info = data["op_info"]
  260. assert len(op_info) == 5
  261. for i in range(5):
  262. # Check for inline ops
  263. if op_info[i]["op_type"] in ("EpochCtrlOp", "RepeatOp"):
  264. # Confirm these inline ops do not have metrics information
  265. assert op_info[i]["metrics"] is None
  266. else:
  267. assert "size" in op_info[i]["metrics"]["output_queue"]
  268. assert "length" in op_info[i]["metrics"]["output_queue"]
  269. assert "throughput" in op_info[i]["metrics"]["output_queue"]
  270. # Confirm CPU util JSON file content, when 5 ops are in the pipeline JSON file
  271. confirm_cpuutil(5)
  272. except Exception as error:
  273. delete_profiling_files()
  274. raise error
  275. else:
  276. delete_profiling_files()
  277. def test_profiling_cifar10_pipeline():
  278. """
  279. Test with this common pipeline with Cifar10
  280. Cifar10 -> Map -> Map -> Batch -> Repeat
  281. """
  282. set_profiling_env_var()
  283. # Create this common pipeline
  284. # Cifar10 -> Map -> Map -> Batch -> Repeat
  285. DATA_DIR_10 = "../data/dataset/testCifar10Data"
  286. data1 = ds.Cifar10Dataset(DATA_DIR_10, num_samples=8000)
  287. type_cast_op = C.TypeCast(mstype.int32)
  288. data1 = data1.map(operations=type_cast_op, input_columns="label")
  289. random_horizontal_op = vision.RandomHorizontalFlip()
  290. data1 = data1.map(operations=random_horizontal_op, input_columns="image")
  291. data1 = data1.batch(32)
  292. data1 = data1.repeat(3)
  293. try:
  294. num_iter = 0
  295. # Note: If create_tuple_iterator() is called with num_epochs=1, then EpochCtrlOp is NOT added to the pipeline
  296. for _ in data1.create_dict_iterator(num_epochs=1):
  297. num_iter += 1
  298. assert num_iter == 750
  299. with open(PIPELINE_FILE) as f:
  300. data = json.load(f)
  301. op_info = data["op_info"]
  302. assert len(op_info) == 5
  303. for i in range(5):
  304. # Check for inline ops
  305. if op_info[i]["op_type"] == "RepeatOp":
  306. # Confirm these inline ops do not have metrics information
  307. assert op_info[i]["metrics"] is None
  308. else:
  309. assert "size" in op_info[i]["metrics"]["output_queue"]
  310. assert "length" in op_info[i]["metrics"]["output_queue"]
  311. assert "throughput" in op_info[i]["metrics"]["output_queue"]
  312. # Confirm CPU util JSON file content, when 5 ops are in the pipeline JSON file
  313. confirm_cpuutil(5)
  314. except Exception as error:
  315. delete_profiling_files()
  316. raise error
  317. else:
  318. delete_profiling_files()
  319. def confirm_3ops_in_pipeline():
  320. with open(PIPELINE_FILE) as file1:
  321. data = json.load(file1)
  322. op_info = data["op_info"]
  323. # Confirm 3 ops in pipeline file
  324. assert len(op_info) == 3
  325. for i in range(3):
  326. assert op_info[i]["op_type"] in ("GeneratorOp", "BatchOp", "EpochCtrlOp")
  327. def confirm_2ops_in_pipeline():
  328. with open(PIPELINE_FILE) as file1:
  329. data = json.load(file1)
  330. op_info = data["op_info"]
  331. # Confirm 2 ops in pipeline file
  332. assert len(op_info) == 2
  333. for i in range(2):
  334. assert op_info[i]["op_type"] in ("GeneratorOp", "BatchOp")
  335. def test_profiling_epochctrl3():
  336. """
  337. Test with these 2 sequential pipelines:
  338. 1) Generator -> Batch -> EpochCtrl
  339. 2) Generator -> Batch
  340. Note: This is a simplification of the user scenario to use the same pipeline for training and then evaluation.
  341. """
  342. set_profiling_env_var()
  343. source = [(np.array([x]),) for x in range(64)]
  344. data1 = ds.GeneratorDataset(source, ["data"])
  345. data1 = data1.batch(32)
  346. try:
  347. # Test A - Call create_dict_iterator with num_epochs>1
  348. num_iter = 0
  349. # Note: If create_tuple_iterator() is called with num_epochs>1, then EpochCtrlOp is added to the pipeline
  350. for _ in data1.create_dict_iterator(num_epochs=2):
  351. num_iter += 1
  352. assert num_iter == 2
  353. confirm_3ops_in_pipeline()
  354. confirm_cpuutil(3)
  355. # Test B - Call create_dict_iterator with num_epochs=1
  356. num_iter = 0
  357. # Note: If create_tuple_iterator() is called with num_epochs=1,
  358. # then EpochCtrlOp should not be NOT added to the pipeline
  359. for _ in data1.create_dict_iterator(num_epochs=1):
  360. num_iter += 1
  361. assert num_iter == 2
  362. # confirm_2ops_in_pipeline()
  363. # MD BUG: Confirm pipeline file is not changed and wrongly still has 3 ops
  364. confirm_3ops_in_pipeline()
  365. # Confirm CPU util file has correct number of ops
  366. confirm_cpuutil(2)
  367. except Exception as error:
  368. delete_profiling_files()
  369. raise error
  370. else:
  371. delete_profiling_files()
  372. def test_profiling_epochctrl2():
  373. """
  374. Test with these 2 sequential pipelines:
  375. 1) Generator -> Batch
  376. 2) Generator -> Batch -> EpochCtrl
  377. """
  378. set_profiling_env_var()
  379. source = [(np.array([x]),) for x in range(64)]
  380. data2 = ds.GeneratorDataset(source, ["data"])
  381. data2 = data2.batch(16)
  382. try:
  383. # Test A - Call create_dict_iterator with num_epochs=1
  384. num_iter = 0
  385. # Note: If create_tuple_iterator() is called with num_epochs=1, then EpochCtrlOp is NOT added to the pipeline
  386. for _ in data2.create_dict_iterator(num_epochs=1):
  387. num_iter += 1
  388. assert num_iter == 4
  389. confirm_2ops_in_pipeline()
  390. confirm_cpuutil(2)
  391. # Test B - Call create_dict_iterator with num_epochs>1
  392. num_iter = 0
  393. # Note: If create_tuple_iterator() is called with num_epochs>1,
  394. # then EpochCtrlOp should be added to the pipeline
  395. for _ in data2.create_dict_iterator(num_epochs=2):
  396. num_iter += 1
  397. assert num_iter == 4
  398. # confirm_3ops_in_pipeline()
  399. # MD BUG: Confirm pipeline file is not changed and wrongly still has 2 ops
  400. confirm_2ops_in_pipeline()
  401. # Confirm CPU util file has correct number of ops
  402. confirm_cpuutil(3)
  403. except Exception as error:
  404. delete_profiling_files()
  405. raise error
  406. else:
  407. delete_profiling_files()
  408. if __name__ == "__main__":
  409. test_profiling_simple_pipeline()
  410. test_profiling_complex_pipeline()
  411. test_profiling_inline_ops_pipeline1()
  412. test_profiling_inline_ops_pipeline2()
  413. test_profiling_sampling_interval()
  414. test_profiling_basic_pipeline()
  415. test_profiling_cifar10_pipeline()
  416. test_profiling_epochctrl3()
  417. test_profiling_epochctrl2()