You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

test_minddata_analyzer.py 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
  1. # Copyright 2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. """
  16. Test MindData Profiling Analyzer Support
  17. """
  18. import csv
  19. import json
  20. import os
  21. import numpy as np
  22. import pytest
  23. import mindspore.common.dtype as mstype
  24. import mindspore.dataset as ds
  25. import mindspore.dataset.transforms.c_transforms as C
  26. import mindspore._c_dataengine as cde
  27. from mindspore.profiler.parser.minddata_analyzer import MinddataProfilingAnalyzer
  28. @pytest.mark.forked
  29. class TestMinddataProfilingAnalyzer:
  30. """
  31. Test the MinddataProfilingAnalyzer class
  32. Note: Use pytest fixture tmp_path to create files within this temporary directory,
  33. which is automatically created for each test and deleted at the end of the test.
  34. """
  35. def setup_class(self):
  36. """
  37. Run once for the class
  38. """
  39. # Get instance pointer for MindData profiling manager
  40. self.md_profiler = cde.GlobalContext.profiling_manager()
  41. # This is the set of keys for success case
  42. self._expected_summary_keys_success = \
  43. ['avg_cpu_pct', 'avg_cpu_pct_per_worker', 'children_ids', 'num_workers', 'op_ids', 'op_names',
  44. 'parent_id', 'per_batch_time', 'per_pipeline_time', 'per_push_queue_time', 'pipeline_ops',
  45. 'queue_average_size', 'queue_empty_freq_pct', 'queue_utilization_pct']
  46. def setup_method(self):
  47. """
  48. Run before each test function.
  49. """
  50. # Set the MindData Profiling related environment variables
  51. os.environ['RANK_ID'] = "7"
  52. os.environ['DEVICE_ID'] = "7"
  53. # Initialize MindData profiling manager
  54. self.md_profiler.init()
  55. # Start MindData Profiling
  56. self.md_profiler.start()
  57. def teardown_method(self):
  58. """
  59. Run after each test function.
  60. """
  61. # Disable MindData Profiling related environment variables
  62. del os.environ['RANK_ID']
  63. del os.environ['DEVICE_ID']
  64. def get_csv_result(self, file_pathname):
  65. """
  66. Get result from the CSV file.
  67. Args:
  68. file_pathname (str): The CSV file pathname.
  69. Returns:
  70. list[list], the parsed CSV information.
  71. """
  72. result = []
  73. with open(file_pathname, 'r') as csvfile:
  74. csv_reader = csv.reader(csvfile)
  75. for row in csv_reader:
  76. result.append(row)
  77. return result
  78. def verify_md_summary(self, md_summary_dict, expected_summary_keys, output_dir):
  79. """
  80. Verify the content of the 3 variations of the MindData Profiling analyze summary output.
  81. """
  82. summary_json_file = output_dir + "/minddata_pipeline_summary_7.json"
  83. summary_csv_file = output_dir + "/minddata_pipeline_summary_7.csv"
  84. # Confirm MindData Profiling analyze summary files are created
  85. assert os.path.exists(summary_json_file) is True
  86. assert os.path.exists(summary_csv_file) is True
  87. # Build a list of the sorted returned keys
  88. summary_returned_keys = list(md_summary_dict.keys())
  89. summary_returned_keys.sort()
  90. # 1. Confirm expected keys are in returned keys
  91. for k in expected_summary_keys:
  92. assert k in summary_returned_keys
  93. # Read summary JSON file
  94. with open(summary_json_file) as f:
  95. summary_json_data = json.load(f)
  96. # Build a list of the sorted JSON keys
  97. summary_json_keys = list(summary_json_data.keys())
  98. summary_json_keys.sort()
  99. # 2a. Confirm expected keys are in JSON file keys
  100. for k in expected_summary_keys:
  101. assert k in summary_json_keys
  102. # 2b. Confirm returned dictionary keys are identical to JSON file keys
  103. np.testing.assert_array_equal(summary_returned_keys, summary_json_keys)
  104. # Read summary CSV file
  105. summary_csv_data = self.get_csv_result(summary_csv_file)
  106. # Build a list of the sorted CSV keys from the first column in the CSV file
  107. summary_csv_keys = []
  108. for x in summary_csv_data:
  109. summary_csv_keys.append(x[0])
  110. summary_csv_keys.sort()
  111. # 3a. Confirm expected keys are in the first column of the CSV file
  112. for k in expected_summary_keys:
  113. assert k in summary_csv_keys
  114. # 3b. Confirm returned dictionary keys are identical to CSV file first column keys
  115. np.testing.assert_array_equal(summary_returned_keys, summary_csv_keys)
  116. def mysource(self):
  117. """Source for data values"""
  118. for i in range(8000):
  119. yield (np.array([i]),)
  120. def test_analyze_basic(self, tmp_path):
  121. """
  122. Test MindData profiling analyze summary files exist with basic pipeline.
  123. Also test basic content (subset of keys and values) from the returned summary result.
  124. """
  125. # Create this basic and common linear pipeline
  126. # Generator -> Map -> Batch -> Repeat -> EpochCtrl
  127. data1 = ds.GeneratorDataset(self.mysource, ["col1"])
  128. type_cast_op = C.TypeCast(mstype.int32)
  129. data1 = data1.map(operations=type_cast_op, input_columns="col1")
  130. data1 = data1.batch(16)
  131. data1 = data1.repeat(2)
  132. num_iter = 0
  133. # Note: If create_tuple_iterator() is called with num_epochs>1, then EpochCtrlOp is added to the pipeline
  134. for _ in data1.create_dict_iterator(num_epochs=2):
  135. num_iter = num_iter + 1
  136. # Confirm number of rows returned
  137. assert num_iter == 1000
  138. # Stop MindData Profiling and save output files to current working directory
  139. self.md_profiler.stop()
  140. self.md_profiler.save(str(tmp_path))
  141. pipeline_file = str(tmp_path) + "/pipeline_profiling_7.json"
  142. cpu_util_file = str(tmp_path) + "/minddata_cpu_utilization_7.json"
  143. dataset_iterator_file = str(tmp_path) + "/dataset_iterator_profiling_7.txt"
  144. analyze_file_path = str(tmp_path) + "/"
  145. # Confirm MindData Profiling files are created
  146. assert os.path.exists(pipeline_file) is True
  147. assert os.path.exists(cpu_util_file) is True
  148. assert os.path.exists(dataset_iterator_file) is True
  149. # Call MindData Analyzer for generated MindData profiling files to generate MindData pipeline summary result
  150. md_analyzer = MinddataProfilingAnalyzer(analyze_file_path, "7", analyze_file_path)
  151. md_summary_dict = md_analyzer.analyze()
  152. # Verify MindData Profiling Analyze Summary output
  153. # Note: MindData Analyzer returns the result in 3 formats:
  154. # 1. returned dictionary
  155. # 2. JSON file
  156. # 3. CSV file
  157. self.verify_md_summary(md_summary_dict, self._expected_summary_keys_success, str(tmp_path))
  158. # 4. Verify non-variant values or number of values in the tested pipeline for certain keys
  159. # of the returned dictionary
  160. # Note: Values of num_workers are not tested since default may change in the future
  161. # Note: Values related to queue metrics are not tested since they may vary on different execution environments
  162. assert md_summary_dict["pipeline_ops"] == ["EpochCtrl(id=0)", "Repeat(id=1)", "Batch(id=2)", "Map(id=3)",
  163. "Generator(id=4)"]
  164. assert md_summary_dict["op_names"] == ["EpochCtrl", "Repeat", "Batch", "Map", "Generator"]
  165. assert md_summary_dict["op_ids"] == [0, 1, 2, 3, 4]
  166. assert len(md_summary_dict["num_workers"]) == 5
  167. assert len(md_summary_dict["queue_average_size"]) == 5
  168. assert len(md_summary_dict["queue_utilization_pct"]) == 5
  169. assert len(md_summary_dict["queue_empty_freq_pct"]) == 5
  170. assert md_summary_dict["children_ids"] == [[1], [2], [3], [4], []]
  171. assert md_summary_dict["parent_id"] == [-1, 0, 1, 2, 3]
  172. assert len(md_summary_dict["avg_cpu_pct"]) == 5
  173. def test_analyze_sequential_pipelines_invalid(self, tmp_path):
  174. """
  175. Test invalid scenario in which MinddataProfilingAnalyzer is called for two sequential pipelines.
  176. """
  177. # Create the pipeline
  178. # Generator -> Map -> Batch -> EpochCtrl
  179. data1 = ds.GeneratorDataset(self.mysource, ["col1"])
  180. type_cast_op = C.TypeCast(mstype.int32)
  181. data1 = data1.map(operations=type_cast_op, input_columns="col1")
  182. data1 = data1.batch(64)
  183. # Phase 1 - For the pipeline, call create_tuple_iterator with num_epochs>1
  184. # Note: This pipeline has 4 ops: Generator -> Map -> Batch -> EpochCtrl
  185. num_iter = 0
  186. # Note: If create_tuple_iterator() is called with num_epochs>1, then EpochCtrlOp is added to the pipeline
  187. for _ in data1.create_dict_iterator(num_epochs=2):
  188. num_iter = num_iter + 1
  189. # Confirm number of rows returned
  190. assert num_iter == 125
  191. # Stop MindData Profiling and save output files to current working directory
  192. self.md_profiler.stop()
  193. self.md_profiler.save(str(tmp_path))
  194. pipeline_file = str(tmp_path) + "/pipeline_profiling_7.json"
  195. cpu_util_file = str(tmp_path) + "/minddata_cpu_utilization_7.json"
  196. dataset_iterator_file = str(tmp_path) + "/dataset_iterator_profiling_7.txt"
  197. analyze_file_path = str(tmp_path) + "/"
  198. # Confirm MindData Profiling files are created
  199. assert os.path.exists(pipeline_file) is True
  200. assert os.path.exists(cpu_util_file) is True
  201. assert os.path.exists(dataset_iterator_file) is True
  202. # Phase 2 - For the pipeline, call create_tuple_iterator with num_epochs=1
  203. # Note: This pipeline has 3 ops: Generator -> Map -> Batch
  204. # Initialize and Start MindData profiling manager
  205. self.md_profiler.init()
  206. self.md_profiler.start()
  207. num_iter = 0
  208. # Note: If create_tuple_iterator() is called with num_epochs=1, then EpochCtrlOp is NOT added to the pipeline
  209. for _ in data1.create_dict_iterator(num_epochs=1):
  210. num_iter = num_iter + 1
  211. # Confirm number of rows returned
  212. assert num_iter == 125
  213. # Stop MindData Profiling and save output files to current working directory
  214. self.md_profiler.stop()
  215. self.md_profiler.save(str(tmp_path))
  216. # Confirm MindData Profiling files are created
  217. # Note: There is an MD bug in which which the pipeline file is not recreated;
  218. # it still has 4 ops instead of 3 ops
  219. assert os.path.exists(pipeline_file) is True
  220. assert os.path.exists(cpu_util_file) is True
  221. assert os.path.exists(dataset_iterator_file) is True
  222. # Call MindData Analyzer for generated MindData profiling files to generate MindData pipeline summary result
  223. md_analyzer = MinddataProfilingAnalyzer(analyze_file_path, "7", analyze_file_path)
  224. md_summary_dict = md_analyzer.analyze()
  225. # Verify MindData Profiling Analyze Summary output
  226. self.verify_md_summary(md_summary_dict, self._expected_summary_keys_success, str(tmp_path))
  227. # Confirm pipeline data contains info for 3 ops
  228. assert md_summary_dict["pipeline_ops"] == ["Batch(id=0)", "Map(id=1)", "Generator(id=2)"]
  229. # Verify CPU util data contains info for 3 ops
  230. assert len(md_summary_dict["avg_cpu_pct"]) == 3