Browse Source

!26541 Fix file name and field type changes generated by HCCL in profiler.

Merge pull request !26541 from casgj/master_hccl
tags/v1.6.0
i-robot Gitee 4 years ago
parent
commit
875f35d6d8
5 changed files with 865 additions and 9 deletions
  1. +17
    -7
      mindspore/profiler/parser/hccl_parser.py
  2. +424
    -0
      tests/ut/data/profiler_data/profiler/hccl_info/allReduce_1_1/iter1.trace
  3. +424
    -0
      tests/ut/data/profiler_data/profiler/hccl_info/allReduce_1_1/iter2.trace
  4. +0
    -1
      tests/ut/data/profiler_data/profiler/hccl_info/allReduce_1_1_dev6/iter1.trace
  5. +0
    -1
      tests/ut/data/profiler_data/profiler/hccl_info/allReduce_1_1_dev6/iter2.trace

+ 17
- 7
mindspore/profiler/parser/hccl_parser.py View File

@@ -177,8 +177,7 @@ class HcclParser:
"""Get the name of communication operators mapping between hccl and step trace."""
dir_path = self._validate_dir_path(self._source_dir)
# The name of the operator in hccl is like: operatorName_{Ordered_number}_xx_xx.
operators_names_in_hccl = [entry.name for entry in os.scandir(dir_path) if entry.is_dir()
and entry.name.endswith(self._dev_id)]
operators_names_in_hccl = [entry.name for entry in os.scandir(dir_path) if entry.is_dir()]
operators_names_in_hccl_set = set({i.split('_')[0] for i in operators_names_in_hccl})
op_names_in_hccl_dic = dict()
for item in operators_names_in_hccl_set:
@@ -226,8 +225,7 @@ class HcclParser:
"""Obtain time-consuming information of all communication operators."""
operators_cost_info = dict()
dir_path = self._validate_dir_path(dir_path)
operators_dir = [entry.name for entry in os.scandir(dir_path) if entry.is_dir()
and entry.name.endswith(self._dev_id)]
operators_dir = [entry.name for entry in os.scandir(dir_path) if entry.is_dir()]
operator_dir_path = [os.path.join(dir_path, operator_dir) for operator_dir in operators_dir]
for operator_dir in operator_dir_path:
operator_cost = self._calculate_communication_operator_cost(operator_dir)
@@ -438,9 +436,16 @@ class HcclParser:
rdma_communication_time += rdma_send_cost + notify_record_cost + notify_wait_cost
rdma_communication_wait_time += notify_wait_cost
rdma_size = trace_event[start_index].get("args").get("size")
rdma_size = int(rdma_size, 16) if rdma_size else 0
if rdma_size:
rdma_size = rdma_size if isinstance(rdma_size, int) else int(rdma_size, 16)
else:
rdma_size = 0
notify_record_size = trace_event[start_index + 1].get("args").get("size")
notify_record_size = int(notify_record_size, 16) if notify_record_size else 0
if notify_record_size:
notify_record_size = notify_record_size if isinstance(notify_record_size, int) \
else int(notify_record_size, 16)
else:
notify_record_size = 0
rdma_communication_size += rdma_size + notify_record_size
start_index += 2
start_index += 1
@@ -470,7 +475,12 @@ class HcclParser:
task_type = item.get("args").get("task type")
if task_type in (CommunicationInfo.REDUCE_INLINE.value, CommunicationInfo.MEMCPY.value):
sdma_communication_time += item.get("dur", 0)
sdma_size = int(item.get("args").get("size"), 16) if item.get("args").get("size") else 0
sdma_size = item.get("args").get("size")
if sdma_size:
sdma_size = sdma_size if isinstance(sdma_size, int) else int(sdma_size, 16)
else:
sdma_size = 0

sdma_communication_size += sdma_size

# The unit of sdma_bandwidth is KB/s.


+ 424
- 0
tests/ut/data/profiler_data/profiler/hccl_info/allReduce_1_1/iter1.trace View File

@@ -0,0 +1,424 @@
{
"device id": "6",
"iteration": 1,
"traceEvents": [
{
"tid": 2,
"pid": "6",
"ts": 616881090071.61,
"dur": 0.26,
"ph": "X",
"name": "Notify Record",
"args": {
"notify id": "0x0000000100000090",
"duration estimated": 0.26,
"stage": "0",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 7,
"task type": "Notify Record",
"src rank": 0,
"dst rank": 1,
"transport type": "SDMA",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881090071.97,
"dur": 0.01,
"ph": "X",
"name": "Notify Wait",
"args": {
"notify id": "0x0000000000000090",
"duration estimated": 0.01,
"stage": "0",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 8,
"task type": "Notify Wait",
"src rank": 1,
"dst rank": 0,
"transport type": "LOCAL",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881090072.08,
"dur": 0.25,
"ph": "X",
"name": "Notify Record",
"args": {
"notify id": "0x0000000100000050",
"duration estimated": 0.25,
"stage": "0",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 9,
"task type": "Notify Record",
"src rank": 0,
"dst rank": 1,
"transport type": "SDMA",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881090072.44,
"dur": 0.0,
"ph": "X",
"name": "Notify Wait",
"args": {
"notify id": "0x0000000000000050",
"duration estimated": 0.0,
"stage": "0",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 10,
"task type": "Notify Wait",
"src rank": 1,
"dst rank": 0,
"transport type": "LOCAL",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881090072.55,
"dur": 2183.31,
"ph": "X",
"name": "Reduce Inline",
"args": {
"notify id": 0,
"duration estimated": 2183.31,
"stage": "0",
"step": "0",
"bandwidth": 21.61,
"stream id": 24,
"task id": 11,
"task type": "Reduce Inline",
"src rank": 1,
"dst rank": 0,
"transport type": "SDMA",
"size": 47178496
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881092255.97,
"dur": 0.26,
"ph": "X",
"name": "Notify Record",
"args": {
"notify id": "0x0000000100000090",
"duration estimated": 0.26,
"stage": "0",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 12,
"task type": "Notify Record",
"src rank": 0,
"dst rank": 1,
"transport type": "SDMA",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881092256.34,
"dur": 0.0,
"ph": "X",
"name": "Notify Wait",
"args": {
"notify id": "0x0000000000000090",
"duration estimated": 0.0,
"stage": "0",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 13,
"task type": "Notify Wait",
"src rank": 1,
"dst rank": 0,
"transport type": "LOCAL",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881092256.45,
"dur": 0.25,
"ph": "X",
"name": "Notify Record",
"args": {
"notify id": "0x0000000100000050",
"duration estimated": 0.25,
"stage": "0",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 14,
"task type": "Notify Record",
"src rank": 0,
"dst rank": 1,
"transport type": "SDMA",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881092256.81,
"dur": 0.0,
"ph": "X",
"name": "Notify Wait",
"args": {
"notify id": "0x0000000000000050",
"duration estimated": 0.0,
"stage": "0",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 15,
"task type": "Notify Wait",
"src rank": 1,
"dst rank": 0,
"transport type": "LOCAL",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881092256.92,
"dur": 150.71,
"ph": "X",
"name": "Memcpy",
"args": {
"notify id": 0,
"duration estimated": 150.71,
"stage": "0",
"step": "0",
"bandwidth": 313.04,
"stream id": 24,
"task id": 16,
"task type": "Memcpy",
"src rank": 4294967295,
"dst rank": 0,
"transport type": "SDMA",
"size": 47178496
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881092407.73,
"dur": 0.26,
"ph": "X",
"name": "Notify Record",
"args": {
"notify id": "0x0000000100000090",
"duration estimated": 0.26,
"stage": "2",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 17,
"task type": "Notify Record",
"src rank": 0,
"dst rank": 1,
"transport type": "SDMA",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881092408.09,
"dur": 0.01,
"ph": "X",
"name": "Notify Wait",
"args": {
"notify id": "0x0000000000000090",
"duration estimated": 0.01,
"stage": "2",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 18,
"task type": "Notify Wait",
"src rank": 1,
"dst rank": 0,
"transport type": "LOCAL",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881092408.2,
"dur": 0.26,
"ph": "X",
"name": "Notify Record",
"args": {
"notify id": "0x0000000100000050",
"duration estimated": 0.26,
"stage": "2",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 19,
"task type": "Notify Record",
"src rank": 0,
"dst rank": 1,
"transport type": "SDMA",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881092408.56,
"dur": 0.0,
"ph": "X",
"name": "Notify Wait",
"args": {
"notify id": "0x0000000000000050",
"duration estimated": 0.0,
"stage": "2",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 20,
"task type": "Notify Wait",
"src rank": 1,
"dst rank": 0,
"transport type": "LOCAL",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881092408.67,
"dur": 2182.35,
"ph": "X",
"name": "Memcpy",
"args": {
"notify id": 0,
"duration estimated": 2182.35,
"stage": "2",
"step": "0",
"bandwidth": 21.62,
"stream id": 24,
"task id": 21,
"task type": "Memcpy",
"src rank": 1,
"dst rank": 0,
"transport type": "SDMA",
"size": 47178496
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881094591.12,
"dur": 0.26,
"ph": "X",
"name": "Notify Record",
"args": {
"notify id": "0x0000000100000090",
"duration estimated": 0.26,
"stage": "2",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 22,
"task type": "Notify Record",
"src rank": 0,
"dst rank": 1,
"transport type": "SDMA",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881094591.48,
"dur": 0.01,
"ph": "X",
"name": "Notify Wait",
"args": {
"notify id": "0x0000000000000090",
"duration estimated": 0.01,
"stage": "2",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 23,
"task type": "Notify Wait",
"src rank": 1,
"dst rank": 0,
"transport type": "LOCAL",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881094591.59,
"dur": 0.26,
"ph": "X",
"name": "Notify Record",
"args": {
"notify id": "0x0000000100000050",
"duration estimated": 0.26,
"stage": "2",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 24,
"task type": "Notify Record",
"src rank": 0,
"dst rank": 1,
"transport type": "SDMA",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881094591.95,
"dur": 0.01,
"ph": "X",
"name": "Notify Wait",
"args": {
"notify id": "0x0000000000000050",
"duration estimated": 0.01,
"stage": "2",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 25,
"task type": "Notify Wait",
"src rank": 1,
"dst rank": 0,
"transport type": "LOCAL",
"size": null
}
}
]
}

+ 424
- 0
tests/ut/data/profiler_data/profiler/hccl_info/allReduce_1_1/iter2.trace View File

@@ -0,0 +1,424 @@
{
"device id": "6",
"iteration": 2,
"traceEvents": [
{
"tid": 2,
"pid": "6",
"ts": 616881123726.78,
"dur": 0.26,
"ph": "X",
"name": "Notify Record",
"args": {
"notify id": "0x0000000100000090",
"duration estimated": 0.26,
"stage": "0",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 7,
"task type": "Notify Record",
"src rank": 0,
"dst rank": 1,
"transport type": "SDMA",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881123727.14,
"dur": 0.01,
"ph": "X",
"name": "Notify Wait",
"args": {
"notify id": "0x0000000000000090",
"duration estimated": 0.01,
"stage": "0",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 8,
"task type": "Notify Wait",
"src rank": 1,
"dst rank": 0,
"transport type": "LOCAL",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881123727.25,
"dur": 0.26,
"ph": "X",
"name": "Notify Record",
"args": {
"notify id": "0x0000000100000050",
"duration estimated": 0.26,
"stage": "0",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 9,
"task type": "Notify Record",
"src rank": 0,
"dst rank": 1,
"transport type": "SDMA",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881123727.61,
"dur": 0.01,
"ph": "X",
"name": "Notify Wait",
"args": {
"notify id": "0x0000000000000050",
"duration estimated": 0.01,
"stage": "0",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 10,
"task type": "Notify Wait",
"src rank": 1,
"dst rank": 0,
"transport type": "LOCAL",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881123727.72,
"dur": 2186.19,
"ph": "X",
"name": "Reduce Inline",
"args": {
"notify id": 0,
"duration estimated": 2186.19,
"stage": "0",
"step": "0",
"bandwidth": 21.58,
"stream id": 24,
"task id": 11,
"task type": "Reduce Inline",
"src rank": 1,
"dst rank": 0,
"transport type": "SDMA",
"size": 47178496
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881125914.01,
"dur": 0.26,
"ph": "X",
"name": "Notify Record",
"args": {
"notify id": "0x0000000100000090",
"duration estimated": 0.26,
"stage": "0",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 12,
"task type": "Notify Record",
"src rank": 0,
"dst rank": 1,
"transport type": "SDMA",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881125914.37,
"dur": 0.01,
"ph": "X",
"name": "Notify Wait",
"args": {
"notify id": "0x0000000000000090",
"duration estimated": 0.01,
"stage": "0",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 13,
"task type": "Notify Wait",
"src rank": 1,
"dst rank": 0,
"transport type": "LOCAL",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881125914.48,
"dur": 0.26,
"ph": "X",
"name": "Notify Record",
"args": {
"notify id": "0x0000000100000050",
"duration estimated": 0.26,
"stage": "0",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 14,
"task type": "Notify Record",
"src rank": 0,
"dst rank": 1,
"transport type": "SDMA",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881125914.84,
"dur": 0.01,
"ph": "X",
"name": "Notify Wait",
"args": {
"notify id": "0x0000000000000050",
"duration estimated": 0.01,
"stage": "0",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 15,
"task type": "Notify Wait",
"src rank": 1,
"dst rank": 0,
"transport type": "LOCAL",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881125914.95,
"dur": 150.81,
"ph": "X",
"name": "Memcpy",
"args": {
"notify id": 0,
"duration estimated": 150.81,
"stage": "0",
"step": "0",
"bandwidth": 312.83,
"stream id": 24,
"task id": 16,
"task type": "Memcpy",
"src rank": 4294967295,
"dst rank": 0,
"transport type": "SDMA",
"size": 47178496
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881126065.86,
"dur": 0.26,
"ph": "X",
"name": "Notify Record",
"args": {
"notify id": "0x0000000100000090",
"duration estimated": 0.26,
"stage": "2",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 17,
"task type": "Notify Record",
"src rank": 0,
"dst rank": 1,
"transport type": "SDMA",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881126066.22,
"dur": 0.01,
"ph": "X",
"name": "Notify Wait",
"args": {
"notify id": "0x0000000000000090",
"duration estimated": 0.01,
"stage": "2",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 18,
"task type": "Notify Wait",
"src rank": 1,
"dst rank": 0,
"transport type": "LOCAL",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881126066.33,
"dur": 0.26,
"ph": "X",
"name": "Notify Record",
"args": {
"notify id": "0x0000000100000050",
"duration estimated": 0.26,
"stage": "2",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 19,
"task type": "Notify Record",
"src rank": 0,
"dst rank": 1,
"transport type": "SDMA",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881126066.7,
"dur": 0.01,
"ph": "X",
"name": "Notify Wait",
"args": {
"notify id": "0x0000000000000050",
"duration estimated": 0.01,
"stage": "2",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 20,
"task type": "Notify Wait",
"src rank": 1,
"dst rank": 0,
"transport type": "LOCAL",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881126066.85,
"dur": 2182.26,
"ph": "X",
"name": "Memcpy",
"args": {
"notify id": 0,
"duration estimated": 2182.26,
"stage": "2",
"step": "0",
"bandwidth": 21.62,
"stream id": 24,
"task id": 21,
"task type": "Memcpy",
"src rank": 1,
"dst rank": 0,
"transport type": "SDMA",
"size": 47178496
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881128249.22,
"dur": 0.26,
"ph": "X",
"name": "Notify Record",
"args": {
"notify id": "0x0000000100000090",
"duration estimated": 0.26,
"stage": "2",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 22,
"task type": "Notify Record",
"src rank": 0,
"dst rank": 1,
"transport type": "SDMA",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881128249.59,
"dur": 0.0,
"ph": "X",
"name": "Notify Wait",
"args": {
"notify id": "0x0000000000000090",
"duration estimated": 0.0,
"stage": "2",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 23,
"task type": "Notify Wait",
"src rank": 1,
"dst rank": 0,
"transport type": "LOCAL",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881128249.71,
"dur": 0.25,
"ph": "X",
"name": "Notify Record",
"args": {
"notify id": "0x0000000100000050",
"duration estimated": 0.25,
"stage": "2",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 24,
"task type": "Notify Record",
"src rank": 0,
"dst rank": 1,
"transport type": "SDMA",
"size": null
}
},
{
"tid": 2,
"pid": "6",
"ts": 616881128250.07,
"dur": 0.01,
"ph": "X",
"name": "Notify Wait",
"args": {
"notify id": "0x0000000000000050",
"duration estimated": 0.01,
"stage": "2",
"step": "0",
"bandwidth": "NULL",
"stream id": 24,
"task id": 25,
"task type": "Notify Wait",
"src rank": 1,
"dst rank": 0,
"transport type": "LOCAL",
"size": null
}
}
]
}

+ 0
- 1
tests/ut/data/profiler_data/profiler/hccl_info/allReduce_1_1_dev6/iter1.trace
File diff suppressed because it is too large
View File


+ 0
- 1
tests/ut/data/profiler_data/profiler/hccl_info/allReduce_1_1_dev6/iter2.trace
File diff suppressed because it is too large
View File


Loading…
Cancel
Save