Browse Source

!15758 [Offline Debug] Add UT tests to the offline debugger.

From: @islam_amin
Reviewed-by: @john_tzanakakis,@tom__chen,@robingrosman
Signed-off-by: @john_tzanakakis
pull/15758/MERGE
mindspore-ci-bot Gitee 4 years ago
parent
commit
577cf7cd1d
22 changed files with 526 additions and 2 deletions
  1. +70
    -0
      tests/ut/data/dump/gpu_dumps/golden/sync_trans_false_read_tensors.expected
  2. +33
    -0
      tests/ut/data/dump/gpu_dumps/golden/sync_trans_false_watchpoints.expected
  3. +70
    -0
      tests/ut/data/dump/gpu_dumps/golden/sync_trans_true_read_tensors.expected
  4. BIN
      tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_2/Default--network-WithLossCell--_backbone-AlexNet--ReLUV2-op300_output_0_shape_4_4_4_4_Float32_DefaultFormat.bin
  5. BIN
      tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_2/Default--network-WithLossCell--_backbone-AlexNet--ReLUV2-op300_output_1_shape_256_UInt32_DefaultFormat.bin
  6. BIN
      tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_2/Default--network-WithLossCell--_backbone-AlexNet--conv3-Conv2d--Conv2D-op308_output_0_shape_4_4_4_4_Float32_NCHW.bin
  7. BIN
      tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_2/conv2.bias_output_0_shape_128_Float32_DefaultFormat.bin
  8. BIN
      tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_2/fc3.bias_output_0_shape_10_Float32_DefaultFormat.bin
  9. BIN
      tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_2/moments.conv2.bias_output_0_shape_128_Float32_DefaultFormat.bin
  10. BIN
      tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_2/moments.fc3.bias_output_0_shape_10_Float32_DefaultFormat.bin
  11. BIN
      tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_3/fc3.bias_output_0_shape_10_Float32_DefaultFormat.bin
  12. BIN
      tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_3/moments.fc3.bias_output_0_shape_10_Float32_DefaultFormat.bin
  13. BIN
      tests/ut/data/dump/gpu_dumps/sync_trans_true/alexnet/device_0/iteration_2/Default--network-WithLossCell--_backbone-AlexNet--ReLUV2-op300_output_0_shape_4_4_4_4_kNumberTypeFloat32_DefaultFormat.bin
  14. BIN
      tests/ut/data/dump/gpu_dumps/sync_trans_true/alexnet/device_0/iteration_2/Default--network-WithLossCell--_backbone-AlexNet--ReLUV2-op300_output_1_shape_256_kNumberTypeUInt32_DefaultFormat.bin
  15. BIN
      tests/ut/data/dump/gpu_dumps/sync_trans_true/alexnet/device_0/iteration_2/Default--network-WithLossCell--_backbone-AlexNet--conv3-Conv2d--Conv2D-op308_output_0_shape_4_4_4_4_kNumberTypeFloat32_DefaultFormat.bin
  16. BIN
      tests/ut/data/dump/gpu_dumps/sync_trans_true/alexnet/device_0/iteration_2/conv2.bias_output_0_shape_128_kNumberTypeFloat32_DefaultFormat.bin
  17. BIN
      tests/ut/data/dump/gpu_dumps/sync_trans_true/alexnet/device_0/iteration_2/moments.conv2.bias_output_0_shape_128_kNumberTypeFloat32_DefaultFormat.bin
  18. +29
    -0
      tests/ut/python/debugger/gpu_tests/dump_test_utils.py
  19. +89
    -0
      tests/ut/python/debugger/gpu_tests/test_sync_trans_false_read_tensors.py
  20. +130
    -0
      tests/ut/python/debugger/gpu_tests/test_sync_trans_false_watchpoints.py
  21. +89
    -0
      tests/ut/python/debugger/gpu_tests/test_sync_trans_read_tensors.py
  22. +16
    -2
      tests/ut/python/runtest.sh

+ 70
- 0
tests/ut/data/dump/gpu_dumps/golden/sync_trans_false_read_tensors.expected View File

@@ -0,0 +1,70 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = True

tensor_data_1 attributes:
data (printed in uint8) = [ 0 0 0 0 195 127 0 0 176 202 195 248 194 127 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 58 196 248
194 127 0 0 17 0 0 0 0 0 0 0 160 76 6 140 195 127
0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127 0 0
64 195 195 248 194 127 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 88 1 196 248 194 127 0 0 18 0 0 0
0 0 0 0 160 47 6 140 195 127 0 0 69 0 0 0 0 0
0 0 1 0 0 0 195 127 0 0 176 203 195 248 194 127 0 0
176 204 195 248 194 127 0 0 0 0 0 0 0 0 0 0 216 241
195 248 194 127 0 0 19 0 0 0 0 0 0 0 96 39 6 140
195 127 0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127
0 0 112 52 196 248 194 127 0 0 176 52 196 248 194 127 0 0
0 0 0 0 0 0 0 0 88 250 195 248 194 127 0 0 20 0
0 0 0 0 0 0 128 130 5 140 195 127 0 0 69 0 0 0
0 0 0 0 0 0 0 0 195 127 0 0 208 136 195 248 194 127
0 0 176 202 195 248 194 127 0 0 48 52 196 248 194 127 0 0
184 247 195 248 194 127 0 0 21 0 0 0 0 0 0 0 176 213
4 140 195 127 0 0 69 0 0 0 0 0 0 0 0 0 0 0
195 127 0 0 48 52 196 248 194 127 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 8 249 195 248 194 127 0 0
22 0 0 0 0 0 0 0 16 46 4 140 195 127 0 0 69 0
0 0 0 0 0 0 1 0 0 0 195 127 0 0 64 137 195 248
194 127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 88 12 196 248 194 127 0 0 23 0 0 0 0 0 0 0
32 137 3 140 195 127 0 0 85 0 0 0 0 0 0 0 0 0
0 0 195 127 0 0 176 202 195 248 194 127 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 104 246 195 248 194 127
0 0 24 0 0 0 0 0 0 0 48 104 15 140 195 127 0 0
32 104 15 140 195 127 0 0]
size in bytes = 512
debugger dtype = 11
shape = [128]
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False

tensor_data_2 attributes:
data (printed in uint8) = [ 0 0 0 ... 0 0 192]
size in bytes = 1024
debugger dtype = 11
shape = [4, 4, 4, 4]
-----------------------------------------------------------
tensor_info_3 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300
slot = 1
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False

tensor_data_3 attributes:
data (printed in uint8) = [ 0 169 0 ... 244 21 184]
size in bytes = 1024
debugger dtype = 8
shape = [256]

+ 33
- 0
tests/ut/data/dump/gpu_dumps/golden/sync_trans_false_watchpoints.expected View File

@@ -0,0 +1,33 @@
-----------------------------------------------------------
watchpoint_hit for test_1 attributes:
name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308
slot = 0
condition = 6
watchpoint_id = 1
parameter 0 name = param
parameter 0 disabled = False
parameter 0 value = 0.0
parameter 0 hit = True
parameter 0 actual_value = -2.0
error code = 0
device_id = 0
root_graph_id = 0
-----------------------------------------------------------
watchpoint_hit for test_4 attributes:
name = Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc3.bias
slot = 0
condition = 18
watchpoint_id = 3
parameter 0 name = abs_mean_update_ratio_gt
parameter 0 disabled = False
parameter 0 value = 0.0
parameter 0 hit = True
parameter 0 actual_value = 1.793662034335766e-35
parameter 1 name = epsilon
parameter 1 disabled = True
parameter 1 value = 0.0
parameter 1 hit = False
parameter 1 actual_value = 0.0
error code = 0
device_id = 0
root_graph_id = 0

+ 70
- 0
tests/ut/data/dump/gpu_dumps/golden/sync_trans_true_read_tensors.expected View File

@@ -0,0 +1,70 @@
-----------------------------------------------------------
tensor_info_1 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = True

tensor_data_1 attributes:
data (printed in uint8) = [ 1 0 0 0 195 127 0 0 80 58 118 65 195 127 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 40 186 117 65
195 127 0 0 5 0 0 0 0 0 0 0 160 76 6 204 195 127
0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127 0 0
48 135 117 65 195 127 0 0 16 58 118 65 195 127 0 0 144 58
118 65 195 127 0 0 168 186 117 65 195 127 0 0 6 0 0 0
0 0 0 0 160 47 6 204 195 127 0 0 69 0 0 0 0 0
0 0 1 0 0 0 195 127 0 0 80 58 118 65 195 127 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 184 249
117 65 195 127 0 0 7 0 0 0 0 0 0 0 96 39 6 204
195 127 0 0 69 0 0 0 0 0 0 0 1 0 0 0 195 127
0 0 224 218 117 65 195 127 0 0 0 0 0 0 0 0 0 0
224 219 117 65 195 127 0 0 200 17 118 65 195 127 0 0 8 0
0 0 0 0 0 0 128 130 5 204 195 127 0 0 69 0 0 0
0 0 0 0 1 0 0 0 195 127 0 0 120 233 255 59 196 127
0 0 224 217 117 65 195 127 0 0 224 214 117 65 195 127 0 0
120 250 117 65 195 127 0 0 9 0 0 0 0 0 0 0 176 213
4 204 195 127 0 0 69 0 0 0 0 0 0 0 1 0 0 0
195 127 0 0 240 66 118 65 195 127 0 0 160 218 117 65 195 127
0 0 224 215 117 65 195 127 0 0 40 9 118 65 195 127 0 0
10 0 0 0 0 0 0 0 16 46 4 204 195 127 0 0 69 0
0 0 0 0 0 0 1 0 0 0 195 127 0 0 208 59 118 65
195 127 0 0 0 0 0 0 0 0 0 0 96 218 117 65 195 127
0 0 56 251 117 65 195 127 0 0 11 0 0 0 0 0 0 0
32 137 3 204 195 127 0 0 85 0 0 0 0 0 0 0 1 0
0 0 195 127 0 0 224 214 117 65 195 127 0 0 144 59 118 65
195 127 0 0 160 214 117 65 195 127 0 0 136 62 118 65 195 127
0 0 12 0 0 0 0 0 0 0 48 104 15 204 195 127 0 0
32 104 15 204 195 127 0 0]
size in bytes = 512
debugger dtype = 11
shape = [128]
-----------------------------------------------------------
tensor_info_2 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308
slot = 0
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False

tensor_data_2 attributes:
data (printed in uint8) = [206 239 74 ... 76 157 184]
size in bytes = 1024
debugger dtype = 11
shape = [4, 4, 4, 4]
-----------------------------------------------------------
tensor_info_3 attributes:
node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300
slot = 1
iteration = 2
device_id = None
root_graph_id = 0
is_parameter = False

tensor_data_3 attributes:
data (printed in uint8) = [206 239 74 ... 76 157 184]
size in bytes = 1024
debugger dtype = 8
shape = [256]

BIN
tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_2/Default--network-WithLossCell--_backbone-AlexNet--ReLUV2-op300_output_0_shape_4_4_4_4_Float32_DefaultFormat.bin View File


BIN
tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_2/Default--network-WithLossCell--_backbone-AlexNet--ReLUV2-op300_output_1_shape_256_UInt32_DefaultFormat.bin View File


BIN
tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_2/Default--network-WithLossCell--_backbone-AlexNet--conv3-Conv2d--Conv2D-op308_output_0_shape_4_4_4_4_Float32_NCHW.bin View File


BIN
tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_2/conv2.bias_output_0_shape_128_Float32_DefaultFormat.bin View File


BIN
tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_2/fc3.bias_output_0_shape_10_Float32_DefaultFormat.bin View File


BIN
tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_2/moments.conv2.bias_output_0_shape_128_Float32_DefaultFormat.bin View File


BIN
tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_2/moments.fc3.bias_output_0_shape_10_Float32_DefaultFormat.bin View File


BIN
tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_3/fc3.bias_output_0_shape_10_Float32_DefaultFormat.bin View File


BIN
tests/ut/data/dump/gpu_dumps/sync_trans_false/alexnet/device_0/iteration_3/moments.fc3.bias_output_0_shape_10_Float32_DefaultFormat.bin View File


BIN
tests/ut/data/dump/gpu_dumps/sync_trans_true/alexnet/device_0/iteration_2/Default--network-WithLossCell--_backbone-AlexNet--ReLUV2-op300_output_0_shape_4_4_4_4_kNumberTypeFloat32_DefaultFormat.bin View File


BIN
tests/ut/data/dump/gpu_dumps/sync_trans_true/alexnet/device_0/iteration_2/Default--network-WithLossCell--_backbone-AlexNet--ReLUV2-op300_output_1_shape_256_kNumberTypeUInt32_DefaultFormat.bin View File


BIN
tests/ut/data/dump/gpu_dumps/sync_trans_true/alexnet/device_0/iteration_2/Default--network-WithLossCell--_backbone-AlexNet--conv3-Conv2d--Conv2D-op308_output_0_shape_4_4_4_4_kNumberTypeFloat32_DefaultFormat.bin View File


BIN
tests/ut/data/dump/gpu_dumps/sync_trans_true/alexnet/device_0/iteration_2/conv2.bias_output_0_shape_128_kNumberTypeFloat32_DefaultFormat.bin View File


BIN
tests/ut/data/dump/gpu_dumps/sync_trans_true/alexnet/device_0/iteration_2/moments.conv2.bias_output_0_shape_128_kNumberTypeFloat32_DefaultFormat.bin View File


+ 29
- 0
tests/ut/python/debugger/gpu_tests/dump_test_utils.py View File

@@ -0,0 +1,29 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Utils for testing offline debugger.
"""

import filecmp
import os


def compare_actual_with_expected(test_name, golden_dir="../data/dump/gpu_dumps/golden/"):
    """Compare the generated ``<test_name>.actual`` file with its golden file.

    Args:
        test_name (str): Base name of the test; used to build both the
            ``.expected`` and ``.actual`` file names.
        golden_dir (str): Directory holding the golden ``.expected`` files.
            Defaults to the location used by the GPU offline-debugger UTs,
            so existing callers are unaffected.

    Returns:
        bool: True if the two files are byte-identical, False otherwise.
    """
    expected_path = os.path.join(golden_dir, test_name + ".expected")
    actual_path = test_name + ".actual"
    try:
        # shallow=False forces a byte-by-byte comparison instead of
        # trusting os.stat() metadata.
        is_eq = filecmp.cmp(expected_path, actual_path, shallow=False)
    finally:
        # Always clean up the .actual artifact, even when the comparison
        # raises (e.g. a missing golden file), so a failed run does not
        # pollute the next one.
        if os.path.exists(actual_path):
            os.remove(actual_path)
    return is_eq

+ 89
- 0
tests/ut/python/debugger/gpu_tests/test_sync_trans_false_read_tensors.py View File

@@ -0,0 +1,89 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Read tensor test script for offline debugger APIs.
"""

import mindspore.offline_debug.dbg_services as d
import numpy as np
from dump_test_utils import compare_actual_with_expected

GENERATE_GOLDEN = False
test_name = "sync_trans_false_read_tensors"


def test_sync_trans_false_read_tensors():
    """Exercise DbgServices.read_tensors on a sync-mode (trans_flag=false)
    GPU dump and diff the printed result against the golden file."""
    backend = d.DbgServices(
        dump_file_path="../data/dump/gpu_dumps/sync_trans_false/alexnet")
    _ = backend.initialize(net_name="alexnet", is_sync_mode=True)

    # (node_name, slot, is_parameter): one parameter, one slot-0 operator
    # output, and one operator output living in a non-zero slot.
    requests = [
        ("Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias", 0, True),
        ("Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308", 0, False),
        ("Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300", 1, False),
    ]
    tensor_info = [
        d.TensorInfo(node_name=node, slot=slot, iteration=2,
                     device_id=0, root_graph_id=0, is_parameter=is_param)
        for node, slot, is_param in requests
    ]

    tensor_data = backend.read_tensors(tensor_info)

    print_read_tensors(tensor_info, tensor_data)
    assert compare_actual_with_expected(test_name)


def print_read_tensors(tensor_info, tensor_data):
    """Print read tensors.

    Writes the attributes of every (tensor_info, tensor_data) pair to
    ``<test_name>.expected`` when GENERATE_GOLDEN is set (regenerating the
    golden file), otherwise to ``<test_name>.actual`` for comparison
    against the golden file.
    """
    out_file = test_name + (".expected" if GENERATE_GOLDEN else ".actual")
    # 'with' guarantees the handle is closed even if an attribute access
    # or the numpy conversion below raises (the original leaked it).
    with open(out_file, "w") as f_write:
        for x, _ in enumerate(tensor_info):
            f_write.write(
                "-----------------------------------------------------------\n")
            f_write.write("tensor_info_" + str(x+1) + " attributes:\n")
            f_write.write("node name = " + tensor_info[x].node_name + "\n")
            f_write.write("slot = " + str(tensor_info[x].slot) + "\n")
            f_write.write("iteration = " + str(tensor_info[x].iteration) + "\n")
            f_write.write("device_id = " + str(tensor_info[x].device_id) + "\n")
            f_write.write("root_graph_id = " +
                          str(tensor_info[x].root_graph_id) + "\n")
            f_write.write("is_parameter = " +
                          str(tensor_info[x].is_parameter) + "\n")
            f_write.write("\n")
            f_write.write("tensor_data_" + str(x+1) + " attributes:\n")
            f_write.write("data (printed in uint8) = " + str(np.frombuffer(
                tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size)) + "\n")
            # Cross-check the python-side buffer length against the size
            # reported by the C++ backend.
            py_byte_size = len(tensor_data[x].data_ptr)
            c_byte_size = tensor_data[x].data_size
            if c_byte_size != py_byte_size:
                f_write.write("The python byte size of " + str(py_byte_size) +
                              " does not match the C++ byte size of " + str(c_byte_size) + "\n")
            f_write.write("size in bytes = " +
                          str(tensor_data[x].data_size) + "\n")
            f_write.write("debugger dtype = " + str(tensor_data[x].dtype) + "\n")
            f_write.write("shape = " + str(tensor_data[x].shape) + "\n")


# Allow running this test file directly, outside of pytest.
if __name__ == "__main__":
    test_sync_trans_false_read_tensors()

+ 130
- 0
tests/ut/python/debugger/gpu_tests/test_sync_trans_false_watchpoints.py View File

@@ -0,0 +1,130 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Watchpoints test script for offline debugger APIs.
"""

import mindspore.offline_debug.dbg_services as d
from dump_test_utils import compare_actual_with_expected

GENERATE_GOLDEN = False
test_name = "sync_trans_false_watchpoints"


def test_sync_trans_false_watchpoints():
    """End-to-end watchpoint tests on a sync-mode (trans_flag=false) GPU dump.

    Covers: set-and-hit, remove-and-not-hit, set-without-hit, and a
    weight-change watchpoint; the produced log is diffed against the
    golden file.
    """
    # Write to the golden file when regenerating, otherwise to .actual.
    out_file = test_name + (".expected" if GENERATE_GOLDEN else ".actual")
    # 'with' closes the log even if a backend call below raises, so a
    # failing run cannot leak an open handle (the original leaked it).
    with open(out_file, "w") as f_write:
        debugger_backend = d.DbgServices(
            dump_file_path="../data/dump/gpu_dumps/sync_trans_false/alexnet")

        _ = debugger_backend.initialize(
            net_name="Alexnet", is_sync_mode=True)

        # NOTES:
        # -> watch_condition=6 is MIN_LT
        # -> watch_condition=18 is CHANGE_TOO_LARGE

        # test 1: watchpoint set and hit (watch_condition=6)
        param1 = d.Parameter(name="param", disabled=False, value=0.0)
        _ = debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6,
                                            check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/"
                                                             "Conv2D-op308":
                                                             {"device_id": [0], "root_graph_id": [0], "is_parameter": False
                                                              }}, parameter_list=[param1])

        watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2)
        if len(watchpoint_hits_test_1) != 1:
            f_write.write(
                "ERROR -> test 1: watchpoint set but not hit just once\n")
        print_watchpoint_hits(watchpoint_hits_test_1, 1, f_write)

        # test 2: watchpoint remove and ensure it's not hit
        _ = debugger_backend.remove_watchpoint(watchpoint_id=1)
        watchpoint_hits_test_2 = debugger_backend.check_watchpoints(iteration=2)
        if watchpoint_hits_test_2:
            f_write.write("ERROR -> test 2: watchpoint removed but hit\n")

        # test 3: watchpoint set and not hit, then remove
        param2 = d.Parameter(name="param", disabled=False, value=-1000.0)
        _ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6,
                                            check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/"
                                                             "Conv2D-op308":
                                                             {"device_id": [0], "root_graph_id": [0], "is_parameter": False
                                                              }}, parameter_list=[param2])

        watchpoint_hits_test_3 = debugger_backend.check_watchpoints(iteration=2)
        if watchpoint_hits_test_3:
            f_write.write(
                "ERROR -> test 3: watchpoint set but not supposed to be hit\n")
        _ = debugger_backend.remove_watchpoint(watchpoint_id=2)

        # test 4: weight change watchpoint set and hit
        param_abs_mean_update_ratio_gt = d.Parameter(
            name="abs_mean_update_ratio_gt", disabled=False, value=0.0)
        param_epsilon = d.Parameter(name="epsilon", disabled=True, value=0.0)
        _ = debugger_backend.add_watchpoint(watchpoint_id=3, watch_condition=18,
                                            check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/"
                                                             "Parameter[6]_11/fc3.bias":
                                                             {"device_id": [0], "root_graph_id": [0], "is_parameter": True
                                                              }}, parameter_list=[param_abs_mean_update_ratio_gt,
                                                                                  param_epsilon])

        watchpoint_hits_test_4 = debugger_backend.check_watchpoints(iteration=3)
        if len(watchpoint_hits_test_4) != 1:
            f_write.write(
                "ERROR -> test 4: watchpoint weight change set but not hit just once\n")
        print_watchpoint_hits(watchpoint_hits_test_4, 4, f_write)
    assert compare_actual_with_expected(test_name)


def print_watchpoint_hits(watchpoint_hits, test_id, f_write):
    """Write every watchpoint hit, including all of its parameters,
    to the open file object f_write in the golden-file format."""
    for hit in watchpoint_hits:
        f_write.write(
            "-----------------------------------------------------------\n")
        f_write.write("watchpoint_hit for test_%u attributes:" %
                      test_id + "\n")
        f_write.write("name = " + str(hit.name) + "\n")
        f_write.write("slot = " + str(hit.slot) + "\n")
        f_write.write("condition = " + str(hit.condition) + "\n")
        f_write.write("watchpoint_id = " + str(hit.watchpoint_id) + "\n")
        for index, parameter in enumerate(hit.parameters):
            prefix = "parameter " + str(index) + " "
            f_write.write(prefix + "name = " + parameter.name + "\n")
            f_write.write(prefix + "disabled = " + str(parameter.disabled) + "\n")
            f_write.write(prefix + "value = " + str(parameter.value) + "\n")
            f_write.write(prefix + "hit = " + str(parameter.hit) + "\n")
            f_write.write(prefix + "actual_value = " + str(parameter.actual_value) + "\n")
        f_write.write("error code = " + str(hit.error_code) + "\n")
        f_write.write("device_id = " + str(hit.device_id) + "\n")
        f_write.write("root_graph_id = " + str(hit.root_graph_id) + "\n")


# Allow running this test file directly, outside of pytest.
if __name__ == "__main__":
    test_sync_trans_false_watchpoints()

+ 89
- 0
tests/ut/python/debugger/gpu_tests/test_sync_trans_read_tensors.py View File

@@ -0,0 +1,89 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Read tensor test script for offline debugger APIs.
"""

import mindspore.offline_debug.dbg_services as d
import numpy as np
from dump_test_utils import compare_actual_with_expected

GENERATE_GOLDEN = False
test_name = "sync_trans_true_read_tensors"


def test_sync_trans_read_tensors():
    """Exercise DbgServices.read_tensors on a sync-mode (trans_flag=true)
    GPU dump and diff the printed result against the golden file."""
    backend = d.DbgServices(
        dump_file_path="../data/dump/gpu_dumps/sync_trans_true/alexnet")
    # NOTE(review): net_name looks like a placeholder; presumably the
    # backend ignores it for this dump layout -- confirm with the API owner.
    _ = backend.initialize(
        net_name="Network Name goes here!", is_sync_mode=True)

    # (node_name, slot, is_parameter): one parameter, one slot-0 operator
    # output, and one operator output living in a non-zero slot.
    requests = [
        ("Default/network-WithLossCell/_backbone-AlexNet/conv2-Conv2d/conv2.bias", 0, True),
        ("Default/network-WithLossCell/_backbone-AlexNet/conv3-Conv2d/Conv2D-op308", 0, False),
        ("Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op300", 1, False),
    ]
    tensor_info = [
        d.TensorInfo(node_name=node, slot=slot, iteration=2,
                     device_id=0, root_graph_id=0, is_parameter=is_param)
        for node, slot, is_param in requests
    ]

    tensor_data = backend.read_tensors(tensor_info)

    print_read_tensors(tensor_info, tensor_data)
    assert compare_actual_with_expected(test_name)


def print_read_tensors(tensor_info, tensor_data):
    """Print read tensors.

    Writes the attributes of every (tensor_info, tensor_data) pair to
    ``<test_name>.expected`` when GENERATE_GOLDEN is set (regenerating the
    golden file), otherwise to ``<test_name>.actual`` for comparison
    against the golden file.
    """
    out_file = test_name + (".expected" if GENERATE_GOLDEN else ".actual")
    # 'with' guarantees the handle is closed even if an attribute access
    # or the numpy conversion below raises (the original leaked it).
    with open(out_file, "w") as f_write:
        for x, _ in enumerate(tensor_info):
            f_write.write(
                "-----------------------------------------------------------\n")
            f_write.write("tensor_info_" + str(x+1) + " attributes:\n")
            f_write.write("node name = " + tensor_info[x].node_name + "\n")
            f_write.write("slot = " + str(tensor_info[x].slot) + "\n")
            f_write.write("iteration = " + str(tensor_info[x].iteration) + "\n")
            f_write.write("device_id = " + str(tensor_info[x].device_id) + "\n")
            f_write.write("root_graph_id = " +
                          str(tensor_info[x].root_graph_id) + "\n")
            f_write.write("is_parameter = " +
                          str(tensor_info[x].is_parameter) + "\n")
            f_write.write("\n")
            f_write.write("tensor_data_" + str(x+1) + " attributes:\n")
            f_write.write("data (printed in uint8) = " + str(np.frombuffer(
                tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size)) + "\n")
            # Cross-check the python-side buffer length against the size
            # reported by the C++ backend.
            py_byte_size = len(tensor_data[x].data_ptr)
            c_byte_size = tensor_data[x].data_size
            if c_byte_size != py_byte_size:
                f_write.write("The python byte size of " + str(py_byte_size) +
                              " does not match the C++ byte size of " + str(c_byte_size) + "\n")
            f_write.write("size in bytes = " +
                          str(tensor_data[x].data_size) + "\n")
            f_write.write("debugger dtype = " + str(tensor_data[x].dtype) + "\n")
            f_write.write("shape = " + str(tensor_data[x].shape) + "\n")


# Allow running this test file directly, outside of pytest.
if __name__ == "__main__":
    test_sync_trans_read_tensors()

+ 16
- 2
tests/ut/python/runtest.sh View File

@@ -39,6 +39,14 @@ if [ $# -eq 1 ] && ([ "$1" == "stage1" ] || [ "$1" == "stage2" ] || [ "$1" ==
exit ${RET}
fi

echo "run python debugger gpu ut"
pytest -v $CURRPATH/debugger/gpu_tests

RET=$?
if [ ${RET} -ne 0 ]; then
exit ${RET}
fi

elif [ $1 == "stage2" ]; then
echo "run python parallel"
pytest -s $CURRPATH/parallel/*.py
@@ -72,7 +80,7 @@ if [ $# -eq 1 ] && ([ "$1" == "stage1" ] || [ "$1" == "stage2" ] || [ "$1" ==
exit ${RET}
fi

pytest -v --ignore=$CURRPATH/dataset --ignore=$CURRPATH/parallel --ignore=$CURRPATH/ops --ignore=$CURRPATH/pynative_mode --ignore=$CURRPATH/pipeline --ignore=$CURRPATH/train --ignore=$CURRPATH/nn $IGNORE_EXEC $CURRPATH
pytest -v --ignore=$CURRPATH/dataset --ignore=$CURRPATH/debugger/gpu_tests --ignore=$CURRPATH/parallel --ignore=$CURRPATH/ops --ignore=$CURRPATH/pynative_mode --ignore=$CURRPATH/pipeline --ignore=$CURRPATH/train --ignore=$CURRPATH/nn $IGNORE_EXEC $CURRPATH

RET=$?
if [ ${RET} -ne 0 ]; then
@@ -87,6 +95,12 @@ else
exit ${RET}
fi

pytest $CURRPATH/debugger/gpu_tests
RET=$?
if [ ${RET} -ne 0 ]; then
exit ${RET}
fi

pytest -v $CURRPATH/parallel/*.py
RET=$?
if [ ${RET} -ne 0 ]; then
@@ -111,7 +125,7 @@ else
exit ${RET}
fi

pytest -v --ignore=$CURRPATH/dataset --ignore=$CURRPATH/parallel --ignore=$CURRPATH/ops --ignore=$CURRPATH/pynative_mode --ignore=$CURRPATH/pipeline --ignore=$CURRPATH/train --ignore=$CURRPATH/nn $IGNORE_EXEC $CURRPATH
pytest -v --ignore=$CURRPATH/dataset --ignore=$CURRPATH/debugger/gpu_tests --ignore=$CURRPATH/parallel --ignore=$CURRPATH/ops --ignore=$CURRPATH/pynative_mode --ignore=$CURRPATH/pipeline --ignore=$CURRPATH/train --ignore=$CURRPATH/nn $IGNORE_EXEC $CURRPATH
RET=$?
if [ ${RET} -ne 0 ]; then
exit ${RET}


Loading…
Cancel
Save