| @@ -255,7 +255,7 @@ void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned | |||
| // add analyzed tensor to cache | |||
| if (!recheck) { | |||
| wp_lock_.lock(); | |||
| wp_id_cache_[tensor_name].insert(id); | |||
| (void)wp_id_cache_[tensor_name].insert(id); | |||
| wp_lock_.unlock(); | |||
| } | |||
| } | |||
| @@ -43,7 +43,7 @@ class DbgLogger { | |||
| if (dbg_log_path != nullptr) { | |||
| char abspath[PATH_MAX]; | |||
| if (strlen(dbg_log_path) >= PATH_MAX || NULL == realpath(dbg_log_path, abspath)) { | |||
| std::cout << "ERROR: DbgLogger could not create real path"; | |||
| return; | |||
| } | |||
| FILE *fp = freopen(abspath, "a", stdout); | |||
| if (fp == nullptr) { | |||
| @@ -0,0 +1,150 @@ | |||
| [ | |||
| { | |||
| "test1": { | |||
| "tensor_info": { | |||
| "node_name": "Default/Add-op4", | |||
| "slot": 0, | |||
| "iteration": 0, | |||
| "rank_id": 0, | |||
| "root_graph_id": 0, | |||
| "is_output": true | |||
| }, | |||
| "tensor_base_info": { | |||
| "size_in_bytes": 24, | |||
| "debugger_dtype": 11, | |||
| "shape": [ | |||
| 2, | |||
| 3 | |||
| ] | |||
| }, | |||
| "tensor_stat_info": { | |||
| "size_in_bytes": 24, | |||
| "debugger_dtype": 11, | |||
| "shape": [ | |||
| 2, | |||
| 3 | |||
| ], | |||
| "is_bool": false, | |||
| "max_value": 10.0, | |||
| "min_value": -11.0, | |||
| "avg_value": 0.880000114440918, | |||
| "count": 6, | |||
| "neg_zero_count": 2, | |||
| "pos_zero_count": 3, | |||
| "nan_count": 0, | |||
| "neg_inf_count": 0, | |||
| "pos_inf_count": 0, | |||
| "zero_count": 1 | |||
| } | |||
| } | |||
| }, | |||
| { | |||
| "test2": { | |||
| "tensor_info": { | |||
| "node_name": "Default/Reciprocal-op3", | |||
| "slot": 0, | |||
| "iteration": 0, | |||
| "rank_id": 0, | |||
| "root_graph_id": 0, | |||
| "is_output": true | |||
| }, | |||
| "tensor_base_info": { | |||
| "size_in_bytes": 40, | |||
| "debugger_dtype": 11, | |||
| "shape": [ | |||
| 2, | |||
| 5 | |||
| ] | |||
| }, | |||
| "tensor_stat_info": { | |||
| "size_in_bytes": 40, | |||
| "debugger_dtype": 11, | |||
| "shape": [ | |||
| 2, | |||
| 5 | |||
| ], | |||
| "is_bool": false, | |||
| "max_value": 1.0, | |||
| "min_value": 1.0, | |||
| "avg_value": 1.0, | |||
| "count": 10, | |||
| "neg_zero_count": 0, | |||
| "pos_zero_count": 2, | |||
| "nan_count": 0, | |||
| "neg_inf_count": 3, | |||
| "pos_inf_count": 5, | |||
| "zero_count": 0 | |||
| } | |||
| } | |||
| }, | |||
| { | |||
| "test3": { | |||
| "tensor_info": { | |||
| "node_name": "Default/network-WithLossCell/_backbone-MockModel/ReduceMean-op92", | |||
| "slot": 0, | |||
| "iteration": 0, | |||
| "rank_id": 0, | |||
| "root_graph_id": 0, | |||
| "is_output": true | |||
| }, | |||
| "tensor_base_info": { | |||
| "size_in_bytes": 20, | |||
| "debugger_dtype": 11, | |||
| "shape": [ | |||
| 5 | |||
| ] | |||
| }, | |||
| "tensor_stat_info": { | |||
| "size_in_bytes": 20, | |||
| "debugger_dtype": 11, | |||
| "shape": [ | |||
| 5 | |||
| ], | |||
| "is_bool": false, | |||
| "max_value": 1.9901361465454102, | |||
| "min_value": -2.175431728363037, | |||
| "avg_value": -0.6648297309875488, | |||
| "count": 5, | |||
| "neg_zero_count": 2, | |||
| "pos_zero_count": 1, | |||
| "nan_count": 2, | |||
| "neg_inf_count": 0, | |||
| "pos_inf_count": 0, | |||
| "zero_count": 0 | |||
| } | |||
| } | |||
| }, | |||
| { | |||
| "test4": { | |||
| "tensor_info": { | |||
| "node_name": "invalid_name_for_test", | |||
| "slot": 0, | |||
| "iteration": 0, | |||
| "rank_id": 0, | |||
| "root_graph_id": 0, | |||
| "is_output": true | |||
| }, | |||
| "tensor_base_info": { | |||
| "size_in_bytes": 0, | |||
| "debugger_dtype": 0, | |||
| "shape": [] | |||
| }, | |||
| "tensor_stat_info": { | |||
| "size_in_bytes": 0, | |||
| "debugger_dtype": 0, | |||
| "shape": [], | |||
| "is_bool": false, | |||
| "max_value": -1.7976931348623157e+308, | |||
| "min_value": 1.7976931348623157e+308, | |||
| "avg_value": 0.0, | |||
| "count": 0, | |||
| "neg_zero_count": 0, | |||
| "pos_zero_count": 0, | |||
| "nan_count": 0, | |||
| "neg_inf_count": 0, | |||
| "pos_inf_count": 0, | |||
| "zero_count": 0 | |||
| } | |||
| } | |||
| } | |||
| ] | |||
| @@ -0,0 +1,315 @@ | |||
| [ | |||
| { | |||
| "tensor_1": { | |||
| "tensor_info": { | |||
| "node_name": "Default/CudnnUniformReal-op391", | |||
| "slot": 0, | |||
| "iteration": 0, | |||
| "rank_id": 0, | |||
| "root_graph_id": 0, | |||
| "is_output": false | |||
| }, | |||
| "tensor_data": { | |||
| "data": [ | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 66, | |||
| 0, | |||
| 0, | |||
| 128, | |||
| 69 | |||
| ], | |||
| "size_in_bytes": 8, | |||
| "debugger_dtype": 11, | |||
| "shape": [ | |||
| 2 | |||
| ] | |||
| } | |||
| } | |||
| }, | |||
| { | |||
| "tensor_2": { | |||
| "tensor_info": { | |||
| "node_name": "Gradients/Default/network-WithLossCell/_backbone-AlexNet/gradReLU/ReluGradV2-op406", | |||
| "slot": 1, | |||
| "iteration": 1, | |||
| "rank_id": 0, | |||
| "root_graph_id": 0, | |||
| "is_output": false | |||
| }, | |||
| "tensor_data": { | |||
| "data": [ | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 66, | |||
| 0, | |||
| 0, | |||
| 128, | |||
| 69, | |||
| 0, | |||
| 0, | |||
| 144, | |||
| 64, | |||
| 195, | |||
| 245, | |||
| 216, | |||
| 64, | |||
| 0, | |||
| 0, | |||
| 48, | |||
| 193 | |||
| ], | |||
| "size_in_bytes": 24, | |||
| "debugger_dtype": 11, | |||
| "shape": [ | |||
| 2, | |||
| 3 | |||
| ] | |||
| } | |||
| } | |||
| }, | |||
| { | |||
| "tensor_3": { | |||
| "tensor_info": { | |||
| "node_name": "Gradients/Default/network-WithLossCell/_backbone-AlexNet/conv5-Conv2d/gradConv2D/Conv2DBackpropFilter-op424", | |||
| "slot": 0, | |||
| "iteration": 1, | |||
| "rank_id": 0, | |||
| "root_graph_id": 0, | |||
| "is_output": true | |||
| }, | |||
| "tensor_data": { | |||
| "data": [ | |||
| 8, | |||
| 255, | |||
| 166, | |||
| 56, | |||
| 189, | |||
| 58, | |||
| 71, | |||
| 56, | |||
| 103, | |||
| 3, | |||
| 217, | |||
| 55, | |||
| 170, | |||
| 225, | |||
| 174, | |||
| 56, | |||
| 135, | |||
| 195, | |||
| 82, | |||
| 56, | |||
| 54, | |||
| 253, | |||
| 225, | |||
| 55, | |||
| 254, | |||
| 158, | |||
| 179, | |||
| 56, | |||
| 33, | |||
| 66, | |||
| 88, | |||
| 56, | |||
| 30, | |||
| 248, | |||
| 222, | |||
| 55, | |||
| 241, | |||
| 32, | |||
| 168, | |||
| 56, | |||
| 143, | |||
| 126, | |||
| 73, | |||
| 56, | |||
| 116, | |||
| 129, | |||
| 228, | |||
| 55, | |||
| 53, | |||
| 254, | |||
| 175, | |||
| 56, | |||
| 2, | |||
| 0, | |||
| 87, | |||
| 56, | |||
| 246, | |||
| 124, | |||
| 238, | |||
| 55, | |||
| 177, | |||
| 160, | |||
| 180, | |||
| 56, | |||
| 156, | |||
| 126, | |||
| 92, | |||
| 56, | |||
| 144, | |||
| 121, | |||
| 236, | |||
| 55, | |||
| 117, | |||
| 189, | |||
| 159, | |||
| 56, | |||
| 25, | |||
| 132, | |||
| 32, | |||
| 56, | |||
| 154, | |||
| 1, | |||
| 178, | |||
| 54, | |||
| 187, | |||
| 189, | |||
| 156, | |||
| 56, | |||
| 117, | |||
| 252, | |||
| 27, | |||
| 56, | |||
| 205, | |||
| 2, | |||
| 76, | |||
| 54, | |||
| 212, | |||
| 127, | |||
| 148, | |||
| 56, | |||
| 129, | |||
| 1, | |||
| 12, | |||
| 56, | |||
| 53, | |||
| 253, | |||
| 11, | |||
| 182 | |||
| ], | |||
| "size_in_bytes": 108, | |||
| "debugger_dtype": 11, | |||
| "shape": [ | |||
| 3, | |||
| 3, | |||
| 3 | |||
| ] | |||
| } | |||
| } | |||
| }, | |||
| { | |||
| "tensor_4": { | |||
| "tensor_info": { | |||
| "node_name": "Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op381", | |||
| "slot": 1, | |||
| "iteration": 0, | |||
| "rank_id": 0, | |||
| "root_graph_id": 0, | |||
| "is_output": true | |||
| }, | |||
| "tensor_data": { | |||
| "data": [ | |||
| 104, | |||
| 60, | |||
| 33, | |||
| 79, | |||
| 53, | |||
| 6, | |||
| 131, | |||
| 78, | |||
| 78, | |||
| 232, | |||
| 126, | |||
| 79, | |||
| 154, | |||
| 198, | |||
| 85, | |||
| 79, | |||
| 245, | |||
| 52, | |||
| 84, | |||
| 78, | |||
| 70, | |||
| 207, | |||
| 222, | |||
| 78 | |||
| ], | |||
| "size_in_bytes": 24, | |||
| "debugger_dtype": 11, | |||
| "shape": [ | |||
| 6 | |||
| ] | |||
| } | |||
| } | |||
| }, | |||
| { | |||
| "tensor_5": { | |||
| "tensor_info": { | |||
| "node_name": "Default/Reciprocal-op3", | |||
| "slot": 0, | |||
| "iteration": 0, | |||
| "rank_id": 0, | |||
| "root_graph_id": 0, | |||
| "is_output": true | |||
| }, | |||
| "tensor_data": { | |||
| "data": [ | |||
| 0, | |||
| 0, | |||
| 128, | |||
| 63, | |||
| 0, | |||
| 0, | |||
| 128, | |||
| 255, | |||
| 0, | |||
| 0, | |||
| 128, | |||
| 127, | |||
| 0, | |||
| 0, | |||
| 128, | |||
| 255, | |||
| 0, | |||
| 0, | |||
| 128, | |||
| 127, | |||
| 0, | |||
| 0, | |||
| 128, | |||
| 127, | |||
| 0, | |||
| 0, | |||
| 128, | |||
| 63, | |||
| 0, | |||
| 0, | |||
| 128, | |||
| 255, | |||
| 0, | |||
| 0, | |||
| 128, | |||
| 127, | |||
| 0, | |||
| 0, | |||
| 128, | |||
| 127 | |||
| ], | |||
| "size_in_bytes": 40, | |||
| "debugger_dtype": 11, | |||
| "shape": [ | |||
| 2, | |||
| 5 | |||
| ] | |||
| } | |||
| } | |||
| } | |||
| ] | |||
| @@ -0,0 +1,56 @@ | |||
| [ | |||
| { | |||
| "tensor_1": { | |||
| "tensor_info": { | |||
| "node_name": "Default/CudnnUniformReal-op390", | |||
| "slot": 0, | |||
| "iteration": 0, | |||
| "rank_id": 0, | |||
| "root_graph_id": 0, | |||
| "is_output": false | |||
| }, | |||
| "tensor_data": { | |||
| "data": [], | |||
| "size_in_bytes": 0, | |||
| "debugger_dtype": 0, | |||
| "shape": [] | |||
| } | |||
| } | |||
| }, | |||
| { | |||
| "tensor_2": { | |||
| "tensor_info": { | |||
| "node_name": "Gradients/Default/network-WithLossCell/_backbone-AlexNet/gradReLU/ReluGradV2-op406", | |||
| "slot": 1, | |||
| "iteration": 0, | |||
| "rank_id": 0, | |||
| "root_graph_id": 0, | |||
| "is_output": false | |||
| }, | |||
| "tensor_data": { | |||
| "data": [], | |||
| "size_in_bytes": 0, | |||
| "debugger_dtype": 0, | |||
| "shape": [] | |||
| } | |||
| } | |||
| }, | |||
| { | |||
| "tensor_3": { | |||
| "tensor_info": { | |||
| "node_name": "Gradients/Default/network-WithLossCell/_backbone-AlexNet/gradReLU/ReluGradV2-op406", | |||
| "slot": 1, | |||
| "iteration": 1, | |||
| "rank_id": 0, | |||
| "root_graph_id": 0, | |||
| "is_output": true | |||
| }, | |||
| "tensor_data": { | |||
| "data": [], | |||
| "size_in_bytes": 0, | |||
| "debugger_dtype": 0, | |||
| "shape": [] | |||
| } | |||
| } | |||
| } | |||
| ] | |||
| @@ -1,79 +0,0 @@ | |||
| ----------------------------------------------------------- | |||
| tensor_info_1 attributes: | |||
| node name = Default/CudnnUniformReal-op391 | |||
| slot = 0 | |||
| iteration = 0 | |||
| rank_id = 0 | |||
| root_graph_id = 0 | |||
| is_output = False | |||
| tensor_data_1 attributes: | |||
| data (printed in uint8) = [ 0 0 0 66 0 0 128 69] | |||
| size in bytes = 8 | |||
| debugger dtype = 11 | |||
| shape = [2] | |||
| ----------------------------------------------------------- | |||
| tensor_info_2 attributes: | |||
| node name = Gradients/Default/network-WithLossCell/_backbone-AlexNet/gradReLU/ReluGradV2-op406 | |||
| slot = 1 | |||
| iteration = 1 | |||
| rank_id = 0 | |||
| root_graph_id = 0 | |||
| is_output = False | |||
| tensor_data_2 attributes: | |||
| data (printed in uint8) = [ 0 0 0 0 0 0 0 66 0 0 128 69 0 0 144 64 195 245 | |||
| 216 64 0 0 48 193] | |||
| size in bytes = 24 | |||
| debugger dtype = 11 | |||
| shape = [2, 3] | |||
| ----------------------------------------------------------- | |||
| tensor_info_3 attributes: | |||
| node name = Gradients/Default/network-WithLossCell/_backbone-AlexNet/conv5-Conv2d/gradConv2D/Conv2DBackpropFilter-op424 | |||
| slot = 0 | |||
| iteration = 1 | |||
| rank_id = 0 | |||
| root_graph_id = 0 | |||
| is_output = True | |||
| tensor_data_3 attributes: | |||
| data (printed in uint8) = [ 8 255 166 56 189 58 71 56 103 3 217 55 170 225 174 56 135 195 | |||
| 82 56 54 253 225 55 254 158 179 56 33 66 88 56 30 248 222 55 | |||
| 241 32 168 56 143 126 73 56 116 129 228 55 53 254 175 56 2 0 | |||
| 87 56 246 124 238 55 177 160 180 56 156 126 92 56 144 121 236 55 | |||
| 117 189 159 56 25 132 32 56 154 1 178 54 187 189 156 56 117 252 | |||
| 27 56 205 2 76 54 212 127 148 56 129 1 12 56 53 253 11 182] | |||
| size in bytes = 108 | |||
| debugger dtype = 11 | |||
| shape = [3, 3, 3] | |||
| ----------------------------------------------------------- | |||
| tensor_info_4 attributes: | |||
| node name = Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op381 | |||
| slot = 1 | |||
| iteration = 0 | |||
| rank_id = 0 | |||
| root_graph_id = 0 | |||
| is_output = True | |||
| tensor_data_4 attributes: | |||
| data (printed in uint8) = [104 60 33 79 53 6 131 78 78 232 126 79 154 198 85 79 245 52 | |||
| 84 78 70 207 222 78] | |||
| size in bytes = 24 | |||
| debugger dtype = 11 | |||
| shape = [6] | |||
| ----------------------------------------------------------- | |||
| tensor_info_5 attributes: | |||
| node name = Default/Reciprocal-op3 | |||
| slot = 0 | |||
| iteration = 0 | |||
| rank_id = 0 | |||
| root_graph_id = 0 | |||
| is_output = True | |||
| tensor_data_5 attributes: | |||
| data (printed in uint8) = [ 0 0 128 63 0 0 128 255 0 0 128 127 0 0 128 255 0 0 | |||
| 128 127 0 0 128 127 0 0 128 63 0 0 128 255 0 0 128 127 | |||
| 0 0 128 127] | |||
| size in bytes = 40 | |||
| debugger dtype = 11 | |||
| shape = [2, 5] | |||
| @@ -1,120 +0,0 @@ | |||
| ----------------------------------------------------------- | |||
| tensor_info_1 attributes: | |||
| node name = Default/Add-op4 | |||
| slot = 0 | |||
| iteration = 0 | |||
| rank_id = 0 | |||
| root_graph_id = 0 | |||
| is_output = True | |||
| tensor_base_info: | |||
| size in bytes = 24 | |||
| debugger dtype = 11 | |||
| shape = [2, 3] | |||
| tensor_stat_info: | |||
| size in bytes = 24 | |||
| debugger dtype = 11 | |||
| shape = [2, 3] | |||
| is_bool = False | |||
| max_value = 10.0 | |||
| min_value = -11.0 | |||
| avg_value = 0.880000114440918 | |||
| count = 6 | |||
| neg_zero_count = 2 | |||
| pos_zero_count = 3 | |||
| nan_count = 0 | |||
| neg_inf_count = 0 | |||
| pos_inf_count = 0 | |||
| zero_count = 1 | |||
| ----------------------------------------------------------- | |||
| tensor_info_2 attributes: | |||
| node name = Default/Reciprocal-op3 | |||
| slot = 0 | |||
| iteration = 0 | |||
| rank_id = 0 | |||
| root_graph_id = 0 | |||
| is_output = True | |||
| tensor_base_info: | |||
| size in bytes = 40 | |||
| debugger dtype = 11 | |||
| shape = [2, 5] | |||
| tensor_stat_info: | |||
| size in bytes = 40 | |||
| debugger dtype = 11 | |||
| shape = [2, 5] | |||
| is_bool = False | |||
| max_value = 1.0 | |||
| min_value = 1.0 | |||
| avg_value = 1.0 | |||
| count = 10 | |||
| neg_zero_count = 0 | |||
| pos_zero_count = 2 | |||
| nan_count = 0 | |||
| neg_inf_count = 3 | |||
| pos_inf_count = 5 | |||
| zero_count = 0 | |||
| ----------------------------------------------------------- | |||
| tensor_info_3 attributes: | |||
| node name = Default/network-WithLossCell/_backbone-MockModel/ReduceMean-op92 | |||
| slot = 0 | |||
| iteration = 0 | |||
| rank_id = 0 | |||
| root_graph_id = 0 | |||
| is_output = True | |||
| tensor_base_info: | |||
| size in bytes = 20 | |||
| debugger dtype = 11 | |||
| shape = [5] | |||
| tensor_stat_info: | |||
| size in bytes = 20 | |||
| debugger dtype = 11 | |||
| shape = [5] | |||
| is_bool = False | |||
| max_value = 1.9901361465454102 | |||
| min_value = -2.175431728363037 | |||
| avg_value = -0.6648297309875488 | |||
| count = 5 | |||
| neg_zero_count = 2 | |||
| pos_zero_count = 1 | |||
| nan_count = 2 | |||
| neg_inf_count = 0 | |||
| pos_inf_count = 0 | |||
| zero_count = 0 | |||
| ----------------------------------------------------------- | |||
| tensor_info_4 attributes: | |||
| node name = invalid_name_for_test | |||
| slot = 0 | |||
| iteration = 0 | |||
| rank_id = 0 | |||
| root_graph_id = 0 | |||
| is_output = True | |||
| tensor_base_info: | |||
| size in bytes = 0 | |||
| debugger dtype = 0 | |||
| shape = [] | |||
| tensor_stat_info: | |||
| size in bytes = 0 | |||
| debugger dtype = 0 | |||
| shape = [] | |||
| is_bool = False | |||
| max_value = -1.7976931348623157e+308 | |||
| min_value = 1.7976931348623157e+308 | |||
| avg_value = 0.0 | |||
| count = 0 | |||
| neg_zero_count = 0 | |||
| pos_zero_count = 0 | |||
| nan_count = 0 | |||
| neg_inf_count = 0 | |||
| pos_inf_count = 0 | |||
| zero_count = 0 | |||
| @@ -1,28 +0,0 @@ | |||
| ----------------------------------------------------------- | |||
| tensor_info_1 attributes: | |||
| node name = Default/CudnnUniformReal-op390 | |||
| slot = 0 | |||
| iteration = 0 | |||
| rank_id = 0 | |||
| root_graph_id = 0 | |||
| is_output = False | |||
| tensor_data_1 attributes: | |||
| data (printed in uint8) = [] | |||
| size in bytes = 0 | |||
| debugger dtype = 0 | |||
| shape = [] | |||
| ----------------------------------------------------------- | |||
| tensor_info_2 attributes: | |||
| node name = Gradients/Default/network-WithLossCell/_backbone-AlexNet/gradReLU/ReluGradV2-op406 | |||
| slot = 1 | |||
| iteration = 0 | |||
| rank_id = 0 | |||
| root_graph_id = 0 | |||
| is_output = False | |||
| tensor_data_2 attributes: | |||
| data (printed in uint8) = [] | |||
| size in bytes = 0 | |||
| debugger dtype = 0 | |||
| shape = [] | |||
| @@ -1,33 +0,0 @@ | |||
| ----------------------------------------------------------- | |||
| watchpoint_hit for test_1 attributes: | |||
| name = Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/Conv2D-op369 | |||
| slot = 1 | |||
| condition = 6 | |||
| watchpoint_id = 1 | |||
| parameter 0 name = param | |||
| parameter 0 disabled = False | |||
| parameter 0 value = 0.0 | |||
| parameter 0 hit = True | |||
| parameter 0 actual_value = -0.020966000854969025 | |||
| error code = 0 | |||
| rank_id = 0 | |||
| root_graph_id = 0 | |||
| ----------------------------------------------------------- | |||
| watchpoint_hit for test_4 attributes: | |||
| name = Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc2.bias | |||
| slot = 0 | |||
| condition = 18 | |||
| watchpoint_id = 3 | |||
| parameter 0 name = abs_mean_update_ratio_gt | |||
| parameter 0 disabled = False | |||
| parameter 0 value = 0.0 | |||
| parameter 0 hit = True | |||
| parameter 0 actual_value = 1.0156775705209766 | |||
| parameter 1 name = epsilon | |||
| parameter 1 disabled = True | |||
| parameter 1 value = 0.0 | |||
| parameter 1 hit = False | |||
| parameter 1 actual_value = 0.0 | |||
| error code = 0 | |||
| rank_id = 0 | |||
| root_graph_id = 0 | |||
| @@ -0,0 +1,77 @@ | |||
| [ | |||
| { | |||
| "watchpoint_hit1": { | |||
| "name": "Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/Conv2D-op369", | |||
| "slot": 1, | |||
| "condition": 6, | |||
| "watchpoint_id": 1, | |||
| "parameter": [ | |||
| { | |||
| "parameter0": { | |||
| "name": "param", | |||
| "disabled": false, | |||
| "value": 0.0, | |||
| "hit": true, | |||
| "actual_value": -0.020966000854969025 | |||
| } | |||
| } | |||
| ], | |||
| "error_code": 0, | |||
| "rank_id": 0, | |||
| "root_graph_id": 0 | |||
| } | |||
| }, | |||
| { | |||
| "watchpoint_hit2": { | |||
| "name": "Default/CudnnUniformReal-op391", | |||
| "slot": 0, | |||
| "condition": 6, | |||
| "watchpoint_id": 2, | |||
| "parameter": [ | |||
| { | |||
| "parameter0": { | |||
| "name": "param", | |||
| "disabled": false, | |||
| "value": 10.0, | |||
| "hit": true, | |||
| "actual_value": -4096.0 | |||
| } | |||
| } | |||
| ], | |||
| "error_code": 0, | |||
| "rank_id": 0, | |||
| "root_graph_id": 0 | |||
| } | |||
| }, | |||
| { | |||
| "watchpoint_hit3": { | |||
| "name": "Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc2.bias", | |||
| "slot": 0, | |||
| "condition": 18, | |||
| "watchpoint_id": 3, | |||
| "parameter": [ | |||
| { | |||
| "parameter0": { | |||
| "name": "abs_mean_update_ratio_gt", | |||
| "disabled": false, | |||
| "value": 0.0, | |||
| "hit": true, | |||
| "actual_value": 1.0156775705209766 | |||
| } | |||
| }, | |||
| { | |||
| "parameter1": { | |||
| "name": "epsilon", | |||
| "disabled": true, | |||
| "value": 0.0, | |||
| "hit": false, | |||
| "actual_value": 0.0 | |||
| } | |||
| } | |||
| ], | |||
| "error_code": 0, | |||
| "rank_id": 0, | |||
| "root_graph_id": 0 | |||
| } | |||
| } | |||
| ] | |||
| @@ -16,20 +16,11 @@ | |||
| Utils for testing offline debugger. | |||
| """ | |||
| import filecmp | |||
| import os | |||
| import tempfile | |||
| import numpy as np | |||
| def compare_actual_with_expected(test_name): | |||
| """Compare actual file with expected.""" | |||
| is_eq = filecmp.cmp("../data/dump/gpu_dumps/golden/" + | |||
| test_name + ".expected", test_name + ".actual", shallow=False) | |||
| if os.path.exists(test_name + ".actual"): | |||
| os.remove(test_name + ".actual") | |||
| return is_eq | |||
| def build_dump_structure(tensor_name_list, tensor_list, net_name, tensor_info_list): | |||
| """Build dump file structure from tensor_list.""" | |||
| temp_dir = tempfile.mkdtemp(prefix=net_name, dir="./") | |||
| @@ -0,0 +1,180 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| """ | |||
| Read tensor test script for offline debugger APIs. | |||
| """ | |||
| import os | |||
| import shutil | |||
| import json | |||
| import numpy as np | |||
| import mindspore.offline_debug.dbg_services as d | |||
| from dump_test_utils import build_dump_structure | |||
| from tests.security_utils import security_off_wrap | |||
| class TestOfflineReadTensor: | |||
| """Test read tensor for offline debugger.""" | |||
| GENERATE_GOLDEN = False | |||
| test_name = "read_tensors" | |||
| tensor_json = [] | |||
| temp_dir = '' | |||
| @classmethod | |||
| def setup_class(cls): | |||
| """Init setup for offline read tensor test""" | |||
| # input tensor with zero slot | |||
| tensor1 = np.array([32.0, 4096.0], np.float32) | |||
| name1 = "CudnnUniformReal.CudnnUniformReal-op391.0.0." | |||
| info1 = d.TensorInfo(node_name="Default/CudnnUniformReal-op391", | |||
| slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=False) | |||
| # input tensor with non-zero slot | |||
| tensor2 = np.array([[0.0, 32.0, 4096.0], [4.5, 6.78, -11.0]], np.float32) | |||
| name2 = "ReluGradV2.ReluGradV2-op406.0.0." | |||
| info2 = d.TensorInfo(node_name="Gradients/Default/network-WithLossCell/_backbone-AlexNet/" | |||
| "gradReLU/ReluGradV2-op406", | |||
| slot=1, iteration=1, rank_id=0, root_graph_id=0, is_output=False) | |||
| # output tensor with zero slot | |||
| tensor3 = np.array([[[7.963e-05, 4.750e-05, 2.587e-05], | |||
| [8.339e-05, 5.025e-05, 2.694e-05], | |||
| [8.565e-05, 5.156e-05, 2.658e-05]], | |||
| [[8.017e-05, 4.804e-05, 2.724e-05], | |||
| [8.392e-05, 5.126e-05, 2.843e-05], | |||
| [8.613e-05, 5.257e-05, 2.819e-05]], | |||
| [[7.617e-05, 3.827e-05, 5.305e-06], | |||
| [7.474e-05, 3.719e-05, 3.040e-06], | |||
| [7.081e-05, 3.338e-05, -2.086e-06]]], np.float32) | |||
| name3 = "Conv2DBackpropFilter.Conv2DBackpropFilter-op424.0.0." | |||
| info3 = d.TensorInfo(node_name="Gradients/Default/network-WithLossCell/_backbone-AlexNet/conv5-Conv2d/" | |||
| "gradConv2D/Conv2DBackpropFilter-op424", | |||
| slot=0, iteration=1, rank_id=0, root_graph_id=0, is_output=True) | |||
| # output tensor with non-zero slot | |||
| tensor4 = np.array([2705090541, 1099111076, 4276637100, 3586562544, 890060077, 1869062900], np.float32) | |||
| name4 = "ReLUV2.ReLUV2-op381.0.0." | |||
| info4 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op381", | |||
| slot=1, iteration=0, rank_id=0, root_graph_id=0, is_output=True) | |||
| tensor_name = [name1, name2, name3, name4] | |||
| tensor_list = [tensor1, tensor2, tensor3, tensor4] | |||
| cls.tensor_info = [info1, info2, info3, info4] | |||
| cls.temp_dir = build_dump_structure(tensor_name, tensor_list, "Test", cls.tensor_info) | |||
| # inf tensor | |||
| inf_tensor = np.array([[1., -np.inf, np.inf, -np.inf, np.inf], | |||
| [np.inf, 1., -np.inf, np.inf, np.inf]], np.float32) | |||
| inf_name = "Reciprocal.Reciprocal-op3.0.0." | |||
| cls.inf_info = d.TensorInfo(node_name="Default/Reciprocal-op3", | |||
| slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True) | |||
| cls.inf_dir = build_dump_structure([inf_name], [inf_tensor], "Inf", [cls.inf_info]) | |||
| @classmethod | |||
| def teardown_class(cls): | |||
| """Run after test this class.""" | |||
| shutil.rmtree(cls.temp_dir) | |||
| shutil.rmtree(cls.inf_dir) | |||
| @security_off_wrap | |||
| def test_sync_read_tensors(self): | |||
| debugger_backend = d.DbgServices(dump_file_path=self.temp_dir) | |||
| _ = debugger_backend.initialize(net_name="Test", is_sync_mode=True) | |||
| tensor_data = debugger_backend.read_tensors(self.tensor_info) | |||
| if self.GENERATE_GOLDEN: | |||
| self.print_read_tensors(self.tensor_info, tensor_data, 0, False) | |||
| else: | |||
| self.compare_expect_actual_result(self.tensor_info, tensor_data, 0) | |||
| @security_off_wrap | |||
| def test_sync_read_inf_tensors(self): | |||
| debugger_backend = d.DbgServices(dump_file_path=self.inf_dir) | |||
| _ = debugger_backend.initialize(net_name="Inf", is_sync_mode=True) | |||
| tensor_data_inf = debugger_backend.read_tensors([self.inf_info]) | |||
| if self.GENERATE_GOLDEN: | |||
| self.print_read_tensors([self.inf_info], tensor_data_inf, 4, False) | |||
| else: | |||
| self.compare_expect_actual_result([self.inf_info], tensor_data_inf, 4) | |||
| @security_off_wrap | |||
| def test_async_read_tensors(self): | |||
| debugger_backend = d.DbgServices(dump_file_path=self.temp_dir) | |||
| _ = debugger_backend.initialize(net_name="Test", is_sync_mode=False) | |||
| tensor_data = debugger_backend.read_tensors(self.tensor_info) | |||
| if not self.GENERATE_GOLDEN: | |||
| self.compare_expect_actual_result(self.tensor_info, tensor_data, 0) | |||
| @security_off_wrap | |||
| def test_async_read_inf_tensors(self): | |||
| debugger_backend = d.DbgServices(dump_file_path=self.inf_dir) | |||
| _ = debugger_backend.initialize(net_name="Inf", is_sync_mode=False) | |||
| tensor_data_inf = debugger_backend.read_tensors([self.inf_info]) | |||
| if not self.GENERATE_GOLDEN: | |||
| self.compare_expect_actual_result([self.inf_info], tensor_data_inf, 4) | |||
| def compare_expect_actual_result(self, tensor_info_list, tensor_data_list, test_index): | |||
| """Compare actual result with golden file.""" | |||
| golden_file = os.path.realpath(os.path.join("../data/dump/gpu_dumps/golden/", | |||
| self.test_name + "_expected.json")) | |||
| with open(golden_file) as f: | |||
| expected_list = json.load(f) | |||
| for x, (tensor_info, tensor_data) in enumerate(zip(tensor_info_list, tensor_data_list)): | |||
| test_id = "tensor_"+ str(test_index+x+1) | |||
| info = expected_list[x+test_index][test_id] | |||
| assert tensor_info.node_name == info['tensor_info']['node_name'] | |||
| assert tensor_info.slot == info['tensor_info']['slot'] | |||
| assert tensor_info.iteration == info['tensor_info']['iteration'] | |||
| assert tensor_info.rank_id == info['tensor_info']['rank_id'] | |||
| assert tensor_info.root_graph_id == info['tensor_info']['root_graph_id'] | |||
| assert tensor_info.is_output == info['tensor_info']['is_output'] | |||
| actual_data = np.frombuffer( | |||
| tensor_data.data_ptr, np.uint8, tensor_data.data_size).tolist() | |||
| assert actual_data == info['tensor_data']['data'] | |||
| assert tensor_data.data_size == info['tensor_data']['size_in_bytes'] | |||
| assert tensor_data.dtype == info['tensor_data']['debugger_dtype'] | |||
| assert tensor_data.shape == info['tensor_data']['shape'] | |||
| def print_read_tensors(self, tensor_info_list, tensor_data_list, test_index, is_print): | |||
| """Print read tensors result if GENERATE_GOLDEN is True.""" | |||
| for x, (tensor_info, tensor_data) in enumerate(zip(tensor_info_list, tensor_data_list)): | |||
| tensor = "tensor_" + str(test_index+x+1) | |||
| data = np.frombuffer( | |||
| tensor_data.data_ptr, np.uint8, tensor_data.data_size).tolist() | |||
| py_byte_size = len(tensor_data.data_ptr) | |||
| c_byte_size = tensor_data.data_size | |||
| if c_byte_size != py_byte_size: | |||
| print("The python byte size of " + str(py_byte_size) + | |||
| " does not match the C++ byte size of " + str(c_byte_size) + "\n") | |||
| self.tensor_json.append({ | |||
| tensor: { | |||
| 'tensor_info': { | |||
| 'node_name': tensor_info.node_name, | |||
| 'slot': tensor_info.slot, | |||
| 'iteration': tensor_info.iteration, | |||
| 'rank_id': tensor_info.rank_id, | |||
| 'root_graph_id': tensor_info.root_graph_id, | |||
| 'is_output': tensor_info.is_output | |||
| }, | |||
| 'tensor_data': { | |||
| 'data': data, | |||
| 'size_in_bytes': tensor_data.data_size, | |||
| 'debugger_dtype': tensor_data.dtype, | |||
| 'shape': tensor_data.shape | |||
| } | |||
| } | |||
| }) | |||
| if is_print: | |||
| with open(self.test_name + "_expected.json", "w") as dump_f: | |||
| json.dump(self.tensor_json, dump_f, indent=4, separators=(',', ': ')) | |||
| @@ -0,0 +1,164 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| """ | |||
| Read tensor test script for offline debugger APIs. | |||
| """ | |||
| import os | |||
| import json | |||
| import shutil | |||
| import numpy as np | |||
| import mindspore.offline_debug.dbg_services as d | |||
| from dump_test_utils import build_dump_structure | |||
| from tests.security_utils import security_off_wrap | |||
class TestOfflineReadNonExistTensor:
    """Test reading non exist tensor for offline debugger.

    setup_class builds a dump structure containing two known tensors; each
    test then calls read_tensors with a TensorInfo that is deliberately
    wrong in one field (op name, iteration, or is_output) and compares the
    returned data against a golden JSON file.
    """
    # When True, tests append results to tensor_json and (on the last test)
    # write a new golden file instead of comparing against the existing one.
    GENERATE_GOLDEN = False
    test_name = "read_tensors_nonexist_node"
    # Accumulates per-test result dicts across the class (shared class attr).
    tensor_json = []
    temp_dir = ''

    @classmethod
    def setup_class(cls):
        """Init setup for offline read tensor test"""
        tensor1 = np.array([32.0, 4096.0], np.float32)
        name1 = "CudnnUniformReal.CudnnUniformReal-op391.0.0."
        info1 = d.TensorInfo(node_name="Default/CudnnUniformReal-op391",
                             slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=False)
        tensor2 = np.array([[0.0, 32.0, 4096.0], [4.5, 6.78, -11.0]], np.float32)
        name2 = "ReluGradV2.ReluGradV2-op406.0.0."
        info2 = d.TensorInfo(node_name="Gradients/Default/network-WithLossCell/_backbone-AlexNet"
                             "/gradReLU/ReluGradV2-op406",
                             slot=1, iteration=1, rank_id=0, root_graph_id=0, is_output=False)
        tensor_name = [name1, name2]
        tensor_info = [info1, info2]
        tensor_list = [tensor1, tensor2]
        cls.temp_dir = build_dump_structure(tensor_name, tensor_list, "Test", tensor_info)

    @classmethod
    def teardown_class(cls):
        # Remove the temporary dump structure created in setup_class.
        shutil.rmtree(cls.temp_dir)

    @security_off_wrap
    def test_read_tensors_wrong_op_name(self):
        debugger_backend = d.DbgServices(dump_file_path=self.temp_dir)
        _ = debugger_backend.initialize(
            net_name="Test", is_sync_mode=True)
        # non-existing tensor with wrong op name
        info_nonexist = d.TensorInfo(node_name="Default/CudnnUniformReal-op390",
                                     slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=False)
        tensor_data = debugger_backend.read_tensors([info_nonexist])
        # Check the length of tensor data
        assert len(tensor_data) == 1
        if self.GENERATE_GOLDEN:
            self.print_read_tensors([info_nonexist], tensor_data, 0, False)
        else:
            self.compare_expect_actual_result([info_nonexist], tensor_data, 0)

    @security_off_wrap
    def test_read_tensors_wrong_iteration(self):
        debugger_backend = d.DbgServices(dump_file_path=self.temp_dir)
        _ = debugger_backend.initialize(
            net_name="Test", is_sync_mode=True)
        # non-existing tensor with wrong iteration number
        info_nonexist = d.TensorInfo(node_name="Gradients/Default/network-WithLossCell/_backbone-AlexNet/"
                                     "gradReLU/ReluGradV2-op406",
                                     slot=1, iteration=0, rank_id=0, root_graph_id=0, is_output=False)
        tensor_data = debugger_backend.read_tensors([info_nonexist])
        assert len(tensor_data) == 1
        if self.GENERATE_GOLDEN:
            self.print_read_tensors([info_nonexist], tensor_data, 1, True)
        else:
            self.compare_expect_actual_result([info_nonexist], tensor_data, 1)

    @security_off_wrap
    def test_read_tensors_wrong_is_output(self):
        debugger_backend = d.DbgServices(dump_file_path=self.temp_dir)
        _ = debugger_backend.initialize(
            net_name="Test", is_sync_mode=True)
        # non-existing tensor with wrong is_output
        info_nonexist = d.TensorInfo(node_name="Gradients/Default/network-WithLossCell/_backbone-AlexNet/"
                                     "gradReLU/ReluGradV2-op406",
                                     slot=1, iteration=1, rank_id=0, root_graph_id=0, is_output=True)
        tensor_data = debugger_backend.read_tensors([info_nonexist])
        assert len(tensor_data) == 1
        if self.GENERATE_GOLDEN:
            self.print_read_tensors([info_nonexist], tensor_data, 2, True)
        else:
            self.compare_expect_actual_result([info_nonexist], tensor_data, 2)

    def compare_expect_actual_result(self, tensor_info_list, tensor_data_list, test_index):
        """Compare actual result with golden file."""
        # test_index offsets into the golden list so each test checks its
        # own "tensor_N" entry.
        golden_file = os.path.realpath(os.path.join("../data/dump/gpu_dumps/golden/",
                                                    self.test_name + "_expected.json"))
        with open(golden_file) as f:
            expected_list = json.load(f)
        for x, (tensor_info, tensor_data) in enumerate(zip(tensor_info_list, tensor_data_list)):
            tensor_id = "tensor_"+ str(test_index+x+1)
            info = expected_list[x+test_index][tensor_id]
            assert tensor_info.node_name == info['tensor_info']['node_name']
            assert tensor_info.slot == info['tensor_info']['slot']
            assert tensor_info.iteration == info['tensor_info']['iteration']
            assert tensor_info.rank_id == info['tensor_info']['rank_id']
            assert tensor_info.root_graph_id == info['tensor_info']['root_graph_id']
            assert tensor_info.is_output == info['tensor_info']['is_output']
            # Raw bytes are compared as a uint8 list, matching how the
            # golden file was generated in print_read_tensors.
            actual_data = np.frombuffer(
                tensor_data.data_ptr, np.uint8, tensor_data.data_size).tolist()
            assert actual_data == info['tensor_data']['data']
            assert tensor_data.data_size == info['tensor_data']['size_in_bytes']
            assert tensor_data.dtype == info['tensor_data']['debugger_dtype']
            assert tensor_data.shape == info['tensor_data']['shape']

    def print_read_tensors(self, tensor_info_list, tensor_data_list, test_index, is_print):
        """Print read tensors result if GENERATE_GOLDEN is True."""
        # Appends each result to self.tensor_json; only writes the JSON
        # file when is_print is True (i.e. after the last test).
        for x, (tensor_info, tensor_data) in enumerate(zip(tensor_info_list, tensor_data_list)):
            tensor = "tensor_" + str(test_index+x+1)
            data = np.frombuffer(
                tensor_data.data_ptr, np.uint8, tensor_data.data_size).tolist()
            py_byte_size = len(tensor_data.data_ptr)
            c_byte_size = tensor_data.data_size
            if c_byte_size != py_byte_size:
                print("The python byte size of " + str(py_byte_size) +
                      " does not match the C++ byte size of " + str(c_byte_size) + "\n")
            self.tensor_json.append({
                tensor: {
                    'tensor_info': {
                        'node_name': tensor_info.node_name,
                        'slot': tensor_info.slot,
                        'iteration': tensor_info.iteration,
                        'rank_id': tensor_info.rank_id,
                        'root_graph_id': tensor_info.root_graph_id,
                        'is_output': tensor_info.is_output
                    },
                    'tensor_data': {
                        'data': data,
                        'size_in_bytes': tensor_data.data_size,
                        'debugger_dtype': tensor_data.dtype,
                        'shape': tensor_data.shape
                    }
                }
            })
        if is_print:
            with open(self.test_name + "_expected.json", "w") as dump_f:
                json.dump(self.tensor_json, dump_f, indent=4, separators=(',', ': '))
| @@ -1,127 +0,0 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| """ | |||
| Read tensor test script for offline debugger APIs. | |||
| """ | |||
| import shutil | |||
| import numpy as np | |||
| import mindspore.offline_debug.dbg_services as d | |||
| from dump_test_utils import compare_actual_with_expected, build_dump_structure | |||
| from tests.security_utils import security_off_wrap | |||
| GENERATE_GOLDEN = False | |||
| test_name = "sync_read_tensors" | |||
@security_off_wrap
def test_sync_trans_false_read_tensors():
    """Build dump structures with four known tensors plus a separate inf
    tensor, read them all back through the offline debugger, and compare
    the printed result with the golden file (unless GENERATE_GOLDEN)."""
    # input tensor with zero slot
    tensor1 = np.array([32.0, 4096.0], np.float32)
    name1 = "CudnnUniformReal.CudnnUniformReal-op391.0.0."
    info1 = d.TensorInfo(node_name="Default/CudnnUniformReal-op391",
                         slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=False)
    # input tensor with non-zero slot
    tensor2 = np.array([[0.0, 32.0, 4096.0], [4.5, 6.78, -11.0]], np.float32)
    name2 = "ReluGradV2.ReluGradV2-op406.0.0."
    info2 = d.TensorInfo(node_name="Gradients/Default/network-WithLossCell/_backbone-AlexNet/gradReLU/ReluGradV2-op406",
                         slot=1, iteration=1, rank_id=0, root_graph_id=0, is_output=False)
    # output tensor with zero slot
    tensor3 = np.array([[[7.963e-05, 4.750e-05, 2.587e-05],
                         [8.339e-05, 5.025e-05, 2.694e-05],
                         [8.565e-05, 5.156e-05, 2.658e-05]],
                        [[8.017e-05, 4.804e-05, 2.724e-05],
                         [8.392e-05, 5.126e-05, 2.843e-05],
                         [8.613e-05, 5.257e-05, 2.819e-05]],
                        [[7.617e-05, 3.827e-05, 5.305e-06],
                         [7.474e-05, 3.719e-05, 3.040e-06],
                         [7.081e-05, 3.338e-05, -2.086e-06]]], np.float32)
    name3 = "Conv2DBackpropFilter.Conv2DBackpropFilter-op424.0.0."
    info3 = d.TensorInfo(node_name="Gradients/Default/network-WithLossCell/_backbone-AlexNet/conv5-Conv2d/"
                         "gradConv2D/Conv2DBackpropFilter-op424",
                         slot=0, iteration=1, rank_id=0, root_graph_id=0, is_output=True)
    # output tensor with non-zero slot
    tensor4 = np.array([2705090541, 1099111076, 4276637100, 3586562544, 890060077, 1869062900], np.float32)
    name4 = "ReLUV2.ReLUV2-op381.0.0."
    info4 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/ReLUV2-op381",
                         slot=1, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
    # inf tensor
    inf_tensor = np.array([[1., -np.inf, np.inf, -np.inf, np.inf], [np.inf, 1., -np.inf, np.inf, np.inf]], np.float32)
    inf_name = "Reciprocal.Reciprocal-op3.0.0."
    inf_info = d.TensorInfo(node_name="Default/Reciprocal-op3",
                            slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
    tensor_name = [name1, name2, name3, name4]
    tensor_list = [tensor1, tensor2, tensor3, tensor4]
    tensor_info = [info1, info2, info3, info4]
    # Two separate dump structures: one for the regular tensors, one for
    # the inf tensor; each gets its own debugger backend.
    temp_dir = build_dump_structure(tensor_name, tensor_list, "alexnet", tensor_info)
    inf_dir = build_dump_structure([inf_name], [inf_tensor], "Inf", [inf_info])
    debugger_backend1 = d.DbgServices(dump_file_path=temp_dir)
    _ = debugger_backend1.initialize(net_name="alexnet", is_sync_mode=True)
    tensor_data = debugger_backend1.read_tensors(tensor_info)
    debugger_backend2 = d.DbgServices(dump_file_path=inf_dir)
    _ = debugger_backend2.initialize(net_name="Inf", is_sync_mode=True)
    tensor_data_inf = debugger_backend2.read_tensors([inf_info])
    # Merge the inf results into the main lists before printing/comparing.
    tensor_info.extend([inf_info])
    tensor_data.extend(tensor_data_inf)
    shutil.rmtree(temp_dir)
    shutil.rmtree(inf_dir)
    print_read_tensors(tensor_info, tensor_data)
    if not GENERATE_GOLDEN:
        assert compare_actual_with_expected(test_name)
def print_read_tensors(tensor_info, tensor_data):
    """Write tensor info/data attributes to a text file for comparison.

    Writes to ``test_name + ".expected"`` when GENERATE_GOLDEN is True,
    otherwise to ``test_name + ".actual"``.

    Args:
        tensor_info: list of TensorInfo objects that were queried.
        tensor_data: list of tensor data objects returned by read_tensors,
            parallel to ``tensor_info``.
    """
    # Fix: use a context manager so the file is closed even if an
    # attribute access or numpy conversion raises mid-loop (the original
    # bare open()/close() leaked the handle on exception).
    file_name = test_name + (".expected" if GENERATE_GOLDEN else ".actual")
    with open(file_name, "w") as f_write:
        for x, _ in enumerate(tensor_info):
            f_write.write(
                "-----------------------------------------------------------\n")
            f_write.write("tensor_info_" + str(x+1) + " attributes:\n")
            f_write.write("node name = " + tensor_info[x].node_name + "\n")
            f_write.write("slot = " + str(tensor_info[x].slot) + "\n")
            f_write.write("iteration = " + str(tensor_info[x].iteration) + "\n")
            f_write.write("rank_id = " + str(tensor_info[x].rank_id) + "\n")
            f_write.write("root_graph_id = " +
                          str(tensor_info[x].root_graph_id) + "\n")
            f_write.write("is_output = " +
                          str(tensor_info[x].is_output) + "\n")
            f_write.write("\n")
            f_write.write("tensor_data_" + str(x+1) + " attributes:\n")
            f_write.write("data (printed in uint8) = " + str(np.frombuffer(
                tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size)) + "\n")
            # Sanity check: the Python buffer length should agree with the
            # byte size reported by the C++ side.
            py_byte_size = len(tensor_data[x].data_ptr)
            c_byte_size = tensor_data[x].data_size
            if c_byte_size != py_byte_size:
                f_write.write("The python byte size of " + str(py_byte_size) +
                              " does not match the C++ byte size of " + str(c_byte_size) + "\n")
            f_write.write("size in bytes = " +
                          str(tensor_data[x].data_size) + "\n")
            f_write.write("debugger dtype = " + str(tensor_data[x].dtype) + "\n")
            f_write.write("shape = " + str(tensor_data[x].shape) + "\n")
# Allow running this test file directly as a script.
if __name__ == "__main__":
    test_sync_trans_false_read_tensors()
| @@ -15,82 +15,178 @@ | |||
| """ | |||
| Read tensor base and statistics test script for offline debugger APIs. | |||
| """ | |||
| import os | |||
| import shutil | |||
| import json | |||
| import numpy as np | |||
| import mindspore.offline_debug.dbg_services as d | |||
| from dump_test_utils import compare_actual_with_expected, build_dump_structure | |||
| from dump_test_utils import build_dump_structure | |||
| from tests.security_utils import security_off_wrap | |||
| GENERATE_GOLDEN = False | |||
| test_name = "sync_read_tensors_base_stat" | |||
@security_off_wrap
def test_sync_read_tensors_base_stat():
    """Read base info and statistics for a normal tensor, an inf tensor,
    a NaN tensor, and a tensor with an invalid node name, then compare
    the printed result with the golden file (unless GENERATE_GOLDEN)."""
    value_tensor = np.array([[7.5, 8.56, -9.78], [10.0, -11.0, 0.0]], np.float32)
    name1 = "Add.Add-op4.0.0."
    info1 = d.TensorInfo(node_name="Default/Add-op4",
                         slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
    inf_tensor = np.array([[1., -np.inf, np.inf, -np.inf, np.inf], [np.inf, 1., -np.inf, np.inf, np.inf]], np.float32)
    name2 = "Reciprocal.Reciprocal-op3.0.0."
    info2 = d.TensorInfo(node_name="Default/Reciprocal-op3",
                         slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
    nan_tensor = np.array([-2.1754317, 1.9901361, np.nan, np.nan, -1.8091936], np.float32)
    name3 = "ReduceMean.ReduceMean-op92.0.0."
    info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-MockModel/ReduceMean-op92",
                         slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
    # The node name here deliberately matches nothing in the dump.
    invalid_tensor = np.array([[1.1, -2.2], [3.3, -4.4]], np.float32)
    name4 = "Add.Add-op1.0.0."
    info4 = d.TensorInfo(node_name="invalid_name_for_test",
                         slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
    tensor_info = [info1, info2, info3, info4]
    test_path = build_dump_structure([name1, name2, name3, name4],
                                     [value_tensor, inf_tensor, nan_tensor, invalid_tensor],
                                     "Test", tensor_info)
    debugger_backend = d.DbgServices(
        dump_file_path=test_path, verbose=True)
    _ = debugger_backend.initialize(
        net_name="Test", is_sync_mode=True)
    tensor_base_data_list = debugger_backend.read_tensor_base(tensor_info)
    tensor_stat_data_list = debugger_backend.read_tensor_stats(tensor_info)
    shutil.rmtree(test_path)
    print_read_tensors(tensor_info, tensor_base_data_list, tensor_stat_data_list)
    if not GENERATE_GOLDEN:
        assert compare_actual_with_expected(test_name)
def print_read_tensors(tensor_info, tensor_base_data_list, tensor_stat_data_list):
    """Write tensor info, base info, and statistics to a text file.

    Writes to ``test_name + ".expected"`` when GENERATE_GOLDEN is True,
    otherwise to ``test_name + ".actual"``.

    Args:
        tensor_info: list of TensorInfo objects that were queried.
        tensor_base_data_list: results of read_tensor_base, parallel list.
        tensor_stat_data_list: results of read_tensor_stats, parallel list.
    """
    # Fix: use a context manager so the file is closed even if an
    # attribute access raises mid-loop (the original bare open()/close()
    # leaked the handle on exception).
    file_name = test_name + (".expected" if GENERATE_GOLDEN else ".actual")
    with open(file_name, "w") as f_write:
        for x, (tensor_info_item, tensor_base, tensor_stat) in enumerate(zip(tensor_info,
                                                                             tensor_base_data_list,
                                                                             tensor_stat_data_list)):
            f_write.write(
                "-----------------------------------------------------------\n")
            f_write.write("tensor_info_" + str(x+1) + " attributes:\n")
            f_write.write("node name = " + tensor_info_item.node_name + "\n")
            f_write.write("slot = " + str(tensor_info_item.slot) + "\n")
            f_write.write("iteration = " + str(tensor_info_item.iteration) + "\n")
            f_write.write("rank_id = " + str(tensor_info_item.rank_id) + "\n")
            f_write.write("root_graph_id = " +
                          str(tensor_info_item.root_graph_id) + "\n")
            f_write.write("is_output = " +
                          str(tensor_info_item.is_output) + "\n")
            f_write.write("\n")
            f_write.write("tensor_base_info:\n")
            f_write.write(str(tensor_base) + "\n")
            f_write.write("\n")
            f_write.write("tensor_stat_info:\n")
            f_write.write(str(tensor_stat) + '\n')
class TestOfflineReadTensorBaseStat:
    """Test read tensor base stat for offline debugger.

    setup_class builds one dump structure with a normal tensor, an inf
    tensor, a NaN tensor, and a tensor with an invalid node name; each
    test reads base info and statistics for one of them and compares
    against a golden JSON file.
    """
    # When True, tests append results to tensor_json and (on the last test)
    # write a new golden file instead of comparing.
    GENERATE_GOLDEN = False
    test_name = "read_tensors_base_stat"
    # Accumulates per-test result dicts across the class (shared class attr).
    tensor_json = []
    test_path = ''

    @classmethod
    def setup_class(cls):
        """Init setup for offline read tensor test"""
        value_tensor = np.array([[7.5, 8.56, -9.78], [10.0, -11.0, 0.0]], np.float32)
        name1 = "Add.Add-op4.0.0."
        info1 = d.TensorInfo(node_name="Default/Add-op4",
                             slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
        inf_tensor = np.array([[1., -np.inf, np.inf, -np.inf, np.inf],
                               [np.inf, 1., -np.inf, np.inf, np.inf]], np.float32)
        name2 = "Reciprocal.Reciprocal-op3.0.0."
        info2 = d.TensorInfo(node_name="Default/Reciprocal-op3",
                             slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
        nan_tensor = np.array([-2.1754317, 1.9901361, np.nan, np.nan, -1.8091936], np.float32)
        name3 = "ReduceMean.ReduceMean-op92.0.0."
        info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-MockModel/ReduceMean-op92",
                             slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
        # The node name here deliberately matches nothing in the dump.
        invalid_tensor = np.array([[1.1, -2.2], [3.3, -4.4]], np.float32)
        name4 = "Add.Add-op1.0.0."
        info4 = d.TensorInfo(node_name="invalid_name_for_test",
                             slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=True)
        cls.tensor_info_1 = [info1]
        cls.tensor_info_2 = [info2]
        cls.tensor_info_3 = [info3]
        cls.tensor_info_4 = [info4]
        cls.tensor_info = [info1, info2, info3, info4]
        cls.test_path = build_dump_structure([name1, name2, name3, name4],
                                             [value_tensor, inf_tensor, nan_tensor, invalid_tensor],
                                             "Test", cls.tensor_info)
        # A single backend is shared by all tests in this class.
        cls.debugger_backend = d.DbgServices(dump_file_path=cls.test_path, verbose=True)
        _ = cls.debugger_backend.initialize(net_name="Test", is_sync_mode=True)

    @classmethod
    def teardown_class(cls):
        # Remove the temporary dump structure created in setup_class.
        shutil.rmtree(cls.test_path)

    @security_off_wrap
    def test_read_value_tensors_base_stat(self):
        tensor_base_data_list = self.debugger_backend.read_tensor_base(self.tensor_info_1)
        tensor_stat_data_list = self.debugger_backend.read_tensor_stats(self.tensor_info_1)
        if self.GENERATE_GOLDEN:
            self.print_read_tensors(self.tensor_info_1, tensor_base_data_list, tensor_stat_data_list, 0, False)
        else:
            self.compare_expect_actual_result(self.tensor_info_1, tensor_base_data_list, tensor_stat_data_list, 0)

    @security_off_wrap
    def test_read_inf_tensors_base_stat(self):
        tensor_base_data_list = self.debugger_backend.read_tensor_base(self.tensor_info_2)
        tensor_stat_data_list = self.debugger_backend.read_tensor_stats(self.tensor_info_2)
        if self.GENERATE_GOLDEN:
            self.print_read_tensors(self.tensor_info_2, tensor_base_data_list, tensor_stat_data_list, 1, False)
        else:
            self.compare_expect_actual_result(self.tensor_info_2, tensor_base_data_list, tensor_stat_data_list, 1)

    @security_off_wrap
    def test_read_nan_tensors_base_stat(self):
        tensor_base_data_list = self.debugger_backend.read_tensor_base(self.tensor_info_3)
        tensor_stat_data_list = self.debugger_backend.read_tensor_stats(self.tensor_info_3)
        if self.GENERATE_GOLDEN:
            self.print_read_tensors(self.tensor_info_3, tensor_base_data_list, tensor_stat_data_list, 2, False)
        else:
            self.compare_expect_actual_result(self.tensor_info_3, tensor_base_data_list, tensor_stat_data_list, 2)

    @security_off_wrap
    def test_read_inv_tensors_base_stat(self):
        # Last test passes is_print=True so the golden file gets written
        # (when GENERATE_GOLDEN) after all results are collected.
        tensor_base_data_list = self.debugger_backend.read_tensor_base(self.tensor_info_4)
        tensor_stat_data_list = self.debugger_backend.read_tensor_stats(self.tensor_info_4)
        if self.GENERATE_GOLDEN:
            self.print_read_tensors(self.tensor_info_4, tensor_base_data_list, tensor_stat_data_list, 3, True)
        else:
            self.compare_expect_actual_result(self.tensor_info_4, tensor_base_data_list, tensor_stat_data_list, 3)

    def compare_expect_actual_result(self, tensor_info, tensor_base_data_list, tensor_stat_data_list, test_index):
        """Compare actual result with golden file."""
        # test_index offsets into the golden list so each test checks its
        # own "testN" entry.
        golden_file = os.path.realpath(os.path.join("../data/dump/gpu_dumps/golden/",
                                                    self.test_name + "_expected.json"))
        with open(golden_file) as f:
            expected_list = json.load(f)
        for x, (tensor_info_item, tensor_base, tensor_stat) in enumerate(zip(tensor_info,
                                                                             tensor_base_data_list,
                                                                             tensor_stat_data_list)):
            test_id = "test"+ str(test_index+x+1)
            info_json = expected_list[x+test_index][test_id]['tensor_info']
            base_json = expected_list[x+test_index][test_id]['tensor_base_info']
            stat_json = expected_list[x+test_index][test_id]['tensor_stat_info']
            assert tensor_info_item.node_name == info_json['node_name']
            assert tensor_info_item.slot == info_json['slot']
            assert tensor_info_item.iteration == info_json['iteration']
            assert tensor_info_item.rank_id == info_json['rank_id']
            assert tensor_info_item.root_graph_id == info_json['root_graph_id']
            assert tensor_info_item.is_output == info_json['is_output']
            assert tensor_base.data_size == base_json['size_in_bytes']
            assert tensor_base.dtype == base_json['debugger_dtype']
            assert tensor_base.shape == base_json['shape']
            assert tensor_stat.data_size == stat_json['size_in_bytes']
            assert tensor_stat.dtype == stat_json['debugger_dtype']
            assert tensor_stat.shape == stat_json['shape']
            assert tensor_stat.is_bool == stat_json['is_bool']
            # NOTE: 'max_vaue' is misspelled on purpose here — it matches
            # the key used in the existing golden JSON file.
            assert tensor_stat.max_value == stat_json['max_vaue']
            assert tensor_stat.min_value == stat_json['min_value']
            assert tensor_stat.avg_value == stat_json['avg_value']
            assert tensor_stat.count == stat_json['count']
            assert tensor_stat.neg_zero_count == stat_json['neg_zero_count']
            assert tensor_stat.pos_zero_count == stat_json['pos_zero_count']
            assert tensor_stat.nan_count == stat_json['nan_count']
            assert tensor_stat.neg_inf_count == stat_json['neg_inf_count']
            assert tensor_stat.pos_inf_count == stat_json['pos_inf_count']
            assert tensor_stat.zero_count == stat_json['zero_count']

    def print_read_tensors(self, tensor_info, tensor_base_data_list, tensor_stat_data_list, test_index, is_print):
        """Print read tensors info."""
        # Appends each result to self.tensor_json; only writes the JSON
        # file when is_print is True (i.e. after the last test).
        for x, (tensor_info_item, tensor_base, tensor_stat) in enumerate(zip(tensor_info,
                                                                             tensor_base_data_list,
                                                                             tensor_stat_data_list)):
            test_name = "test" + str(test_index+x+1)
            self.tensor_json.append({
                test_name: {
                    'tensor_info': {
                        'node_name': tensor_info_item.node_name,
                        'slot': tensor_info_item.slot,
                        'iteration': tensor_info_item.iteration,
                        'rank_id': tensor_info_item.rank_id,
                        'root_graph_id': tensor_info_item.root_graph_id,
                        'is_output': tensor_info_item.is_output
                    },
                    'tensor_base_info': {
                        'size_in_bytes': tensor_base.data_size,
                        'debugger_dtype': tensor_base.dtype,
                        'shape': tensor_base.shape
                    },
                    'tensor_stat_info': {
                        'size_in_bytes': tensor_stat.data_size,
                        'debugger_dtype': tensor_stat.dtype,
                        'shape': tensor_stat.shape,
                        'is_bool': tensor_stat.is_bool,
                        # Misspelled key kept for golden-file compatibility.
                        'max_vaue': tensor_stat.max_value,
                        'min_value': tensor_stat.min_value,
                        'avg_value': tensor_stat.avg_value,
                        'count': tensor_stat.count,
                        'neg_zero_count': tensor_stat.neg_zero_count,
                        'pos_zero_count': tensor_stat.pos_zero_count,
                        'nan_count': tensor_stat.nan_count,
                        'neg_inf_count': tensor_stat.neg_inf_count,
                        'pos_inf_count': tensor_stat.pos_inf_count,
                        'zero_count': tensor_stat.zero_count
                    }
                }})
        if is_print:
            with open(self.test_name + "_expected.json", "w") as dump_f:
                json.dump(self.tensor_json, dump_f, indent=4, separators=(',', ': '))
| @@ -1,107 +0,0 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| """ | |||
| Read tensor test script for offline debugger APIs. | |||
| """ | |||
| import shutil | |||
| import numpy as np | |||
| import mindspore.offline_debug.dbg_services as d | |||
| from dump_test_utils import compare_actual_with_expected, build_dump_structure | |||
| from tests.security_utils import security_off_wrap | |||
| GENERATE_GOLDEN = False | |||
| test_name = "sync_read_tensors_nonexist_node" | |||
@security_off_wrap
def test_sync_trans_read_tensors_nonexist_node():
    """Build a dump structure with two known tensors, then query two
    non-existing tensors (wrong op name, wrong iteration) and compare
    the printed result with the golden file (unless GENERATE_GOLDEN)."""
    tensor1 = np.array([32.0, 4096.0], np.float32)
    name1 = "CudnnUniformReal.CudnnUniformReal-op391.0.0."
    info1 = d.TensorInfo(node_name="Default/CudnnUniformReal-op391",
                         slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=False)
    tensor2 = np.array([[0.0, 32.0, 4096.0], [4.5, 6.78, -11.0]], np.float32)
    name2 = "ReluGradV2.ReluGradV2-op406.0.0."
    info2 = d.TensorInfo(node_name="Gradients/Default/network-WithLossCell/_backbone-AlexNet/gradReLU/ReluGradV2-op406",
                         slot=1, iteration=1, rank_id=0, root_graph_id=0, is_output=False)
    # non-existing tensor with wrong op name
    info3 = d.TensorInfo(node_name="Default/CudnnUniformReal-op390",
                         slot=0, iteration=0, rank_id=0, root_graph_id=0, is_output=False)
    # non-existing tensor with wrong iteration number
    info4 = d.TensorInfo(node_name="Gradients/Default/network-WithLossCell/_backbone-AlexNet/gradReLU/ReluGradV2-op406",
                         slot=1, iteration=0, rank_id=0, root_graph_id=0, is_output=False)
    tensor_name = [name1, name2]
    tensor_create_info = [info1, info2]
    tensor_list = [tensor1, tensor2]
    temp_dir = build_dump_structure(tensor_name, tensor_list, "alexnet", tensor_create_info)
    # Only the non-existing infos are queried against the backend.
    tensor_check_info = [info3, info4]
    debugger_backend = d.DbgServices(dump_file_path=temp_dir)
    _ = debugger_backend.initialize(
        net_name="alexnet", is_sync_mode=True)
    tensor_data = debugger_backend.read_tensors(tensor_check_info)
    # Check the length of tensor list
    assert len(tensor_check_info) == 2
    assert len(tensor_data) == 2
    print_read_tensors(tensor_check_info, tensor_data)
    shutil.rmtree(temp_dir)
    if not GENERATE_GOLDEN:
        assert compare_actual_with_expected(test_name)
def print_read_tensors(tensor_info, tensor_data):
    """Write tensor info/data attributes to a text file for comparison.

    Writes to ``test_name + ".expected"`` when GENERATE_GOLDEN is True,
    otherwise to ``test_name + ".actual"``.

    Args:
        tensor_info: list of TensorInfo objects that were queried.
        tensor_data: list of tensor data objects returned by read_tensors,
            parallel to ``tensor_info``.
    """
    # Fix: use a context manager so the file is closed even if an
    # attribute access or numpy conversion raises mid-loop (the original
    # bare open()/close() leaked the handle on exception).
    file_name = test_name + (".expected" if GENERATE_GOLDEN else ".actual")
    with open(file_name, "w") as f_write:
        for x, _ in enumerate(tensor_info):
            f_write.write(
                "-----------------------------------------------------------\n")
            f_write.write("tensor_info_" + str(x + 1) + " attributes:\n")
            f_write.write("node name = " + tensor_info[x].node_name + "\n")
            f_write.write("slot = " + str(tensor_info[x].slot) + "\n")
            f_write.write("iteration = " + str(tensor_info[x].iteration) + "\n")
            f_write.write("rank_id = " + str(tensor_info[x].rank_id) + "\n")
            f_write.write("root_graph_id = " +
                          str(tensor_info[x].root_graph_id) + "\n")
            f_write.write("is_output = " +
                          str(tensor_info[x].is_output) + "\n")
            f_write.write("\n")
            f_write.write("tensor_data_" + str(x + 1) + " attributes:\n")
            f_write.write("data (printed in uint8) = " + str(np.frombuffer(
                tensor_data[x].data_ptr, np.uint8, tensor_data[x].data_size)) + "\n")
            # Sanity check: the Python buffer length should agree with the
            # byte size reported by the C++ side.
            py_byte_size = len(tensor_data[x].data_ptr)
            c_byte_size = tensor_data[x].data_size
            if c_byte_size != py_byte_size:
                f_write.write("The python byte size of " + str(py_byte_size) +
                              " does not match the C++ byte size of " + str(c_byte_size) + "\n")
            f_write.write("size in bytes = " +
                          str(tensor_data[x].data_size) + "\n")
            f_write.write("debugger dtype = " + str(tensor_data[x].dtype) + "\n")
            f_write.write("shape = " + str(tensor_data[x].shape) + "\n")
# Allow running this test file directly as a script.
if __name__ == "__main__":
    test_sync_trans_read_tensors_nonexist_node()
| @@ -1,168 +0,0 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| """ | |||
| Watchpoints test script for offline debugger APIs. | |||
| """ | |||
| import shutil | |||
| import numpy as np | |||
| import mindspore.offline_debug.dbg_services as d | |||
| from dump_test_utils import compare_actual_with_expected, build_dump_structure | |||
| from tests.security_utils import security_off_wrap | |||
| GENERATE_GOLDEN = False | |||
| test_name = "sync_watchpoints" | |||
| @security_off_wrap | |||
| def test_sync_trans_false_watchpoints(): | |||
| if GENERATE_GOLDEN: | |||
| f_write = open(test_name + ".expected", "w") | |||
| else: | |||
| f_write = open(test_name + ".actual", "w") | |||
| name1 = "Conv2D.Conv2D-op369.0.0." | |||
| tensor1 = np.array([[[-1.2808e-03, 7.7629e-03, 1.9241e-02], | |||
| [-1.3931e-02, 8.9359e-04, -1.1520e-02], | |||
| [-6.3248e-03, 1.8749e-03, 1.0132e-02]], | |||
| [[-2.5520e-03, -6.0005e-03, -5.1918e-03], | |||
| [-2.7866e-03, 2.5487e-04, 8.4782e-04], | |||
| [-4.6310e-03, -8.9111e-03, -8.1778e-05]], | |||
| [[1.3914e-03, 6.0844e-04, 1.0643e-03], | |||
| [-2.0966e-02, -1.2865e-03, -1.8692e-03], | |||
| [-1.6647e-02, 1.0233e-03, -4.1313e-03]]], np.float32) | |||
| info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/Conv2D-op369", | |||
| slot=1, iteration=2, rank_id=0, root_graph_id=0, is_output=False) | |||
| name2 = "Parameter.fc2.bias.0.0." | |||
| tensor2 = np.array([-5.0167350e-06, 1.2509107e-05, -4.3148934e-06, 8.1415592e-06, | |||
| 2.1177532e-07, 2.9952851e-06], np.float32) | |||
| info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc2.bias", | |||
| slot=0, iteration=2, rank_id=0, root_graph_id=0, is_output=True) | |||
| tensor3 = np.array([2.9060817e-07, -5.1009415e-06, -2.8662325e-06, 2.6036503e-06, | |||
| -5.1546101e-07, 6.0798648e-06], np.float32) | |||
| info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc2.bias", | |||
| slot=0, iteration=3, rank_id=0, root_graph_id=0, is_output=True) | |||
| name3 = "Parameter.fc3.bias.0.0." | |||
| tensor4 = np.array([2.2930422e-04, -3.6369250e-04, 7.1337068e-04, -1.9567949e-05], np.float32) | |||
| info4 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/Parameter[6]_11/fc3.bias", | |||
| slot=0, iteration=2, rank_id=0, root_graph_id=0, is_output=True) | |||
| tensor_info = [info1, info2, info3, info4] | |||
| tensor_name = [name1, name2, name2, name3] | |||
| tensor_list = [tensor1, tensor2, tensor3, tensor4] | |||
| temp_dir = build_dump_structure(tensor_name, tensor_list, "alexnet", tensor_info) | |||
| debugger_backend = d.DbgServices(dump_file_path=temp_dir) | |||
| _ = debugger_backend.initialize(net_name="alexnet", is_sync_mode=True) | |||
| # NOTES: | |||
| # -> watch_condition=6 is MIN_LT | |||
| # -> watch_condition=18 is CHANGE_TOO_LARGE | |||
| # -> watch_condition=20 is NOT_CHANGE | |||
| # test 1: watchpoint set and hit (watch_condition=6) | |||
| param1 = d.Parameter(name="param", disabled=False, value=0.0) | |||
| _ = debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6, | |||
| check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/" | |||
| "Conv2D-op369": | |||
| {"rank_id": [0], "root_graph_id": [0], "is_output": False | |||
| }}, parameter_list=[param1]) | |||
| watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2) | |||
| if len(watchpoint_hits_test_1) != 1: | |||
| f_write.write( | |||
| "ERROR -> test 1: watchpoint set but not hit just once\n") | |||
| print_watchpoint_hits(watchpoint_hits_test_1, 1, f_write) | |||
| # test 2: watchpoint remove and ensure it's not hit | |||
| _ = debugger_backend.remove_watchpoint(watchpoint_id=1) | |||
| watchpoint_hits_test_2 = debugger_backend.check_watchpoints(iteration=2) | |||
| if watchpoint_hits_test_2: | |||
| f_write.write("ERROR -> test 2: watchpoint removed but hit\n") | |||
| # test 3: watchpoint set and not hit, then remove | |||
| param2 = d.Parameter(name="param", disabled=False, value=-1000.0) | |||
| _ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6, | |||
| check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/" | |||
| "Conv2D-op369": | |||
| {"rank_id": [0], "root_graph_id": [0], "is_output": False | |||
| }}, parameter_list=[param2]) | |||
| watchpoint_hits_test_3 = debugger_backend.check_watchpoints(iteration=2) | |||
| if watchpoint_hits_test_3: | |||
| f_write.write( | |||
| "ERROR -> test 3: watchpoint set but not supposed to be hit\n") | |||
| _ = debugger_backend.remove_watchpoint(watchpoint_id=2) | |||
| # test 4: weight change watchpoint set and hit | |||
| param_abs_mean_update_ratio_gt = d.Parameter( | |||
| name="abs_mean_update_ratio_gt", disabled=False, value=0.0) | |||
| param_epsilon = d.Parameter(name="epsilon", disabled=True, value=0.0) | |||
| _ = debugger_backend.add_watchpoint(watchpoint_id=3, watch_condition=18, | |||
| check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/" | |||
| "Parameter[6]_11/fc2.bias": | |||
| {"rank_id": [0], "root_graph_id": [0], "is_output": True | |||
| }}, parameter_list=[param_abs_mean_update_ratio_gt, | |||
| param_epsilon]) | |||
| watchpoint_hits_test_4 = debugger_backend.check_watchpoints(iteration=3) | |||
| if len(watchpoint_hits_test_4) != 1: | |||
| f_write.write("ERROR -> test 4: watchpoint weight change set but not hit just once\n") | |||
| print_watchpoint_hits(watchpoint_hits_test_4, 4, f_write) | |||
| f_write.close() | |||
| shutil.rmtree(temp_dir) | |||
| if not GENERATE_GOLDEN: | |||
| assert compare_actual_with_expected(test_name) | |||
def print_watchpoint_hits(watchpoint_hits, test_id, f_write):
    """Write a human-readable report of every watchpoint hit to a stream.

    Args:
        watchpoint_hits: iterable of watchpoint-hit objects exposing
            name, slot, condition, watchpoint_id, parameters, error_code,
            rank_id and root_graph_id attributes (as returned by
            check_watchpoints).
        test_id: numeric id of the test, embedded in the header line.
        f_write: writable text stream receiving the report.
    """
    for hit in watchpoint_hits:
        f_write.write(
            "-----------------------------------------------------------\n")
        # Keep the exact header format — golden files compare against it.
        f_write.write("watchpoint_hit for test_%u attributes:" %
                      test_id + "\n")
        f_write.write("name = " + hit.name + "\n")
        f_write.write("slot = " + str(hit.slot) + "\n")
        f_write.write("condition = " + str(hit.condition) + "\n")
        f_write.write("watchpoint_id = " + str(hit.watchpoint_id) + "\n")
        for p, param in enumerate(hit.parameters):
            # One line per parameter attribute, prefixed by its index.
            prefix = "parameter " + str(p)
            f_write.write(prefix + " name = " + param.name + "\n")
            f_write.write(prefix + " disabled = " + str(param.disabled) + "\n")
            f_write.write(prefix + " value = " + str(param.value) + "\n")
            f_write.write(prefix + " hit = " + str(param.hit) + "\n")
            f_write.write(prefix + " actual_value = " +
                          str(param.actual_value) + "\n")
        f_write.write("error code = " + str(hit.error_code) + "\n")
        f_write.write("rank_id = " + str(hit.rank_id) + "\n")
        f_write.write("root_graph_id = " + str(hit.root_graph_id) + "\n")
if __name__ == "__main__":
    # Allow running this test script directly, without a test runner.
    test_sync_trans_false_watchpoints()
| @@ -0,0 +1,238 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================== | |||
| """ | |||
| Watchpoints test script for offline debugger APIs. | |||
| """ | |||
| import os | |||
| import json | |||
| import shutil | |||
| import numpy as np | |||
| import mindspore.offline_debug.dbg_services as d | |||
| from dump_test_utils import build_dump_structure | |||
| from tests.security_utils import security_off_wrap | |||
class TestOfflineWatchpoints:
    """Test watchpoint for offline debugger."""
    # Set to True to regenerate the golden file instead of asserting against it.
    GENERATE_GOLDEN = False
    # Stem of the golden-result file: "<test_name>_expected.json".
    test_name = "watchpoints"
    # Class-level accumulator: each test appends its hits here so a single
    # golden file covering all tests can be dumped at the end.
    watchpoint_hits_json = []
    # Root of the synthetic dump tree built in setup_class.
    temp_dir = ''

    @classmethod
    def setup_class(cls):
        """Init setup for offline watchpoints test"""
        # Conv2D output at slot 1, iteration 2 — target of the MIN_LT tests.
        name1 = "Conv2D.Conv2D-op369.0.0.1"
        tensor1 = np.array([[[-1.2808e-03, 7.7629e-03, 1.9241e-02],
                             [-1.3931e-02, 8.9359e-04, -1.1520e-02],
                             [-6.3248e-03, 1.8749e-03, 1.0132e-02]],
                            [[-2.5520e-03, -6.0005e-03, -5.1918e-03],
                             [-2.7866e-03, 2.5487e-04, 8.4782e-04],
                             [-4.6310e-03, -8.9111e-03, -8.1778e-05]],
                            [[1.3914e-03, 6.0844e-04, 1.0643e-03],
                             [-2.0966e-02, -1.2865e-03, -1.8692e-03],
                             [-1.6647e-02, 1.0233e-03, -4.1313e-03]]], np.float32)
        info1 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/conv1-Conv2d/Conv2D-op369",
                             slot=1, iteration=2, rank_id=0, root_graph_id=0, is_output=False)
        # fc2.bias at iterations 2 and 3 — the pair drives the weight-change
        # (CHANGE_TOO_LARGE) test.
        name2 = "Parameter.fc2.bias.0.0.2"
        tensor2 = np.array([-5.0167350e-06, 1.2509107e-05, -4.3148934e-06, 8.1415592e-06,
                            2.1177532e-07, 2.9952851e-06], np.float32)
        info2 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/"
                                       "Parameter[6]_11/fc2.bias",
                             slot=0, iteration=2, rank_id=0, root_graph_id=0, is_output=True)
        tensor3 = np.array([2.9060817e-07, -5.1009415e-06, -2.8662325e-06, 2.6036503e-06,
                            -5.1546101e-07, 6.0798648e-06], np.float32)
        info3 = d.TensorInfo(node_name="Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/"
                                       "Parameter[6]_11/fc2.bias",
                             slot=0, iteration=3, rank_id=0, root_graph_id=0, is_output=True)
        # Second watched node, used to verify hit ordering with two watchpoints.
        name3 = "CudnnUniformReal.CudnnUniformReal-op391.0.0.3"
        tensor4 = np.array([-32.0, -4096.0], np.float32)
        info4 = d.TensorInfo(node_name="Default/CudnnUniformReal-op391",
                             slot=0, iteration=2, rank_id=0, root_graph_id=0, is_output=False)
        tensor_info = [info1, info2, info3, info4]
        tensor_name = [name1, name2, name2, name3]
        tensor_list = [tensor1, tensor2, tensor3, tensor4]
        # Build the on-disk dump layout the offline debugger reads from.
        cls.temp_dir = build_dump_structure(tensor_name, tensor_list, "Test", tensor_info)

    @classmethod
    def teardown_class(cls):
        # Remove the synthetic dump tree created by setup_class.
        shutil.rmtree(cls.temp_dir)

    @security_off_wrap
    def test_sync_add_remove_watchpoints_hit(self):
        """Sync mode: two watchpoints hit in order; after removing one,
        only the other still hits."""
        # NOTES: watch_condition=6 is MIN_LT
        # watchpoint set and hit (watch_condition=6), then remove it
        debugger_backend = d.DbgServices(dump_file_path=self.temp_dir)
        _ = debugger_backend.initialize(net_name="Test", is_sync_mode=True)
        param = d.Parameter(name="param", disabled=False, value=0.0)
        _ = debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6,
                                            check_node_list={"Default/network-WithLossCell/_backbone-AlexNet"
                                                             "/conv1-Conv2d/Conv2D-op369":
                                                             {"rank_id": [0], "root_graph_id": [0], "is_output": False
                                                              }}, parameter_list=[param])
        # add second watchpoint to check the watchpoint hit in correct order
        param1 = d.Parameter(name="param", disabled=False, value=10.0)
        _ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6,
                                            check_node_list={"Default/CudnnUniformReal-op391":
                                                             {"rank_id": [0], "root_graph_id": [0], "is_output": False
                                                              }}, parameter_list=[param1])
        watchpoint_hits_test = debugger_backend.check_watchpoints(iteration=2)
        assert len(watchpoint_hits_test) == 2
        if self.GENERATE_GOLDEN:
            # Accumulate golden entries starting at index 0; final dump happens later.
            self.print_watchpoint_hits(watchpoint_hits_test, 0, False)
        else:
            self.compare_expect_actual_result(watchpoint_hits_test, 0)
        _ = debugger_backend.remove_watchpoint(watchpoint_id=1)
        watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2)
        assert len(watchpoint_hits_test_1) == 1

    @security_off_wrap
    def test_sync_add_remove_watchpoints_not_hit(self):
        """Sync mode: a watchpoint whose threshold cannot trigger stays silent."""
        # watchpoint set and not hit(watch_condition=6), then remove
        debugger_backend = d.DbgServices(dump_file_path=self.temp_dir)
        _ = debugger_backend.initialize(net_name="Test", is_sync_mode=True)
        # value=-1000.0 is below every tensor minimum, so MIN_LT never fires.
        param = d.Parameter(name="param", disabled=False, value=-1000.0)
        _ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6,
                                            check_node_list={"Default/network-WithLossCell/_backbone-AlexNet"
                                                             "/conv1-Conv2d/Conv2D-op369":
                                                             {"rank_id": [0], "root_graph_id": [0], "is_output": False
                                                              }}, parameter_list=[param])
        watchpoint_hits_test = debugger_backend.check_watchpoints(iteration=2)
        assert not watchpoint_hits_test
        _ = debugger_backend.remove_watchpoint(watchpoint_id=2)

    @security_off_wrap
    def test_sync_weight_change_watchpoints_hit(self):
        """Sync mode: weight-change watchpoint on fc2.bias hits at iteration 3."""
        # NOTES: watch_condition=18 is CHANGE_TOO_LARGE
        # weight change watchpoint set and hit(watch_condition=18)
        debugger_backend = d.DbgServices(dump_file_path=self.temp_dir)
        _ = debugger_backend.initialize(net_name="Test", is_sync_mode=True)
        param_abs_mean_update_ratio_gt = d.Parameter(
            name="abs_mean_update_ratio_gt", disabled=False, value=0.0)
        param_epsilon = d.Parameter(name="epsilon", disabled=True, value=0.0)
        _ = debugger_backend.add_watchpoint(watchpoint_id=3, watch_condition=18,
                                            check_node_list={"Default/network-WithLossCell/_backbone-AlexNet/fc3-Dense/"
                                                             "Parameter[6]_11/fc2.bias":
                                                             {"rank_id": [0], "root_graph_id": [0], "is_output": True
                                                              }}, parameter_list=[param_abs_mean_update_ratio_gt,
                                                                                  param_epsilon])
        watchpoint_hits_test = debugger_backend.check_watchpoints(iteration=3)
        assert len(watchpoint_hits_test) == 1
        if self.GENERATE_GOLDEN:
            # test_index=2 follows the two hits recorded by the first test;
            # is_print=True makes this the call that writes the golden file.
            self.print_watchpoint_hits(watchpoint_hits_test, 2, True)
        else:
            self.compare_expect_actual_result(watchpoint_hits_test, 2)

    @security_off_wrap
    def test_async_add_remove_watchpoint_hit(self):
        """Async mode: watchpoint hits once, then is removed and stays silent."""
        # watchpoint set and hit(watch_condition=6) in async mode, then remove
        debugger_backend = d.DbgServices(dump_file_path=self.temp_dir)
        _ = debugger_backend.initialize(net_name="Test", is_sync_mode=False)
        param = d.Parameter(name="param", disabled=False, value=0.0)
        _ = debugger_backend.add_watchpoint(watchpoint_id=1, watch_condition=6,
                                            check_node_list={"Default/network-WithLossCell/_backbone-AlexNet"
                                                             "/conv1-Conv2d/Conv2D-op369":
                                                             {"rank_id": [0], "root_graph_id": [0], "is_output": False
                                                              }}, parameter_list=[param])
        watchpoint_hits_test = debugger_backend.check_watchpoints(iteration=2)
        assert len(watchpoint_hits_test) == 1
        if not self.GENERATE_GOLDEN:
            # Async hit should match the same golden entry as the sync case.
            self.compare_expect_actual_result(watchpoint_hits_test, 0)
        _ = debugger_backend.remove_watchpoint(watchpoint_id=1)
        watchpoint_hits_test_1 = debugger_backend.check_watchpoints(iteration=2)
        assert not watchpoint_hits_test_1

    @security_off_wrap
    def test_async_add_remove_watchpoints_not_hit(self):
        """Async mode: a watchpoint whose threshold cannot trigger stays silent."""
        # watchpoint set and not hit(watch_condition=6) in async mode, then remove
        debugger_backend = d.DbgServices(dump_file_path=self.temp_dir)
        _ = debugger_backend.initialize(net_name="Test", is_sync_mode=False)
        param = d.Parameter(name="param", disabled=False, value=-1000.0)
        _ = debugger_backend.add_watchpoint(watchpoint_id=2, watch_condition=6,
                                            check_node_list={"Default/network-WithLossCell/_backbone-AlexNet"
                                                             "/conv1-Conv2d/Conv2D-op369":
                                                             {"rank_id": [0], "root_graph_id": [0], "is_output": False
                                                              }}, parameter_list=[param])
        watchpoint_hits_test = debugger_backend.check_watchpoints(iteration=2)
        assert not watchpoint_hits_test
        _ = debugger_backend.remove_watchpoint(watchpoint_id=2)

    def compare_expect_actual_result(self, watchpoint_hits_list, test_index):
        """Compare actual result with golden file."""
        # Golden path is relative to the test's working directory —
        # NOTE(review): assumes the runner executes from the test directory.
        golden_file = os.path.realpath(os.path.join("../data/dump/gpu_dumps/golden/",
                                                    self.test_name + "_expected.json"))
        with open(golden_file) as f:
            expected_list = json.load(f)
        for x, watchpoint_hits in enumerate(watchpoint_hits_list):
            # Entries are 1-based and offset by the hits recorded by earlier tests.
            test_id = "watchpoint_hit" + str(test_index+x+1)
            info = expected_list[x+test_index][test_id]
            assert watchpoint_hits.name == info['name']
            assert watchpoint_hits.slot == info['slot']
            assert watchpoint_hits.condition == info['condition']
            assert watchpoint_hits.watchpoint_id == info['watchpoint_id']
            assert watchpoint_hits.error_code == info['error_code']
            assert watchpoint_hits.rank_id == info['rank_id']
            assert watchpoint_hits.root_graph_id == info['root_graph_id']
            for p, _ in enumerate(watchpoint_hits.parameters):
                parameter = "parameter" + str(p)
                # NOTE: 'paremeter' is misspelled, but it deliberately matches the
                # key written by print_watchpoint_hits into the golden file —
                # keep the two spellings in sync.
                assert watchpoint_hits.parameters[p].name == info['paremeter'][p][parameter]['name']
                assert watchpoint_hits.parameters[p].disabled == info['paremeter'][p][parameter]['disabled']
                assert watchpoint_hits.parameters[p].value == info['paremeter'][p][parameter]['value']
                assert watchpoint_hits.parameters[p].hit == info['paremeter'][p][parameter]['hit']
                assert watchpoint_hits.parameters[p].actual_value == info['paremeter'][p][parameter]['actual_value']

    def print_watchpoint_hits(self, watchpoint_hits_list, test_index, is_print):
        """Print watchpoint hits."""
        # Convert each hit into the JSON structure stored in the golden file.
        for x, watchpoint_hits in enumerate(watchpoint_hits_list):
            parameter_json = []
            for p, _ in enumerate(watchpoint_hits.parameters):
                parameter = "parameter" + str(p)
                parameter_json.append({
                    parameter: {
                        'name': watchpoint_hits.parameters[p].name,
                        'disabled': watchpoint_hits.parameters[p].disabled,
                        'value': watchpoint_hits.parameters[p].value,
                        'hit': watchpoint_hits.parameters[p].hit,
                        'actual_value': watchpoint_hits.parameters[p].actual_value
                    }
                })
            watchpoint_hit = "watchpoint_hit" + str(test_index+x+1)
            self.watchpoint_hits_json.append({
                watchpoint_hit: {
                    'name': watchpoint_hits.name,
                    'slot': watchpoint_hits.slot,
                    'condition': watchpoint_hits.condition,
                    'watchpoint_id': watchpoint_hits.watchpoint_id,
                    # 'paremeter' spelling is part of the golden-file schema;
                    # compare_expect_actual_result reads the same key.
                    'paremeter': parameter_json,
                    'error_code': watchpoint_hits.error_code,
                    'rank_id': watchpoint_hits.rank_id,
                    'root_graph_id': watchpoint_hits.root_graph_id
                }
            })
        # Only the final caller passes is_print=True; earlier calls just accumulate.
        if is_print:
            with open(self.test_name + "_expected.json", "w") as dump_f:
                json.dump(self.watchpoint_hits_json, dump_f, indent=4, separators=(',', ': '))