Browse Source

Add GIL handling for offline debugger

tags/v1.6.0
maning202007 4 years ago
parent
commit
a88926b6d9
5 changed files with 24 additions and 22 deletions
  1. +10
    -7
      mindspore/ccsrc/debug/debug_services.cc
  2. +5
    -5
      mindspore/ccsrc/debug/debugger/debugger.cc
  3. +3
    -3
      mindspore/ccsrc/debug/debugger/debugger.h
  4. +3
    -4
      mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc
  5. +3
    -3
      mindspore/python/mindspore/train/callback/_summary_collector.py

+ 10
- 7
mindspore/ccsrc/debug/debug_services.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2021 Huawei Technologies Co., Ltd
* Copyright 2019-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -742,12 +742,15 @@ void DebugServices::ConvertToHostFormat(const std::map<std::string, std::vector<
if (!files_to_convert_in_dir.empty()) {
// Look for the installation path of the convert_async package. If it is not found, throw an exception and
// terminate the later task.
try {
auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
auto convert_obj = pkg.attr("AsyncDumpConverter")(pybind11::cast(files_to_convert_in_dir), dump_key);
(void)convert_obj.attr("convert_files")();
} catch (pybind11::error_already_set &e) {
MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
{
pybind11::gil_scoped_acquire acquire;
try {
auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
auto convert_obj = pkg.attr("AsyncDumpConverter")(pybind11::cast(files_to_convert_in_dir), dump_key);
(void)convert_obj.attr("convert_files")();
} catch (pybind11::error_already_set &e) {
MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
}
}
ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list, file_format);
}


+ 5
- 5
mindspore/ccsrc/debug/debugger/debugger.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -621,7 +621,7 @@ void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
}
if (debugger_enabled_) {
LoadParametersAndConst();
// get graph proto and send to Mindinsight
// get graph proto and send to MindInsight
auto graph_proto = graph_proto_list_.front();
SendGraphAndSuspend(graph_proto);
graph_proto_list_.clear();
@@ -700,7 +700,7 @@ void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
if (!CheckSendMetadata()) {
return;
}
// send graph to Mindinsight server
// send graph to MindInsight server
MS_EXCEPTION_IF_NULL(grpc_client_);
EventReply reply = grpc_client_->SendGraph(graph_proto);
if (reply.status() != reply.OK) {
@@ -874,11 +874,11 @@ void Debugger::CommandLoop() {
ProcessKViewCMD(reply);
break;
case DebuggerCommand::kVersionMatchedCMD:
MS_LOG(ERROR) << "Received unexpected Version Matched CMD from Mindinsight.";
MS_LOG(ERROR) << "Received unexpected Version Matched CMD from MindInsight.";
Exit();
break;
default:
MS_LOG(ERROR) << "Received unknown CMD from Mindinsight";
MS_LOG(ERROR) << "Received unknown CMD from MindInsight";
Exit();
break;
}


+ 3
- 3
mindspore/ccsrc/debug/debugger/debugger.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -134,8 +134,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {

void SetTrainingDone(bool training_done);

// returns true if reply received and mindspore version matched with mindinsight version
// version_check should be true if you want the function to do backend compatibility check with Mindinsight
// returns true if reply received and MindSpore version matched with MindInsight version
// version_check should be true if you want the function to do backend compatibility check with MindInsight
bool SendMetadata(bool version_check);

bool CheckSendMetadata();


+ 3
- 4
mindspore/ccsrc/debug/debugger/offline_debug/dbg_services.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -189,10 +189,9 @@ std::vector<watchpoint_hit_t> DbgServices::CheckWatchpoints(unsigned int iterati
std::vector<std::string> file_paths;

const bool init_dbg_suspend = (iteration == UINT_MAX);

tensor_list = debug_services_->ReadNeededDumpedTensors(iteration, &file_paths, error_on_no_value);
{
py::gil_scoped_release release;
pybind11::gil_scoped_release release;
tensor_list = debug_services_->ReadNeededDumpedTensors(iteration, &file_paths, error_on_no_value);
debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
file_paths, &tensor_list, init_dbg_suspend, true, true, &rank_id, &root_graph_id,
error_on_no_value);


+ 3
- 3
mindspore/python/mindspore/train/callback/_summary_collector.py View File

@@ -1,4 +1,4 @@
# Copyright 2020-2021 Huawei Technologies Co., Ltd
# Copyright 2020-2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -102,9 +102,9 @@ class SummaryCollector(Callback):
- collect_train_lineage (bool): Whether to collect lineage data for the training phase,
this field will be displayed on the `lineage page \
<https://www.mindspore.cn/mindinsight/docs/en/master/lineage_and_scalars_comparison.html>`_
of Mindinsight. Default: True.
of MindInsight. Default: True.
- collect_eval_lineage (bool): Whether to collect lineage data for the evaluation phase,
this field will be displayed on the lineage page of Mindinsight. Default: True.
this field will be displayed on the lineage page of MindInsight. Default: True.
- collect_input_data (bool): Whether to collect dataset for each training.
Currently only image data is supported.
If there are multiple columns of data in the dataset, the first column should be image data.


Loading…
Cancel
Save