offline debug api design

4 years ago · 992e805c56
--- a/mindspore/offline_debug/debugger_tensor.py
+++ b/mindspore/offline_debug/debugger_tensor.py
@@ -0,0 +1,92 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """DebuggerTensor."""
 from abc import ABC


 class DebuggerTensor(ABC):
    """
    The tensor with specific rank, iteration and debugging info.

    This class only declares the interface: it derives from ABC but defines
    no abstract members, every property returns a placeholder value and the
    methods have no implementation here.

    Note:
        - Users should not instantiate this class manually.
        - Instances of this class are immutable.
        - A DebuggerTensor is always the output tensor of a node.
    """
    @property
    def node(self):
        """
        Get the node that outputs this tensor.

        Returns:
            Node, the node that outputs this tensor. This base implementation
            returns the placeholder None.
        """
        return None

    @property
    def name(self):
        """
        Get the name of this tensor.

        The name is composed of the full name of a node and the slot number.

        Returns:
            str, the name of this tensor. This base implementation returns
            the placeholder empty string.
        """
        return ""

    @property
    def slot(self):
        """
        Get the slot of this tensor.

        Returns:
            int, the slot of the tensor on the node. This base implementation
            returns the placeholder -1.
        """
        return -1

    @property
    def iteration(self):
        """
        Get the iteration for this tensor.

        Returns:
            int, the iteration for this tensor. This base implementation
            returns the placeholder -1.
        """
        return -1

    @property
    def rank(self):
        """
        Get the rank for this tensor.

        Returns:
            int, the rank for this tensor. This base implementation returns
            the placeholder -1.
        """
        return -1

    def get_value(self):
        """
        Get the value of the tensor.

        Not implemented in this base class (the call returns None).

        Returns:
            numpy.ndarray, the value of the debugger tensor.
        """

    def get_affected_nodes(self):
        """
        Get the nodes that use the current tensor as input.

        Not implemented in this base class (the call returns None).

        Returns:
            Presumably Iterable[Node], the nodes consuming this tensor.
            TODO: confirm the return type once implemented.
        """
--- a/mindspore/offline_debug/dump_analyzer.py
+++ b/mindspore/offline_debug/dump_analyzer.py
@@ -0,0 +1,138 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """Debugger python API."""

 from typing import Iterable

 from mindspore.offline_debug.debugger_tensor import DebuggerTensor
 from mindspore.offline_debug.node import Node
 from mindspore.offline_debug.watchpoints import WatchpointBase, WatchpointHit


 class DumpAnalyzer:
    """
    Analyzer to inspect the dump data.

    The query methods of this class currently consist of their docstrings
    only; the constructor just stores its arguments.

    Args:
        summary_dir (str): The path of the summary directory which contains
            the dump folder.
        mem_limit (int, optional): The memory limit for this debugger session in
            MB. Default: None, which means no limit.
    """

    def __init__(self, summary_dir, mem_limit=None):
        # Only remember the settings; nothing is read from disk here.
        self._summary_dir = summary_dir
        self._mem_limit = mem_limit

    def export_graphs(self, output_dir=None):
        """
        Export the computational graph(s) in xlsx file(s) to the output_dir.

        The file(s) will contain the stack info of graph nodes.

        Args:
            output_dir (str, optional): Output directory to save the file.
                Default: None, which means to use the current working directory.

        Returns:
            str, the path of the generated file.
        """

    def select_nodes(
            self,
            query_string,
            use_regex=False,
            match_target="name",
            case_sensitive=True) -> Iterable[Node]:
        """
        Select nodes.

        Args:
            query_string (str): Query string. For a node to be selected, the
                match target field must contain or match the query string.
            use_regex (bool): Indicates whether query is a regex. Default: False.
            match_target (str, optional): The field to search when selecting
                nodes. Available values are "name", "stack".
                "name" means to search the name of the nodes in the
                graph. "stack" means the stack info of
                the node. Default: "name".
            case_sensitive (bool, optional): Whether the match is
                case-sensitive when selecting nodes. Default: True.

        Returns:
            Iterable[Node], the matched nodes.
        """

    def select_tensors(
            self,
            query_string,
            use_regex=False,
            match_target="name",
            iterations=None,
            ranks=None,
            slots=None,
            case_sensitive=True) -> Iterable[DebuggerTensor]:
        """
        Select tensors.

        Args:
            query_string (str): Query string. For a tensor to be selected, the
                match target field must contain or match the query string.
            use_regex (bool): Indicates whether query is a regex. Default: False.
            match_target (str, optional): The field to search when selecting
                tensors. Available values are "name", "stack".
                "name" means to search the name of the tensors in the
                graph. "name" is composed of graph node's full_name
                and the tensor's slot number. "stack" means the stack info of
                the node that outputs this tensor. Default: "name".
            iterations (list[int], optional): The iterations to select. Default:
                None, which means all iterations will be selected.
            ranks (list[int], optional): The ranks to select. Default: None,
                which means all ranks will be selected.
            slots (list[int], optional): The slot of the selected tensor.
                Default: None, which means all slots will be selected.
            case_sensitive (bool, optional): Whether the match is
                case-sensitive when selecting tensors. Default: True.

        Returns:
            Iterable[DebuggerTensor], the matched tensors.
        """

    def get_iterations(self) -> Iterable[int]:
        """Get the available iterations in this run."""

    def get_ranks(self) -> Iterable[int]:
        """Get the available ranks in this run."""

    def check_watchpoints(
            self,
            watchpoints: Iterable[WatchpointBase]) -> Iterable[WatchpointHit]:
        """
        Check the given watchpoints on the specified nodes (if available) on
        the given iterations (if available) in a batch.

        Note:
            For speed, all watchpoints for the iteration should be given at
            the same time to avoid reading tensors len(watchpoints) times.

        Args:
            watchpoints (Iterable[WatchpointBase]): The list of watchpoints.

        Returns:
            Iterable[WatchpointHit], the watchpoint hit list is carefully
                sorted so that the user can see the most important hit on
                the top of the list. When there are many watchpoint hits,
                the list will be displayed in a clearly organized way.
        """
--- a/mindspore/offline_debug/node.py
+++ b/mindspore/offline_debug/node.py
@@ -0,0 +1,75 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """Node in the computational graph."""
 from abc import ABC


 class Node(ABC):
    """
    Node in the computational graph.

    This class only declares the interface: it derives from ABC but defines
    no abstract members, the properties return placeholder values and the
    methods have no implementation here.
    """
    @property
    def name(self):
        """
        Get the full name of this node.

        Returns:
            str, the full name of the node. This base implementation returns
            the placeholder empty string.
        """
        return ""

    @property
    def stack(self):
        """
        Get stack info.

        Returns:
            The stack info of this node; this base implementation returns the
            placeholder None. TODO: document the concrete type once defined.
        """
        return None

    def get_input_tensors(
            self,
            iterations=None,
            ranks=None,
            slots=None):
        """
        Get the input tensors of the node.

        Not implemented in this base class (the call returns None).

        Args:
            iterations (optional): Presumably the iterations to restrict the
                result to. Default: None. TODO: confirm type and semantics.
            ranks (optional): Presumably the ranks to restrict the result to.
                Default: None. TODO: confirm type and semantics.
            slots (optional): Presumably the slots to restrict the result to.
                Default: None. TODO: confirm type and semantics.

        Returns:
            Iterable[DebuggerTensor], the input tensors of the node.
        """

    def get_output_tensors(
            self,
            iterations=None,
            ranks=None,
            slots=None):
        """
        Get the output tensors of this node.

        Not implemented in this base class (the call returns None).

        Args:
            iterations (optional): Presumably the iterations to restrict the
                result to. Default: None. TODO: confirm type and semantics.
            ranks (optional): Presumably the ranks to restrict the result to.
                Default: None. TODO: confirm type and semantics.
            slots (optional): Presumably the slots to restrict the result to.
                Default: None. TODO: confirm type and semantics.

        Returns:
            Iterable[DebuggerTensor], the output tensors of the node.
        """

    def get_input_nodes(self):
        """
        Get the input nodes of this node.

        Not implemented in this base class (the call returns None).

        Returns:
            Iterable[Node], the input nodes of this node.
        """

    def get_output_nodes(self):
        """
        Get the nodes that use the output tensors of this node.

        Not implemented in this base class (the call returns None).

        Returns:
            Iterable[Node], the output nodes of this node.
        """
--- a/mindspore/offline_debug/watchpoints.py
+++ b/mindspore/offline_debug/watchpoints.py
@@ -0,0 +1,128 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """Watchpoints."""
 from mindspore.offline_debug.debugger_tensor import DebuggerTensor


 class WatchpointBase:
    """
    Base class for watchpoints.

    Note:
        - The watchpoint is bound to tensor names.
        - If multiple checking items are specified for one watchpoint
          instance, a tensor needs to trigger all of them to trigger the
          watchpoint.
    """
    @property
    def name(self):
        """
        Get the name for the watchpoint.

        Raises:
            NotImplementedError: Always, in this base class; subclasses are
                expected to override the property.
        """
        raise NotImplementedError

    def check(self):
        """
        Check the watchpoint against the tensors.

        Not implemented in this base class (the call returns None).

        Returns:
            list[WatchpointHit], the hits of the watchpoint.
        """


 class WatchpointHit:
    """
    Watchpoint hit.

    Note:
        - This class is not meant to be instantiated by user.
        - Instances of this class are immutable.

    Args:
        tensor (DebuggerTensor): The tensor which hits the watchpoint.
        watchpoint (WatchpointBase): The WatchpointBase object initialized with
            the user's setting values.
        watchpoint_hit_detail (WatchpointBase): The WatchpointBase object
            initialized with the actual values of the tensor.
        error_code: The code describing the error. A falsy value (e.g. 0 or
            None) means the check itself succeeded.
    """

    def __init__(self,
                 tensor: DebuggerTensor,
                 watchpoint: WatchpointBase,
                 watchpoint_hit_detail: WatchpointBase,
                 error_code):
        self._tensor = tensor
        self._watchpoint = watchpoint
        self._error_code = error_code
        self._watchpoint_hit_detail = watchpoint_hit_detail

    @staticmethod
    def _describe_thresholds(watchpoint):
        """
        Render the numeric instance attributes of a watchpoint.

        Attributes are listed as "name=value" in definition order, with the
        leading underscores of private attribute names removed. Values that
        are not numbers (unset thresholds stored as None, the watched
        tensors, ...) are skipped.

        NOTE(review): this assumes thresholds are kept as numeric instance
        attributes, the way TensorTooLargeWatchpoint stores them.

        Args:
            watchpoint (WatchpointBase): The watchpoint to describe.

        Returns:
            str, the comma separated thresholds, or "not available" when the
            object carries no numeric attribute.
        """
        # getattr with a default keeps objects without __dict__ from raising.
        attributes = getattr(watchpoint, "__dict__", {})
        rendered = [
            f"{key.lstrip('_')}={value}"
            for key, value in attributes.items()
            # bool is a subclass of int but is a flag, not a threshold.
            if isinstance(value, (int, float)) and not isinstance(value, bool)
        ]
        return ", ".join(rendered) if rendered else "not available"

    def __str__(self):
        """
        Describe the hit, or the failed check when an error code is set.

        Returns:
            str, a human readable message built from the watchpoint name, the
            tensor name and either the error code or the configured and
            actual threshold values.
        """
        if self._error_code:
            return f"Watchpoint {self._watchpoint.name} check failed " \
                   f"on tensor {self._tensor.name}. " \
                   f"Error detail: {self._error_code}."

        setting = self._describe_thresholds(self._watchpoint)
        actual = self._describe_thresholds(self._watchpoint_hit_detail)
        return f"Watchpoint {self._watchpoint.name} triggered on " \
               f"tensor {self._tensor.name}. " \
               f"The setting for watchpoint is {setting}. " \
               f"The actual value of the tensor is {actual}."

    @property
    def tensor(self) -> DebuggerTensor:
        """Get the tensor for this watchpoint hit."""
        return self._tensor

    def get_watchpoint(self):
        """Get the original watchpoint."""
        return self._watchpoint

    def get_hit_detail(self):
        """Get the actual values for the thresholds in the watchpoint."""
        return self._watchpoint_hit_detail


 class TensorTooLargeWatchpoint(WatchpointBase):
    """
    Tensor too large watchpoint.

    When all specified checking conditions are satisfied, this watchpoint is
    hit after a check.

    Args:
        tensors (Iterable[DebuggerTensor]): The tensors to check.
        abs_mean_gt (float, optional): The threshold for mean of the absolute
            value of the tensor. When the actual value is greater than this
            threshold, this checking condition is satisfied. Default: None.
        max_gt (float, optional): The threshold for maximum of the tensor. When
            the actual value is greater than this threshold, this checking
            condition is satisfied. Default: None.
        min_gt (float, optional): The threshold for minimum of the tensor. When
            the actual value is greater than this threshold, this checking
            condition is satisfied. Default: None.
        mean_gt (float, optional): The threshold for mean of the tensor. When
            the actual value is greater than this threshold, this checking
            condition is satisfied. Default: None.
    """

    def __init__(self, tensors,
                 abs_mean_gt=None, max_gt=None, min_gt=None, mean_gt=None):
        # The arguments are stored as given, without validation or copying.
        # NOTE(review): a threshold left as None presumably means that
        # condition is not checked - confirm once check() is implemented.
        self._tensors = tensors
        self._abs_mean_gt = abs_mean_gt
        self._max_gt = max_gt
        self._min_gt = min_gt
        self._mean_gt = mean_gt

    @property
    def name(self):
        """Get the name of this watchpoint, the constant string TensorTooLarge."""
        return "TensorTooLarge"
--- a/tests/st/debugger/test_debug_api.py
+++ b/tests/st/debugger/test_debug_api.py
@@ -0,0 +1,80 @@
 # Copyright 2021 Huawei Technologies Co., Ltd
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """Test debug API."""
 import pytest

 from mindspore.offline_debug.dump_analyzer import DumpAnalyzer
 from mindspore.offline_debug.watchpoints import TensorTooLargeWatchpoint


@pytest.mark.skip(reason="Feature under development.")
def test_export_graphs():
    """Call export_graphs on a DumpAnalyzer with the default output directory."""
    # Exports the computational graph info; multiple graphs should be
    # supported by the export.
    DumpAnalyzer(summary_dir="/path/to/summary-dir1").export_graphs()


@pytest.mark.skip(reason="Feature under development.")
def test_select_tensors():
    """Select tensors by a regular expression on their names."""
    analyzer = DumpAnalyzer(summary_dir="/path/to/summary-dir2")

    # Look up the tensors of interest and compare the result directly.
    assert analyzer.select_tensors(".*conv1.*", use_regex=True) == []


@pytest.mark.skip(reason="Feature under development.")
def test_check_watchpoints_all_iterations():
    """Check a too-large watchpoint on weight and bias tensors, all iterations."""
    my_run = DumpAnalyzer(
        summary_dir="/path/to/summary-dir3"
    )

    # Checking all the iterations (no `iterations` filter is passed).
    # The query is used as a regex, so it has to be a valid one: the earlier
    # glob-like "(*.weight^)|(*.bias^)" cannot be compiled ("*" directly after
    # "(" has nothing to repeat). The trailing ".*" keeps the pattern usable
    # whether names are searched or fully matched, since tensor names also
    # carry the slot number.
    watchpoints = [
        TensorTooLargeWatchpoint(
            tensors=my_run.select_tensors(
                r"(.*\.weight.*)|(.*\.bias.*)", use_regex=True),
            abs_mean_gt=0.1)
    ]

    watch_point_hits = my_run.check_watchpoints(watchpoints=watchpoints)
    assert watch_point_hits == []


@pytest.mark.skip(reason="Feature under development.")
def test_check_watchpoints_one_iteration():
    """Check a too-large watchpoint on weight and bias tensors, iteration 1 only."""
    my_run = DumpAnalyzer(
        summary_dir="/path/to/summary-dir4"
    )

    # Checking a specific iteration.
    # The query is used as a regex, so it has to be a valid one: the earlier
    # glob-like "(*.weight^)|(*.bias^)" cannot be compiled ("*" directly after
    # "(" has nothing to repeat). The trailing ".*" keeps the pattern usable
    # whether names are searched or fully matched, since tensor names also
    # carry the slot number.
    watchpoints = [
        TensorTooLargeWatchpoint(
            tensors=my_run.select_tensors(
                r"(.*\.weight.*)|(.*\.bias.*)", use_regex=True,
                iterations=[1]),
            abs_mean_gt=0.1)
    ]

    watch_point_hits = my_run.check_watchpoints(watchpoints=watchpoints)
    assert watch_point_hits == []