Browse Source

!27544 fixed st and landscape bugs

Merge pull request !27544 from Songyuanwei/test
tags/v1.6.0
i-robot Gitee 4 years ago
parent
commit
9b27295d1e
4 changed files with 203 additions and 58 deletions
  1. +177
    -29
      mindspore/python/mindspore/train/callback/_landscape.py
  2. +6
    -5
      mindspore/python/mindspore/train/callback/_summary_collector.py
  3. +18
    -20
      tests/st/summary/test_summary_collector.py
  4. +2
    -4
      tests/ut/python/train/test_landscape.py

+ 177
- 29
mindspore/python/mindspore/train/callback/_landscape.py View File

@@ -17,12 +17,13 @@ import os
import time
import json
import shutil
import numbers

from collections import defaultdict, namedtuple
from concurrent.futures import wait, ALL_COMPLETED, ProcessPoolExecutor

import numpy as np
from scipy import linalg as LA
from scipy import linalg, sparse

from mindspore import log as logger
from mindspore.common.tensor import Tensor
@@ -194,7 +195,7 @@ class SummaryLandscape:
... # Simple usage for collect landscape information:
... interval_1 = [1, 2, 3, 4, 5]
... summary_collector = SummaryCollector(summary_dir='./summary/lenet_interval_1',
... collect_specified_data={'collect_landscape':{"landscape_size": 30,
... collect_specified_data={'collect_landscape':{"landscape_size": 4,
... "unit": "step",
... "create_landscape":{"train":True,
... "result":False
@@ -218,15 +219,14 @@ class SummaryLandscape:
... summary_landscape = SummaryLandscape('./summary/lenet_interval_1')
... # parameters of collect_landscape can be modified or unchanged
... summary_landscape.gen_landscapes_with_multi_process(callback_fn,
... collect_landscape={"landscape_size": 40,
... collect_landscape={"landscape_size": 4,
... "create_landscape":{"train":True,
... "result":True
... },
... "num_samples": 2048,
... "intervals": [interval_1
... ]},
... device_ids=[0, 1],
... device_target="GPU")
... device_ids=[1])
"""
def __init__(self, summary_dir):
self._summary_dir = os.path.realpath(summary_dir)
@@ -250,7 +250,7 @@ class SummaryLandscape:
shutil.rmtree(self._ckpt_dir, ignore_errors=True)

def gen_landscapes_with_multi_process(self, callback_fn, collect_landscape=None,
device_ids=None, device_target='Ascend', output=None):
device_ids=None, output=None):
"""
Use the multi process to generate landscape.

@@ -286,18 +286,13 @@ class SummaryLandscape:
device_ids (List(int)): Specifies which devices are used to create loss landscape.
For example: [0, 1] refers to creating loss landscape with device 0 and device 1.
Default: None.
device_target (str): Specifies the type of computing device.
Default: Ascend. Optional: Ascend/GPU/CPU.
output (str): Specifies the path to save the loss landscape.
Default: None. The default save path is the same as the summary file.
"""

output_path = os.path.realpath(output) if output is not None else self._summary_dir
summary_record = SummaryRecord(output_path)
check_value_type('device_target', device_target, str)
self._check_device_ids(device_ids)
if device_target not in ["Ascend", "GPU", "CPU"]:
raise ValueError(f'Landscape device_target should be Ascend, GPU or CPU, but got {device_target}.')
if collect_landscape is not None:
self._check_collect_landscape_data(collect_landscape)
json_path = os.path.join(self._ckpt_dir, 'train_metadata.json')
@@ -319,15 +314,13 @@ class SummaryLandscape:
with open(json_path, 'w') as file:
json.dump(data, file)

for interval, landscape in self._list_landscapes(callback_fn=callback_fn,
device_ids=device_ids,
device_target=device_target):
for interval, landscape in self._list_landscapes(callback_fn=callback_fn, device_ids=device_ids):
summary_record.add_value(PluginEnum.LANDSCAPE.value, f'landscape_{str(interval)}', landscape)
summary_record.record(0)
summary_record.flush()
summary_record.close()

def _list_landscapes(self, callback_fn, device_ids=None, device_target='Ascend'):
def _list_landscapes(self, callback_fn, device_ids=None):
"""Create landscape with single device and list all landscape."""

json_path = os.path.join(self._ckpt_dir, 'train_metadata.json')
@@ -350,7 +343,7 @@ class SummaryLandscape:
if count > 1:
futures = []
for device_id in device_ids:
future = executor.submit(self._set_context, device_id, device_target)
future = executor.submit(self._set_context, device_id)
futures.append(future)
wait(futures, return_when=ALL_COMPLETED)

@@ -402,9 +395,9 @@ class SummaryLandscape:
logger.info("Total use time: %s s." % (round(time.time() - start, 6)))

@staticmethod
def _set_context(device_id, device_target):
def _set_context(device_id):
"""Set context."""
context.set_context(device_id=device_id, device_target=device_target)
context.set_context(device_id=device_id)
context.set_context(mode=context.GRAPH_MODE)

def _create_landscape_by_pca(self, epochs, proz, landscape_size, device_ids=None, callback_fn=None, executor=None):
@@ -423,8 +416,9 @@ class SummaryLandscape:
param_matrixs = np.vstack(param_matrixs)
param_matrixs = param_matrixs[:-1] - param_matrixs[-1]
# Only 2 are needed, as we have to reduce high dimensions into 2D. And we reserve one for the loss value.
principal_components = self._compute_pca(param_matrixs.T)
v_ori, w_ori = -np.array(principal_components[:, 0]), -np.array(principal_components[:, -1])
pca = _PCA(n_comps=2)
principal_components = pca.compute(param_matrixs.T)
v_ori, w_ori = np.array(principal_components[:, 0]), np.array(principal_components[:, -1])
final_params = list(multi_parameters[-1])

# Reshape PCA directions(include dimensions of all parameters) into original shape of Model parameters
@@ -822,7 +816,7 @@ class SummaryLandscape:
f'but got the: {type(i)}.')
#device_id should be between 0 and 7.
if i < 0 or i > 7:
raise ValueError(f'Landscape device_ids value should be between 0 and 7,bu got {i}.')
raise ValueError(f'Landscape device_ids value should be between 0 and 7,but got {i}.')


def _check_collect_landscape_data(self, collect_landscape):
@@ -877,11 +871,165 @@ class SummaryLandscape:
check_value_type("num_samples", num_samples, int)
self._check_create_landscape(create_landscape)

def _compute_pca(self, x):
x -= np.mean(x, axis=0)
cov = np.cov(x, rowvar=False)
evals, evecs = LA.eigh(cov)
idx = np.argsort(evals)[::-1]
evecs = evecs[:, idx]
result = np.dot(x, evecs[:, :2])
return result

class _PCA:
r"""
The internal class for computing PCA vectors.

.. math::

u, s, vt = svd(x - mean(x)),
u_i = u_i * s_i,

where :math:`mean` is the mean operator, :math:`svd` is the singular value decomposition operator.
:math:`u_i` is line :math:`i` of the :math:`u`, :math:`s_i` is column :math:`i` of the :math:`s`,
:math:`i` ranges from :math:`0` to :math:`n\_comps`.

Args:
n_comps (int): Number of principal components needed.
"""
def __init__(self, n_comps):
self._n_comps = n_comps
self._random_status = None
self._iterated_power = "auto"
self._n_oversamples = 10

def compute(self, x):
"""Main method for computing principal components."""
n_components = self._n_comps
# small dimension (the shape is less than 500), and the full amount is calculated.
if max(x.shape) <= 500:
u, s, _ = self._fit_few(x)
# When dimension of x is much, truncated SVD is used for calculation.
elif 1 <= n_components < 0.8 * min(x.shape):
u, s, _ = self._fit_much(x, n_components)
# A case of n_components in (0, 1)
else:
u, s, _ = self._fit_few(x)

u = u[:, :self._n_comps]
u *= s[:self._n_comps]

return u

def _fit_few(self, x):
"""Compute principal components with full SVD on x, when dimension of x is few."""
mean_ = np.mean(x, axis=0)
x -= mean_
u, s, vt = linalg.svd(x, full_matrices=False)
u, vt = self._svd_turn(u, vt)

return u, s, vt

def _fit_much(self, x, n_components):
"""Compute principal components with truncated SVD on x, when dimension of x is much."""
random_state = self._check_random_status(self._random_status)
mean_ = np.mean(x, axis=0)
x -= mean_
u, s, vt = self._random_svd(x, n_components, n_oversamples=self._n_oversamples, random_state=random_state)
return u, s, vt

def _random_svd(self, m, n_components, n_oversamples=10, random_state="warn"):
"""Compute a truncated randomized SVD."""
n_random = n_components + n_oversamples
n_samples, n_features = m.shape
# Adjust 7 or 4 was found a good compromise for randomized SVD.
n_iter = 7 if n_components < 0.1 * min(m.shape) else 4
transpose = n_samples < n_features
if transpose:
m = m.T

q = self._random_range_finder(m, size=n_random, n_iter=n_iter, random_state=random_state)
# Project m to the low dimensional space using the basis vectors (q vector).
b = self._safe_dot(q.T, m)
# Compute the svd on this matrix (b matrix)
uhat, s, vt = linalg.svd(b, full_matrices=False)

del b
u = np.dot(q, uhat)

if not transpose:
u, vt = self._svd_turn(u, vt)
else:
u, vt = self._svd_turn(u, vt, u_decision=False)

if transpose:
return vt[:n_components, :].T, s[:n_components], u[:, :n_components].T

return u[:, :n_components], s[:n_components], vt[:n_components, :]

def _random_range_finder(self, a, size, n_iter, random_state=None):
"""Computes an orthonormal matrix whose range approximates the range of A."""
random_state = self._check_random_status(random_state)
# Generate normal random vectors.
q = random_state.normal(size=(a.shape[1], size))
if a.dtype.kind == "f":
# Ensure f32 is retained as f32
q = q.astype(a.dtype, copy=False)
if n_iter <= 2:
power_iteration_normalizer = "none"
else:
power_iteration_normalizer = "LU"
# use power iterations with q to further compute the top singular vectors of a in q
for _ in range(n_iter):
if power_iteration_normalizer == "none":
q = self._safe_dot(a, q)
q = self._safe_dot(a.T, q)
elif power_iteration_normalizer == "LU":
q, _ = linalg.lu(self._safe_dot(a, q), permute_l=True)
q, _ = linalg.lu(self._safe_dot(a.T, q), permute_l=True)
# The orthogonal basis is extracted by the linear projection of Q, and the range of a is sampled.
q, _ = linalg.qr(self._safe_dot(a, q), mode="economic")
return q

def _safe_dot(self, a, b):
"""Dot product that handle the matrix case correctly."""
if a.ndim > 2 or b.ndim > 2:
if sparse.issparse(b):
# Sparse is always 2 dimensional. Implies a is above 3 dimensional.
# [n, ..., o, p] @ [l, m] -> [n, ..., o, m]
a_2d = a.reshape(-1, a.shape[-1])
ret = a_2d @ b
ret = ret.reshape(*a.shape[:-1], b.shape[1])
elif sparse.issparse(a):
# Sparse is always 2 dimensional. Implies b is above 3 dimensional.
# [l, m] @ [n, ..., o, p, q] -> [l, n, ..., o, q]
b_ = np.rollaxis(b, -2)
b_2d = b_.reshape((b.shape[-2], -1))
ret = a @ b_2d
ret = ret.reshape(a.shape[0], *b_.shape[1:])
else:
ret = np.dot(a, b)

else:
ret = a @ b

return ret

def _svd_turn(self, u, v, u_decision=True):
"""Confirm correction to ensure deterministic output from SVD."""
if u_decision:
# rows of v, columns of u
max_cols = np.argmax(np.abs(u), axis=0)
signs = np.sign(u[max_cols, range(u.shape[1])])
v *= signs[:, np.newaxis]
u *= signs
else:
# rows of u, columns of v
max_rows = np.argmax(np.abs(v), axis=1)
signs = np.sign(v[range(v.shape[0]), max_rows])
v *= signs[:, np.newaxis]
u *= signs
return u, v

def _check_random_status(self, seed):
"""Transform seed into a np.random.RandomState instance."""
if isinstance(seed, np.random.RandomState):
return seed
if seed is None or seed is np.random:
return np.random.RandomState()
if isinstance(seed, numbers.Integral):
return np.random.RandomState(seed)
raise ValueError(
"%r cannot be used to seed a numpy.random.RandomState instance" % seed
)

+ 6
- 5
mindspore/python/mindspore/train/callback/_summary_collector.py View File

@@ -120,7 +120,8 @@ class SummaryCollector(Callback):
For example, if it is set to 128, the resolution of the landscape is 128 * 128.
The calculation time increases with the increase of resolution.
Default: 40. Optional values: between 3 and 256.
- unit (str): Specify the interval strength of the training process. Optional: epoch/step.
- unit (str): Specify the interval strength of the training process. Default: "step".
Optional: epoch/step.
- create_landscape (dict): Select how to create loss landscape.
Training process loss landscape(train) and training result loss landscape(result).
Default: {"train": True, "result": True}. Optional: True/False.
@@ -372,11 +373,11 @@ class SummaryCollector(Callback):
f'but got the: {landscape_size}')

@staticmethod
def _check_unit(unit):
def _check_unit(unit):
"""Check unit type and value."""
check_value_type('unit', unit, str)
if "step" not in unit and "epoch" not in unit:
raise ValueError(f'Unit should be step or epoch, but got the: {unit}')
if unit not in ["step", "epoch"]:
raise ValueError(f'Unit should be "step" or "epoch", but got the: {unit}')

@staticmethod
def _check_create_landscape(create_landscape):
@@ -595,7 +596,7 @@ class SummaryCollector(Callback):
json.dump(data, file)
os.chmod(meta_path, stat.S_IRUSR)
except OSError as e:
logger.error(str(e))
logger.error("Write meta data %s failed, detail: %s" % (meta_path, str(e)))

def _save_model_params(self, cur_num, unit, backbone):
"""Save model params."""


+ 18
- 20
tests/st/summary/test_summary_collector.py View File

@@ -34,7 +34,6 @@ from tests.st.summary.dataset import create_mnist_dataset
from tests.summary_utils import SummaryReader
from tests.security_utils import security_off_wrap
set_seed(1)
def callback_fn():
"""A python function job"""
@@ -42,7 +41,7 @@ def callback_fn():
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
metrics = {"Loss": Loss()}
model = Model(network, loss, metrics=metrics)
ds_train = create_mnist_dataset("train")
ds_train = create_mnist_dataset("train", num_samples=6)
return model, network, ds_train, metrics
@@ -242,8 +241,8 @@ class TestSummary:
"""run network."""
lenet = LeNet5()
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
optim = Momentum(lenet.trainable_params(), learning_rate=0.1, momentum=0.9)
model = Model(lenet, loss_fn=loss, optimizer=optim, metrics={'loss': Loss()})
optim = Momentum(lenet.trainable_params(), learning_rate=0.01, momentum=0.9)
model = Model(lenet, loss_fn=loss, optimizer=optim)
summary_dir = tempfile.mkdtemp(dir=self.base_summary_dir)
summary_collector = SummaryCollector(summary_dir=summary_dir, collect_freq=2, **kwargs)
@@ -286,6 +285,8 @@ class TestSummary:
if re.search("_MS", file):
summary_file_path = os.path.join(summary_dir, file)
summary_list = summary_list + [summary_file_path]
else:
continue
assert summary_list
@@ -303,16 +304,15 @@ class TestSummary:
break
return tags
@pytest.mark.level1
@pytest.mark.platform_x86_ascend_training
@pytest.mark.platform_arm_ascend_training
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
@security_off_wrap
def test_summary_collector_landscape(self):
"""Test summary collector with landscape."""
set_seed(1)
interval_1 = [1, 2, 3]
num_samples = 2
num_samples = 6
summary_dir = self._train_network(epoch=3, num_samples=num_samples,
collect_specified_data={'collect_landscape':
{'landscape_size': 4,
@@ -324,19 +324,17 @@ class TestSummary:
tag_list = self._list_summary_collect_landscape_tags(summary_dir)
expected_tags = {'epoch_group', 'model_params_file_map', 'step_per_epoch', 'unit', 'num_samples',
'landscape_size', 'create_landscape', 'loss_map'}
'landscape_size', 'create_landscape'}
assert set(expected_tags) == set(tag_list)
device_target = context.get_context("device_target")
device_id = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0
summary_landscape = SummaryLandscape(summary_dir)
summary_landscape.gen_landscapes_with_multi_process(callback_fn, device_ids=[device_id],
device_target=device_target)
expected_pca_value = np.array([2.0876417, 2.0871262, 2.0866107, 2.0860953, 2.0871796, 2.0866641, 2.0861477,
2.0856318, 2.0867180, 2.0862016, 2.0856854, 2.0851683, 2.0862572, 2.0857398,
2.0852231, 2.0847058])
expected_random_value = np.array([2.0066809, 1.9905004, 1.9798302, 1.9742643, 2.0754160, 2.0571522, 2.0442397,
2.0365926, 2.1506545, 2.1299571, 2.1143755, 2.1042551, 2.2315959, 2.2083559,
2.1895625, 2.1762595])
summary_landscape.gen_landscapes_with_multi_process(callback_fn, device_ids=[device_id])
expected_pca_value = np.array([2.2795506, 2.2795567, 2.2795629, 2.2795689, 2.2795507, 2.2795567, 2.2795629,
2.2795688, 2.2795505, 2.2795566, 2.2795628, 2.2795689, 2.2795505, 2.2795566,
2.2795627, 2.2795687])
expected_random_value = np.array([2.2732414, 2.2778292, 2.2829423, 2.2885174, 2.2725525, 2.2772029, 2.2822288,
2.2875323, 2.2726187, 2.2771581, 2.2819989, 2.2875887, 2.2732263, 2.2774866,
2.2823269, 2.2883627])
tag_list_landscape = self._list_landscape_tags(summary_dir)
assert np.all(expected_pca_value - tag_list_landscape[0] < 1.e-3)
assert np.all(expected_random_value - tag_list_landscape[1] < 1.e-3)
assert np.all(abs(expected_pca_value - tag_list_landscape[0]) < 1.e-6)
assert np.all(abs(expected_random_value - tag_list_landscape[1]) < 1.e-6)

+ 2
- 4
tests/ut/python/train/test_landscape.py View File

@@ -19,7 +19,7 @@ import tempfile
import pytest

from mindspore.common import set_seed
from mindspore import nn, context
from mindspore import nn
from mindspore.nn.metrics import Loss
from mindspore.train import Model
from mindspore.train.callback import SummaryLandscape
@@ -83,7 +83,6 @@ class TestLandscape:
])
def test_params_gen_landscape_with_multi_process_value_type_error(self, collect_landscape):
"""Test the value of gen_landscape_with_multi_process param."""
device_target = context.get_context("device_target")
device_id = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0
summary_dir = tempfile.mkdtemp(dir=self.base_summary_dir)
summary_landscape = SummaryLandscape(summary_dir)
@@ -91,8 +90,7 @@ class TestLandscape:
summary_landscape.gen_landscapes_with_multi_process(
callback_fn,
collect_landscape=collect_landscape,
device_ids=[device_id],
device_target=device_target
device_ids=[device_id]
)
param_name = list(collect_landscape)[0]
param_value = collect_landscape[param_name]


Loading…
Cancel
Save