Browse Source

!27544 fixed st and landscape bugs

Merge pull request !27544 from Songyuanwei/test
tags/v1.6.0
i-robot Gitee 4 years ago
parent
commit
9b27295d1e
4 changed files with 203 additions and 58 deletions
  1. +177
    -29
      mindspore/python/mindspore/train/callback/_landscape.py
  2. +6
    -5
      mindspore/python/mindspore/train/callback/_summary_collector.py
  3. +18
    -20
      tests/st/summary/test_summary_collector.py
  4. +2
    -4
      tests/ut/python/train/test_landscape.py

+ 177
- 29
mindspore/python/mindspore/train/callback/_landscape.py View File

@@ -17,12 +17,13 @@ import os
import time
import json
import shutil
import numbers

from collections import defaultdict, namedtuple
from concurrent.futures import wait, ALL_COMPLETED, ProcessPoolExecutor

import numpy as np
from scipy import linalg as LA
from scipy import linalg, sparse

from mindspore import log as logger
from mindspore.common.tensor import Tensor
@@ -194,7 +195,7 @@ class SummaryLandscape:
... # Simple usage for collect landscape information:
... interval_1 = [1, 2, 3, 4, 5]
... summary_collector = SummaryCollector(summary_dir='./summary/lenet_interval_1',
... collect_specified_data={'collect_landscape':{"landscape_size": 30,
... collect_specified_data={'collect_landscape':{"landscape_size": 4,
... "unit": "step",
... "create_landscape":{"train":True,
... "result":False
@@ -218,15 +219,14 @@ class SummaryLandscape:
... summary_landscape = SummaryLandscape('./summary/lenet_interval_1')
... # parameters of collect_landscape can be modified or unchanged
... summary_landscape.gen_landscapes_with_multi_process(callback_fn,
... collect_landscape={"landscape_size": 40,
... collect_landscape={"landscape_size": 4,
... "create_landscape":{"train":True,
... "result":True
... },
... "num_samples": 2048,
... "intervals": [interval_1
... ]},
... device_ids=[0, 1],
... device_target="GPU")
... device_ids=[1])
"""
def __init__(self, summary_dir):
self._summary_dir = os.path.realpath(summary_dir)
@@ -250,7 +250,7 @@ class SummaryLandscape:
shutil.rmtree(self._ckpt_dir, ignore_errors=True)

def gen_landscapes_with_multi_process(self, callback_fn, collect_landscape=None,
device_ids=None, device_target='Ascend', output=None):
device_ids=None, output=None):
"""
Use the multi process to generate landscape.

@@ -286,18 +286,13 @@ class SummaryLandscape:
device_ids (List(int)): Specifies which devices are used to create loss landscape.
For example: [0, 1] refers to creating loss landscape with device 0 and device 1.
Default: None.
device_target (str): Specifies the type of computing device.
Default: Ascend. Optional: Ascend/GPU/CPU.
output (str): Specifies the path to save the loss landscape.
Default: None. The default save path is the same as the summary file.
"""

output_path = os.path.realpath(output) if output is not None else self._summary_dir
summary_record = SummaryRecord(output_path)
check_value_type('device_target', device_target, str)
self._check_device_ids(device_ids)
if device_target not in ["Ascend", "GPU", "CPU"]:
raise ValueError(f'Landscape device_target should be Ascend, GPU or CPU, but got {device_target}.')
if collect_landscape is not None:
self._check_collect_landscape_data(collect_landscape)
json_path = os.path.join(self._ckpt_dir, 'train_metadata.json')
@@ -319,15 +314,13 @@ class SummaryLandscape:
with open(json_path, 'w') as file:
json.dump(data, file)

for interval, landscape in self._list_landscapes(callback_fn=callback_fn,
device_ids=device_ids,
device_target=device_target):
for interval, landscape in self._list_landscapes(callback_fn=callback_fn, device_ids=device_ids):
summary_record.add_value(PluginEnum.LANDSCAPE.value, f'landscape_{str(interval)}', landscape)
summary_record.record(0)
summary_record.flush()
summary_record.close()

def _list_landscapes(self, callback_fn, device_ids=None, device_target='Ascend'):
def _list_landscapes(self, callback_fn, device_ids=None):
"""Create landscape with single device and list all landscape."""

json_path = os.path.join(self._ckpt_dir, 'train_metadata.json')
@@ -350,7 +343,7 @@ class SummaryLandscape:
if count > 1:
futures = []
for device_id in device_ids:
future = executor.submit(self._set_context, device_id, device_target)
future = executor.submit(self._set_context, device_id)
futures.append(future)
wait(futures, return_when=ALL_COMPLETED)

@@ -402,9 +395,9 @@ class SummaryLandscape:
logger.info("Total use time: %s s." % (round(time.time() - start, 6)))

@staticmethod
def _set_context(device_id, device_target):
def _set_context(device_id):
"""Set context."""
context.set_context(device_id=device_id, device_target=device_target)
context.set_context(device_id=device_id)
context.set_context(mode=context.GRAPH_MODE)

def _create_landscape_by_pca(self, epochs, proz, landscape_size, device_ids=None, callback_fn=None, executor=None):
@@ -423,8 +416,9 @@ class SummaryLandscape:
param_matrixs = np.vstack(param_matrixs)
param_matrixs = param_matrixs[:-1] - param_matrixs[-1]
# Only 2 are needed, as we have to reduce high dimensions into 2D. And we reserve one for the loss value.
principal_components = self._compute_pca(param_matrixs.T)
v_ori, w_ori = -np.array(principal_components[:, 0]), -np.array(principal_components[:, -1])
pca = _PCA(n_comps=2)
principal_components = pca.compute(param_matrixs.T)
v_ori, w_ori = np.array(principal_components[:, 0]), np.array(principal_components[:, -1])
final_params = list(multi_parameters[-1])

# Reshape PCA directions(include dimensions of all parameters) into original shape of Model parameters
@@ -822,7 +816,7 @@ class SummaryLandscape:
f'but got the: {type(i)}.')
#device_id should be between 0 and 7.
if i < 0 or i > 7:
raise ValueError(f'Landscape device_ids value should be between 0 and 7,bu got {i}.')
raise ValueError(f'Landscape device_ids value should be between 0 and 7,but got {i}.')


def _check_collect_landscape_data(self, collect_landscape):
@@ -877,11 +871,165 @@ class SummaryLandscape:
check_value_type("num_samples", num_samples, int)
self._check_create_landscape(create_landscape)

def _compute_pca(self, x):
x -= np.mean(x, axis=0)
cov = np.cov(x, rowvar=False)
evals, evecs = LA.eigh(cov)
idx = np.argsort(evals)[::-1]
evecs = evecs[:, idx]
result = np.dot(x, evecs[:, :2])
return result

class _PCA:
r"""
The internal class for computing PCA vectors.

.. math::

u, s, vt = svd(x - mean(x)),
u_i = u_i * s_i,

where :math:`mean` is the mean operator, :math:`svd` is the singular value decomposition operator.
:math:`u_i` is line :math:`i` of the :math:`u`, :math:`s_i` is column :math:`i` of the :math:`s`,
:math:`i` ranges from :math:`0` to :math:`n\_comps`.

Args:
n_comps (int): Number of principal components needed.
"""
def __init__(self, n_comps):
self._n_comps = n_comps
self._random_status = None
self._iterated_power = "auto"
self._n_oversamples = 10

def compute(self, x):
"""Main method for computing principal components."""
n_components = self._n_comps
# small dimension (the shape is less than 500), and the full amount is calculated.
if max(x.shape) <= 500:
u, s, _ = self._fit_few(x)
# When dimension of x is much, truncated SVD is used for calculation.
elif 1 <= n_components < 0.8 * min(x.shape):
u, s, _ = self._fit_much(x, n_components)
# A case of n_components in (0, 1)
else:
u, s, _ = self._fit_few(x)

u = u[:, :self._n_comps]
u *= s[:self._n_comps]

return u

def _fit_few(self, x):
"""Compute principal components with full SVD on x, when dimension of x is few."""
mean_ = np.mean(x, axis=0)
x -= mean_
u, s, vt = linalg.svd(x, full_matrices=False)
u, vt = self._svd_turn(u, vt)

return u, s, vt

def _fit_much(self, x, n_components):
"""Compute principal components with truncated SVD on x, when dimension of x is much."""
random_state = self._check_random_status(self._random_status)
mean_ = np.mean(x, axis=0)
x -= mean_
u, s, vt = self._random_svd(x, n_components, n_oversamples=self._n_oversamples, random_state=random_state)
return u, s, vt

def _random_svd(self, m, n_components, n_oversamples=10, random_state="warn"):
"""Compute a truncated randomized SVD."""
n_random = n_components + n_oversamples
n_samples, n_features = m.shape
# Adjust 7 or 4 was found a good compromise for randomized SVD.
n_iter = 7 if n_components < 0.1 * min(m.shape) else 4
transpose = n_samples < n_features
if transpose:
m = m.T

q = self._random_range_finder(m, size=n_random, n_iter=n_iter, random_state=random_state)
# Project m to the low dimensional space using the basis vectors (q vector).
b = self._safe_dot(q.T, m)
# Compute the svd on this matrix (b matrix)
uhat, s, vt = linalg.svd(b, full_matrices=False)

del b
u = np.dot(q, uhat)

if not transpose:
u, vt = self._svd_turn(u, vt)
else:
u, vt = self._svd_turn(u, vt, u_decision=False)

if transpose:
return vt[:n_components, :].T, s[:n_components], u[:, :n_components].T

return u[:, :n_components], s[:n_components], vt[:n_components, :]

def _random_range_finder(self, a, size, n_iter, random_state=None):
"""Computes an orthonormal matrix whose range approximates the range of A."""
random_state = self._check_random_status(random_state)
# Generate normal random vectors.
q = random_state.normal(size=(a.shape[1], size))
if a.dtype.kind == "f":
# Ensure f32 is retained as f32
q = q.astype(a.dtype, copy=False)
if n_iter <= 2:
power_iteration_normalizer = "none"
else:
power_iteration_normalizer = "LU"
# use power iterations with q to further compute the top singular vectors of a in q
for _ in range(n_iter):
if power_iteration_normalizer == "none":
q = self._safe_dot(a, q)
q = self._safe_dot(a.T, q)
elif power_iteration_normalizer == "LU":
q, _ = linalg.lu(self._safe_dot(a, q), permute_l=True)
q, _ = linalg.lu(self._safe_dot(a.T, q), permute_l=True)
# The orthogonal basis is extracted by the linear projection of Q, and the range of a is sampled.
q, _ = linalg.qr(self._safe_dot(a, q), mode="economic")
return q

def _safe_dot(self, a, b):
"""Dot product that handle the matrix case correctly."""
if a.ndim > 2 or b.ndim > 2:
if sparse.issparse(b):
# Sparse is always 2 dimensional. Implies a is above 3 dimensional.
# [n, ..., o, p] @ [l, m] -> [n, ..., o, m]
a_2d = a.reshape(-1, a.shape[-1])
ret = a_2d @ b
ret = ret.reshape(*a.shape[:-1], b.shape[1])
elif sparse.issparse(a):
# Sparse is always 2 dimensional. Implies b is above 3 dimensional.
# [l, m] @ [n, ..., o, p, q] -> [l, n, ..., o, q]
b_ = np.rollaxis(b, -2)
b_2d = b_.reshape((b.shape[-2], -1))
ret = a @ b_2d
ret = ret.reshape(a.shape[0], *b_.shape[1:])
else:
ret = np.dot(a, b)

else:
ret = a @ b

return ret

def _svd_turn(self, u, v, u_decision=True):
"""Confirm correction to ensure deterministic output from SVD."""
if u_decision:
# rows of v, columns of u
max_cols = np.argmax(np.abs(u), axis=0)
signs = np.sign(u[max_cols, range(u.shape[1])])
v *= signs[:, np.newaxis]
u *= signs
else:
# rows of u, columns of v
max_rows = np.argmax(np.abs(v), axis=1)
signs = np.sign(v[range(v.shape[0]), max_rows])
v *= signs[:, np.newaxis]
u *= signs
return u, v

def _check_random_status(self, seed):
"""Transform seed into a np.random.RandomState instance."""
if isinstance(seed, np.random.RandomState):
return seed
if seed is None or seed is np.random:
return np.random.RandomState()
if isinstance(seed, numbers.Integral):
return np.random.RandomState(seed)
raise ValueError(
"%r cannot be used to seed a numpy.random.RandomState instance" % seed
)

+ 6
- 5
mindspore/python/mindspore/train/callback/_summary_collector.py View File

@@ -120,7 +120,8 @@ class SummaryCollector(Callback):
For example, if it is set to 128, the resolution of the landscape is 128 * 128.
The calculation time increases with the increase of resolution.
Default: 40. Optional values: between 3 and 256.
- unit (str): Specify the interval strength of the training process. Optional: epoch/step.
- unit (str): Specify the interval strength of the training process. Default: "step".
Optional: epoch/step.
- create_landscape (dict): Select how to create loss landscape.
Training process loss landscape(train) and training result loss landscape(result).
Default: {"train": True, "result": True}. Optional: True/False.
@@ -372,11 +373,11 @@ class SummaryCollector(Callback):
f'but got the: {landscape_size}')

@staticmethod
def _check_unit(unit):
def _check_unit(unit):
"""Check unit type and value."""
check_value_type('unit', unit, str)
if "step" not in unit and "epoch" not in unit:
raise ValueError(f'Unit should be step or epoch, but got the: {unit}')
if unit not in ["step", "epoch"]:
raise ValueError(f'Unit should be "step" or "epoch", but got the: {unit}')

@staticmethod
def _check_create_landscape(create_landscape):
@@ -595,7 +596,7 @@ class SummaryCollector(Callback):
json.dump(data, file)
os.chmod(meta_path, stat.S_IRUSR)
except OSError as e:
logger.error(str(e))
logger.error("Write meta data %s failed, detail: %s" % (meta_path, str(e)))

def _save_model_params(self, cur_num, unit, backbone):
"""Save model params."""


+ 18
- 20
tests/st/summary/test_summary_collector.py View File

@@ -34,7 +34,6 @@ from tests.st.summary.dataset import create_mnist_dataset
from tests.summary_utils import SummaryReader
from tests.security_utils import security_off_wrap
set_seed(1)
def callback_fn():
"""A python function job"""
@@ -42,7 +41,7 @@ def callback_fn():
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
metrics = {"Loss": Loss()}
model = Model(network, loss, metrics=metrics)
ds_train = create_mnist_dataset("train")
ds_train = create_mnist_dataset("train", num_samples=6)
return model, network, ds_train, metrics
@@ -242,8 +241,8 @@ class TestSummary:
"""run network."""
lenet = LeNet5()
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
optim = Momentum(lenet.trainable_params(), learning_rate=0.1, momentum=0.9)
model = Model(lenet, loss_fn=loss, optimizer=optim, metrics={'loss': Loss()})
optim = Momentum(lenet.trainable_params(), learning_rate=0.01, momentum=0.9)
model = Model(lenet, loss_fn=loss, optimizer=optim)
summary_dir = tempfile.mkdtemp(dir=self.base_summary_dir)
summary_collector = SummaryCollector(summary_dir=summary_dir, collect_freq=2, **kwargs)
@@ -286,6 +285,8 @@ class TestSummary:
if re.search("_MS", file):
summary_file_path = os.path.join(summary_dir, file)
summary_list = summary_list + [summary_file_path]
else:
continue
assert summary_list
@@ -303,16 +304,15 @@ class TestSummary:
break
return tags
@pytest.mark.level1
@pytest.mark.platform_x86_ascend_training
@pytest.mark.platform_arm_ascend_training
@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
@security_off_wrap
def test_summary_collector_landscape(self):
"""Test summary collector with landscape."""
set_seed(1)
interval_1 = [1, 2, 3]
num_samples = 2
num_samples = 6
summary_dir = self._train_network(epoch=3, num_samples=num_samples,
collect_specified_data={'collect_landscape':
{'landscape_size': 4,
@@ -324,19 +324,17 @@ class TestSummary:
tag_list = self._list_summary_collect_landscape_tags(summary_dir)
expected_tags = {'epoch_group', 'model_params_file_map', 'step_per_epoch', 'unit', 'num_samples',
'landscape_size', 'create_landscape', 'loss_map'}
'landscape_size', 'create_landscape'}
assert set(expected_tags) == set(tag_list)
device_target = context.get_context("device_target")
device_id = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0
summary_landscape = SummaryLandscape(summary_dir)
summary_landscape.gen_landscapes_with_multi_process(callback_fn, device_ids=[device_id],
device_target=device_target)
expected_pca_value = np.array([2.0876417, 2.0871262, 2.0866107, 2.0860953, 2.0871796, 2.0866641, 2.0861477,
2.0856318, 2.0867180, 2.0862016, 2.0856854, 2.0851683, 2.0862572, 2.0857398,
2.0852231, 2.0847058])
expected_random_value = np.array([2.0066809, 1.9905004, 1.9798302, 1.9742643, 2.0754160, 2.0571522, 2.0442397,
2.0365926, 2.1506545, 2.1299571, 2.1143755, 2.1042551, 2.2315959, 2.2083559,
2.1895625, 2.1762595])
summary_landscape.gen_landscapes_with_multi_process(callback_fn, device_ids=[device_id])
expected_pca_value = np.array([2.2795506, 2.2795567, 2.2795629, 2.2795689, 2.2795507, 2.2795567, 2.2795629,
2.2795688, 2.2795505, 2.2795566, 2.2795628, 2.2795689, 2.2795505, 2.2795566,
2.2795627, 2.2795687])
expected_random_value = np.array([2.2732414, 2.2778292, 2.2829423, 2.2885174, 2.2725525, 2.2772029, 2.2822288,
2.2875323, 2.2726187, 2.2771581, 2.2819989, 2.2875887, 2.2732263, 2.2774866,
2.2823269, 2.2883627])
tag_list_landscape = self._list_landscape_tags(summary_dir)
assert np.all(expected_pca_value - tag_list_landscape[0] < 1.e-3)
assert np.all(expected_random_value - tag_list_landscape[1] < 1.e-3)
assert np.all(abs(expected_pca_value - tag_list_landscape[0]) < 1.e-6)
assert np.all(abs(expected_random_value - tag_list_landscape[1]) < 1.e-6)

+ 2
- 4
tests/ut/python/train/test_landscape.py View File

@@ -19,7 +19,7 @@ import tempfile
import pytest

from mindspore.common import set_seed
from mindspore import nn, context
from mindspore import nn
from mindspore.nn.metrics import Loss
from mindspore.train import Model
from mindspore.train.callback import SummaryLandscape
@@ -83,7 +83,6 @@ class TestLandscape:
])
def test_params_gen_landscape_with_multi_process_value_type_error(self, collect_landscape):
"""Test the value of gen_landscape_with_multi_process param."""
device_target = context.get_context("device_target")
device_id = int(os.getenv('DEVICE_ID')) if os.getenv('DEVICE_ID') else 0
summary_dir = tempfile.mkdtemp(dir=self.base_summary_dir)
summary_landscape = SummaryLandscape(summary_dir)
@@ -91,8 +90,7 @@ class TestLandscape:
summary_landscape.gen_landscapes_with_multi_process(
callback_fn,
collect_landscape=collect_landscape,
device_ids=[device_id],
device_target=device_target
device_ids=[device_id]
)
param_name = list(collect_landscape)[0]
param_value = collect_landscape[param_name]


Loading…
Cancel
Save