| @@ -25,5 +25,14 @@ class Parameter(Tensor): | |||
def __init__(self, value, *, dtype=None, device=None, requires_grad=True):
    """Wrap *value* as a trainable parameter.

    :param value: initial value. An existing :class:`Tensor` is adopted
        as-is (its storage is shared, not copied); anything else is
        converted with :func:`tensor`.
    :param dtype: dtype used when converting a non-Tensor *value*.
    :param device: device used when converting a non-Tensor *value*.
    :param requires_grad: gradient flag used when converting a
        non-Tensor *value*.
    """
    # pylint: disable=super-init-not-called
    if isinstance(value, Tensor):
        # Adopt the tensor directly so callers (e.g. ParamPack.forward)
        # can hand in pre-built tensors without an extra copy.
        # NOTE(review): dtype/device/requires_grad are silently ignored
        # on this path — confirm no caller passes conflicting values.
        t = value
    else:
        t = tensor(value, dtype=dtype, device=device, requires_grad=requires_grad)
    # Steal the constructed tensor's state wholesale instead of calling
    # Tensor.__init__ (hence the pylint disable above).
    self.__dict__.update(t.__dict__)
@property
def shape(self):
    r"""Return shape of parameter.

    Read from the underlying symbolic variable's ``imm_shape``, so the
    value reflects the parameter's concrete (immediate) shape.
    """
    return self._symvar.imm_shape
| @@ -16,3 +16,4 @@ from .linear import Linear | |||
| from .module import Module | |||
| from .pooling import AvgPool2d, MaxPool2d | |||
| from .sequential import Sequential | |||
| from .parampack import ParamPack | |||
| @@ -168,6 +168,29 @@ class Module(metaclass=ABCMeta): | |||
| """ | |||
| yield from self._flatten(predicate=_is_buffer, recursive=recursive) | |||
def replace_param(self,
                  params: dict,
                  start_pos: int,
                  seen: Optional[Set[int]] = None):
    """Recursively replace this module's parameters with entries of *params*.

    Parameters are visited in sorted attribute order; the k-th parameter
    encountered under this module corresponds to global index
    ``start_pos + k`` and is swapped for ``params[start_pos + k]`` when
    that key is present.

    :param params: mapping from global parameter index to the
        replacement Parameter; shapes must match the parameter replaced.
    :param start_pos: global index of this module's first parameter.
    :param seen: ids of objects already visited, used to break cycles
        and skip shared sub-modules; created automatically on the
        outermost call.
    :return: number of parameters encountered under this module.
    """
    offset = 0
    if seen is None:
        seen = {id(self)}
    module_dict = vars(self)
    # Sorted iteration keeps the global parameter numbering deterministic
    # across calls (and matches the order parameters were packed in).
    for key in sorted(module_dict):
        hash_id = id(module_dict[key])
        if hash_id in seen:
            continue
        seen.add(hash_id)
        if isinstance(module_dict[key], Parameter):
            if start_pos + offset in params:
                assert module_dict[key].shape == params[start_pos + offset].shape
                module_dict[key] = params[start_pos + offset]
            offset += 1
        if isinstance(module_dict[key], Module):
            offset += module_dict[key].replace_param(
                params, start_pos + offset, seen)
    return offset
| def named_buffers( | |||
| self, prefix: str = "", recursive: bool = True | |||
| ) -> Iterable[Tuple[str, Buffer]]: | |||
| @@ -0,0 +1,117 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| # | |||
| # Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, | |||
| # software distributed under the License is distributed on an | |||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| import collections | |||
| from typing import Iterable, Optional | |||
| import numpy as np | |||
| from ..core import Parameter, Tensor | |||
| from .module import Module | |||
| from .._internal.opr import param_pack_split | |||
class ParamPack(Module):
    r"""Pack a wrapped model's parameters into a few large tensors.

    Parameters sharing the same ``(dtype, device, requires_grad)`` are
    concatenated (with device-alignment padding) into shared storage.
    :meth:`parameters` exposes the packed tensors to the optimizer, and
    :meth:`forward` splits them back into per-layer parameters and
    splices them into the wrapped model before delegating to it.
    """

    def __init__(self,
                 model: Module,
                 nr_ignore_first: int = 8,
                 max_size_per_group: int = 10,
                 max_nr_params_per_group: int = 100):
        """
        :param model: module whose parameters are to be packed.
        :param nr_ignore_first: number of leading parameters kept
            unpacked, each forming its own single-member group.
        :param max_size_per_group: soft cap, in MiB, on the byte size of
            one packed tensor.
        :param max_nr_params_per_group: cap on how many parameters one
            packed tensor may contain.
        """
        super().__init__()
        self._model = model
        self._nr_ignore_first = nr_ignore_first
        self._max_size_per_group = max_size_per_group
        self._max_nr_params_per_group = max_nr_params_per_group
        # Parallel lists: _grouped_params[i] holds the member records
        # ({'tensor': Parameter, 'id': int}) folded into
        # _packed_params[i]; a single-member group means the parameter
        # was left unpacked.
        self._grouped_params = []
        self._packed_params = []

        params = model.parameters()
        self._pack_params(params)

    def parameters(self, requires_grad: Optional[bool] = None) -> Iterable[Parameter]:
        """Yield the packed parameters instead of the model's own,
        optionally filtered by ``requires_grad``."""
        for param in self._packed_params:
            if requires_grad is None or param.requires_grad == requires_grad:
                yield param

    def _pack_params(self, params: Iterable[Parameter]):
        """Group *params* and build one packed tensor per group."""
        groups = collections.defaultdict(list)
        ignored = 0
        param_id = 0
        for param in params:
            if self._nr_ignore_first > ignored:
                # Leading parameters stay unpacked, one group each.
                ignored += 1
                self._grouped_params.append([{'tensor': param, 'id': param_id}])
                self._packed_params.append(param)
            else:
                # Only parameters with identical dtype, device and
                # gradient requirement may share storage.
                key = (param.dtype, param.device, param.requires_grad)
                groups[key].append({'tensor': param, 'id': param_id})
            param_id += 1
        for (dtype, device, requires_grad), group in groups.items():
            dtype_sz = np.dtype(dtype).itemsize
            # Convert the device's byte alignment into elements.
            align = device.mem_align
            if align < dtype_sz:
                align = 1
            else:
                assert align % dtype_sz == 0
                align //= dtype_sz

            while group:
                aligned_pos = []
                offset = 0  # running size of the pack, in elements
                members = []
                idx = 0
                while idx < len(group):
                    entry = group[idx]
                    assert entry['tensor'].device == device
                    # Round offset up to the next aligned element
                    # boundary (assumes align is a power of two, derived
                    # from device.mem_align — TODO confirm).
                    padding = (align - (offset & (align - 1))) & (align - 1)
                    offset += padding
                    aligned_pos.append(offset)
                    members.append(entry)
                    offset += int(np.prod(entry['tensor'].shape))
                    idx += 1
                    # Close this pack once it is big enough or full.
                    if (offset * dtype_sz >=
                            self._max_size_per_group * 1024 * 1024
                            or idx >= self._max_nr_params_per_group):
                        break
                group = group[idx:]
                if idx == 1:
                    # A pack of one: keep the original tensor, no copy.
                    # (Bug fix: previously the whole {'tensor', 'id'}
                    # record was appended instead of the tensor itself,
                    # which broke parameters() for such groups.)
                    self._packed_params.append(members[0]['tensor'])
                    self._grouped_params.append(members)
                    continue
                # Copy each member into its aligned slot of one flat
                # buffer and wrap the buffer as a single Parameter.
                packed_value = np.zeros((offset, ), dtype=dtype)
                for entry, pos in zip(members, aligned_pos):
                    val = entry['tensor'].numpy()
                    packed_value[pos:pos + val.size] = val.flatten()
                new_param = Parameter(value=packed_value,
                                      device=device,
                                      dtype=dtype,
                                      requires_grad=requires_grad)
                self._packed_params.append(new_param)
                self._grouped_params.append(members)

    def forward(self, *args, **kwargs):
        """Split each packed tensor back into its member parameters,
        splice them into the wrapped model, then run its forward."""
        replacements = dict()
        for packed, members in zip(self._packed_params, self._grouped_params):
            if len(members) == 1:
                continue  # unpacked parameter, nothing to split
            split = param_pack_split(packed._symvar,
                                     [m['tensor'].shape for m in members])
            split = [
                Parameter(Tensor(s, requires_grad=packed.requires_grad))
                for s in split
            ]
            for member, part in zip(members, split):
                replacements[member['id']] = part
        self._model.replace_param(replacements, 0)
        return self._model.forward(*args, **kwargs)
| @@ -168,6 +168,8 @@ class Optimizer(metaclass=ABCMeta): | |||
| cg = get_default_graph() | |||
| grads = grad_func(loss, params, use_virtual_grad=not cg.is_eager()) | |||
| if not isinstance(grads, list): | |||
| grads = [grads] | |||
| assert len(grads) == len(params) | |||
| for param, grad in zip(params, grads): | |||
| @@ -0,0 +1,207 @@ | |||
| # -*- coding: utf-8 -*- | |||
| # MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
| # | |||
| # Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, | |||
| # software distributed under the License is distributed on an | |||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| import itertools | |||
| import numpy as np | |||
| import pytest | |||
| import megengine as mge | |||
| from megengine.core import tensor | |||
| from megengine.functional import cross_entropy_with_softmax, tanh | |||
| from megengine.jit import trace | |||
| from megengine.module import Linear, Module, ParamPack | |||
| from megengine.optimizer import SGD | |||
batch_size = 64
data_shape = (batch_size, 2)
label_shape = (batch_size,)


def minibatch_generator():
    """Yield endless XOR-style minibatches.

    Each batch is ``(data, label)``: data rows are points sampled from
    U[-1, 1]^2 (float32); label is 1 when the product of the coordinates
    is non-negative, else 0 (int32).
    """
    while True:
        samples = np.zeros((batch_size, 2))
        targets = np.zeros(batch_size, dtype=np.int32)
        for row in range(batch_size):
            # One U[-1, 1] point per row; same RNG call per sample.
            point = np.random.rand(2) * 2 - 1
            samples[row, :] = point
            targets[row] = 1 if np.prod(point) >= 0 else 0
        yield samples.astype(np.float32), targets.astype(np.int32)
def calculate_precision(data: np.ndarray, pred: np.ndarray) -> float:
    """ Calculate precision for given data and prediction.

    :type data: [[x, y], ...]
    :param data: Input data
    :type pred: [[x_pred, y_pred], ...]
    :param pred: Network output data
    """
    assert len(data) == len(pred)
    # Ground truth follows the XOR rule: label 0 iff the coordinates
    # have opposite signs (negative product).
    hits = sum(
        1
        for point, scores in zip(data, pred)
        if np.argmax(scores) == (0 if np.prod(point) < 0 else 1)
    )
    return float(hits) / len(data)
class XORNet(Module):
    """Three-layer tanh MLP for the 2-D XOR toy problem."""

    def __init__(self):
        self.mid_layers = 14  # hidden width
        self.num_class = 2
        super().__init__()

        self.fc0 = Linear(self.num_class, self.mid_layers, bias=True)
        self.fc1 = Linear(self.mid_layers, self.mid_layers, bias=True)
        self.fc2 = Linear(self.mid_layers, self.num_class, bias=True)

    def forward(self, x):
        hidden = tanh(self.fc0(x))
        hidden = tanh(self.fc1(hidden))
        return self.fc2(hidden)
@pytest.mark.slow
def test_static_graph_parampack():
    """End-to-end: a ParamPack-wrapped net must train to low loss and
    high precision under static (symbolic) tracing."""
    net = XORNet()
    # nr_ignore_first=0 forces every parameter through the packing path.
    net = ParamPack(net,
                    nr_ignore_first=0,
                    max_size_per_group=10,
                    max_nr_params_per_group=100)
    # The optimizer sees the packed tensors, not the original layers'.
    opt = SGD(
        net.parameters(requires_grad=True), lr=0.01, momentum=0.9, weight_decay=5e-4
    )

    @trace(symbolic=True)
    def train(data, label):
        pred = net(data)
        opt.zero_grad()
        loss = cross_entropy_with_softmax(pred, label)
        opt.backward(loss)
        return loss

    @trace(symbolic=True)
    def infer(data):
        return net(data)

    train_dataset = minibatch_generator()
    losses = []

    for data, label in itertools.islice(train_dataset, 2000):
        loss = train(data, label)
        # unwrap the scalar — trace appears to return it nested; TODO confirm
        loss = loss[0][0]
        opt.step()
        losses.append(loss.numpy())
    assert np.mean(losses[-100:]) < 0.1, "Final training Loss must be low enough"

    data, _ = next(train_dataset)
    pred = infer(data).numpy()
    assert calculate_precision(data, pred) > 0.95, "Test precision must be high enough"
@pytest.mark.slow
def test_dynamic_graph_parampack():
    """End-to-end: same training check as the static test, but with
    eager (non-symbolic) tracing."""
    net = XORNet()
    # nr_ignore_first=0 forces every parameter through the packing path.
    net = ParamPack(net,
                    nr_ignore_first=0,
                    max_size_per_group=10,
                    max_nr_params_per_group=100)
    # The optimizer sees the packed tensors, not the original layers'.
    opt = SGD(
        net.parameters(requires_grad=True), lr=0.01, momentum=0.9, weight_decay=5e-4
    )

    @trace(symbolic=False)
    def train(data, label):
        pred = net(data)
        opt.zero_grad()
        loss = cross_entropy_with_softmax(pred, label)
        opt.backward(loss)
        return loss

    @trace(symbolic=False)
    def infer(data):
        return net(data)

    train_dataset = minibatch_generator()
    losses = []

    for data, label in itertools.islice(train_dataset, 2000):
        loss = train(data, label)
        # unwrap the scalar — trace appears to return it nested; TODO confirm
        loss = loss[0][0]
        opt.step()
        losses.append(loss.numpy())
    assert np.mean(losses[-100:]) < 0.1, "Final training Loss must be low enough"

    data, _ = next(train_dataset)
    pred = infer(data).numpy()
    assert calculate_precision(data, pred) > 0.95, "Test precision must be high enough"
@pytest.mark.slow
def test_correctness_parampack():
    """Train a packed and an unpacked copy of the same net in lockstep on
    identical data and check they produce identical predictions — i.e.
    packing must not change numerical behavior."""
    net1 = XORNet()
    net2 = XORNet()
    params1 = net1.parameters()
    params2 = net2.parameters()
    # Start both nets from identical weights.
    for param1, param2 in zip(params1, params2):
        param1.set_value(param2.numpy())
    # Only net1 is packed; net2 is the reference.
    net1 = ParamPack(net1,
                     nr_ignore_first=0,
                     max_size_per_group=10,
                     max_nr_params_per_group=100)
    opt1 = SGD(
        net1.parameters(requires_grad=True), lr=0.01, momentum=0.9, weight_decay=5e-4
    )
    opt2 = SGD(
        net2.parameters(requires_grad=True), lr=0.01, momentum=0.9, weight_decay=5e-4
    )

    @trace(symbolic=False)
    def train1(data, label):
        pred = net1(data)
        opt1.zero_grad()
        loss = cross_entropy_with_softmax(pred, label)
        opt1.backward(loss)
        return loss

    @trace(symbolic=False)
    def train2(data, label):
        pred = net2(data)
        opt2.zero_grad()
        loss = cross_entropy_with_softmax(pred, label)
        opt2.backward(loss)
        return loss

    @trace(symbolic=False)
    def infer1(data):
        return net1(data)

    @trace(symbolic=False)
    def infer2(data):
        return net2(data)

    train_dataset = minibatch_generator()

    # Feed the SAME minibatch to both nets each step so their parameter
    # trajectories stay comparable.
    for data, label in itertools.islice(train_dataset, 2000):
        train1(data, label)
        opt1.step()
        train2(data, label)
        opt2.step()

    data, _ = next(train_dataset)
    pred1 = infer1(data).numpy()
    pred2 = infer2(data).numpy()

    assert np.allclose(pred1, pred2)