- refactor(mge): add support for optimizer.step().clear_grad() idiom
- refactor(mge): rename some methods of GradManager
- refactor(mge): remove tensor_nn and TensorDict
- refactor(mge): remove Buffer
- refactor(mge): remove requires_grad flag
- refactor(mge): add a default grad=None attribute to Tensor
- refactor(mge): deprecation for 1.0
GitOrigin-RevId: 3b723d9387
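Taken together, the commits above define a new training-loop surface. Below is a minimal sketch of the post-refactor idiom; the module paths `megengine.autodiff` and `megengine.optimizer` are assumed from the `ad`/`optimizer` aliases used in the updated tests at the end of this diff, and the tiny `Linear` model plus random data are placeholders.

```python
import numpy as np
import megengine as mge
import megengine.autodiff as ad
import megengine.functional as F
import megengine.module as M
import megengine.optimizer as optim

net = M.Linear(4, 2)                             # tiny stand-in model
gm = ad.GradManager().attach(net.parameters())   # attach() was previously register()
opt = optim.SGD(net.parameters(), lr=0.01)

data = mge.tensor(np.random.rand(8, 4).astype("float32"))
label = mge.tensor(np.random.randint(2, size=(8,)).astype("int32"))

with gm:                                         # previously "with gm.record():"
    loss = F.cross_entropy_with_softmax(net(data), label)
    gm.backward(loss)                            # fills param.grad on the attached Parameters
opt.step().clear_grad()                          # step() now returns self, so the calls chain
```

The hunks below implement each piece of this idiom.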
| @@ -74,8 +74,7 @@ from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func | |||||
| from .device import * | from .device import * | ||||
| from .logger import enable_debug_log, get_logger, set_log_file, set_log_level | from .logger import enable_debug_log, get_logger, set_log_file, set_log_level | ||||
| from .serialization import load, save | from .serialization import load, save | ||||
| from .tensor import Tensor, tensor | |||||
| from .tensor_nn import Buffer, Parameter | |||||
| from .tensor import Parameter, Tensor, tensor | |||||
| from .version import __version__ | from .version import __version__ | ||||
| _set_fork_exec_path_for_timed_func( | _set_fork_exec_path_for_timed_func( | ||||
| @@ -22,7 +22,7 @@ class GradManager: | |||||
| self._after_backward_callback = [] | self._after_backward_callback = [] | ||||
| self._gradients = dict() | self._gradients = dict() | ||||
| def register(self, params, callbacks=None): | |||||
| def attach(self, params, callbacks=None): | |||||
| if callbacks is None: | if callbacks is None: | ||||
| callbacks = [] | callbacks = [] | ||||
| if isinstance(callbacks, Callable): | if isinstance(callbacks, Callable): | ||||
| @@ -62,7 +62,7 @@ class GradManager: | |||||
| if isinstance(grad, Future): | if isinstance(grad, Future): | ||||
| grad = grad.get() | grad = grad.get() | ||||
| param = self._param_dict[p] | param = self._param_dict[p] | ||||
| if getattr(param, "grad", None) is None: | |||||
| if param.grad is None: | |||||
| param.grad = grad | param.grad = grad | ||||
| else: | else: | ||||
| param.grad += grad | param.grad += grad | ||||
| @@ -70,9 +70,9 @@ class GradManager: | |||||
| self._stop_record() | self._stop_record() | ||||
| backwarding_grad_manager = cache | backwarding_grad_manager = cache | ||||
| def __enter__(self): | |||||
| def record(self): | |||||
| if self._recording: | if self._recording: | ||||
| return self | |||||
| raise RuntimeError("already recording") | |||||
| grad = Grad() | grad = Grad() | ||||
| self._recording = True | self._recording = True | ||||
| self._grad = grad | self._grad = grad | ||||
| @@ -88,16 +88,22 @@ class GradManager: | |||||
| grad.wrt(param_wrapper, callback=callback) | grad.wrt(param_wrapper, callback=callback) | ||||
| grad.__enter__() | grad.__enter__() | ||||
| return self | |||||
| def __exit__(self, exc_type, exc_val, exc_tb): | |||||
| def release(self): | |||||
| if not self._recording: | |||||
| raise RuntimeError("not recording") | |||||
| self._stop_record() | self._stop_record() | ||||
| record = __enter__ | |||||
| def _stop_record(self): | def _stop_record(self): | ||||
| if self._grad is not None: | if self._grad is not None: | ||||
| self._grad.__exit__(None, None, None) | self._grad.__exit__(None, None, None) | ||||
| self._recording = False | self._recording = False | ||||
| self._grad = None | self._grad = None | ||||
| self._gradients = dict() | self._gradients = dict() | ||||
| def __enter__(self): | |||||
| self.record() | |||||
| return self | |||||
| def __exit__(self, exc_type, exc_val, exc_tb): | |||||
| self._stop_record() | |||||
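Beyond the context-manager form, the renamed methods can also be driven explicitly. Per the hunk above, `record()` refuses to nest and `release()` refuses to run when nothing is being recorded; `backward()` stops the recording by itself. A sketch, reusing `gm`, `net` and `data` from the example at the top:

```python
gm.record()            # start recording; a second record() raises RuntimeError("already recording")
loss = net(data).sum()
gm.backward(loss)      # computes grads into param.grad and stops the recording itself

gm.record()            # start again, e.g. for a forward pass that is later discarded
_ = net(data)
gm.release()           # stop recording without computing gradients
                       # (release() on an idle manager raises RuntimeError("not recording"))
```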
| @@ -70,7 +70,7 @@ class Dimshuffle(PodOpVisitor): | |||||
| return bytes(ctypes.c_uint32(0)) + bytes(self) | return bytes(ctypes.c_uint32(0)) + bytes(self) | ||||
| def __init__(self, pattern, ndim=0): | def __init__(self, pattern, ndim=0): | ||||
| assert isinstance(pattern, collections.Iterable) | |||||
| assert isinstance(pattern, collections.abc.Iterable) | |||||
| assert len(pattern) <= TensorShape.MAX_NDIM | assert len(pattern) <= TensorShape.MAX_NDIM | ||||
| pattern_array = Dimshuffle.Pattern.Pattern_Array() | pattern_array = Dimshuffle.Pattern.Pattern_Array() | ||||
| for idx, v in enumerate(pattern): | for idx, v in enumerate(pattern): | ||||
| @@ -231,13 +231,13 @@ class OpNode: | |||||
| def _wrap(x): | def _wrap(x): | ||||
| if isinstance(x, collections.Sequence): | |||||
| if isinstance(x, collections.abc.Sequence): | |||||
| return type(x)(map(_wrap, x)) | return type(x)(map(_wrap, x)) | ||||
| return x.graph._wrap(x) | return x.graph._wrap(x) | ||||
| def _unwrap(x): | def _unwrap(x): | ||||
| if isinstance(x, collections.Sequence): | |||||
| if isinstance(x, collections.abc.Sequence): | |||||
| return type(x)(map(_unwrap, x)) | return type(x)(map(_unwrap, x)) | ||||
| return x._node | return x._node | ||||
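The repeated `collections` → `collections.abc` edits in this and the following hunks are a Python-compatibility fix: the ABC aliases on the top-level `collections` module were deprecated since Python 3.3 and are removed in 3.10, so the old isinstance checks would eventually fail. A small stand-alone illustration (the helper name here is made up):

```python
import collections.abc

def _normalize_axis(axis):
    # mirrors the axis handling in the _reduce/argmin/argmax hunks below
    if isinstance(axis, collections.abc.Iterable):
        return sorted(axis, reverse=True)
    return [axis]

print(_normalize_axis((0, 2)))   # [2, 0]
print(_normalize_axis(1))        # [1]
```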
| @@ -166,7 +166,7 @@ def _reduce(mode): | |||||
| op = builtin.Reduce(mode=mode, axis=0) | op = builtin.Reduce(mode=mode, axis=0) | ||||
| (result,) = apply(op, data) | (result,) = apply(op, data) | ||||
| elif isinstance(axis, collections.Iterable): | |||||
| elif isinstance(axis, collections.abc.Iterable): | |||||
| axis = list(axis) | axis = list(axis) | ||||
| axis.sort(reverse=True) | axis.sort(reverse=True) | ||||
| @@ -204,7 +204,9 @@ def _todo(*_): | |||||
| def _expand_args(args): | def _expand_args(args): | ||||
| if len(args) == 1: | if len(args) == 1: | ||||
| if isinstance(args[0], (collections.Sequence, TensorBase, TensorWrapperBase)): | |||||
| if isinstance( | |||||
| args[0], (collections.abc.Sequence, TensorBase, TensorWrapperBase) | |||||
| ): | |||||
| args = args[0] | args = args[0] | ||||
| return args | return args | ||||
| @@ -143,7 +143,7 @@ def astensor1d(x, *reference, dtype=None, device=None): | |||||
| (x,) = Const(x, dtype=dtype, device=device)(*reference) | (x,) = Const(x, dtype=dtype, device=device)(*reference) | ||||
| return x | return x | ||||
| if not isinstance(x, collections.Sequence): | |||||
| if not isinstance(x, collections.abc.Sequence): | |||||
| raise TypeError | raise TypeError | ||||
| if any(isinstance(i, (TensorBase, TensorWrapperBase)) for i in x): | if any(isinstance(i, (TensorBase, TensorWrapperBase)) for i in x): | ||||
| @@ -432,7 +432,7 @@ def argmin( | |||||
| [0] | [0] | ||||
| """ | """ | ||||
| if isinstance(axis, collections.Iterable): | |||||
| if isinstance(axis, collections.abc.Iterable): | |||||
| axis = list(axis) | axis = list(axis) | ||||
| axis.sort(reverse=True) | axis.sort(reverse=True) | ||||
| @@ -486,7 +486,7 @@ def argmax( | |||||
| [5] | [5] | ||||
| """ | """ | ||||
| if isinstance(axis, collections.Iterable): | |||||
| if isinstance(axis, collections.abc.Iterable): | |||||
| axis = list(axis) | axis = list(axis) | ||||
| axis.sort(reverse=True) | axis.sort(reverse=True) | ||||
| @@ -15,7 +15,7 @@ def get_ndtuple(value, *, n, allow_zero=True): | |||||
| :type allow_zero: bool | :type allow_zero: bool | ||||
| :param allow_zero: whether to allow zero tuple value""" | :param allow_zero: whether to allow zero tuple value""" | ||||
| if not isinstance(value, collections.Iterable): | |||||
| if not isinstance(value, collections.abc.Iterable): | |||||
| value = int(value) | value = int(value) | ||||
| value = tuple([value for i in range(n)]) | value = tuple([value for i in range(n)]) | ||||
| else: | else: | ||||
| @@ -502,7 +502,7 @@ class trace: | |||||
| raise TypeError( | raise TypeError( | ||||
| "cannot specify output_names when output is already in dict format" | "cannot specify output_names when output is already in dict format" | ||||
| ) | ) | ||||
| if output_names and not isinstance(output_names, collections.Sequence): | |||||
| if output_names and not isinstance(output_names, collections.abc.Sequence): | |||||
| output_names = (output_names,) | output_names = (output_names,) | ||||
| if output_names and len(output_names) != len(self._output_bindings): | if output_names and len(output_names) != len(self._output_bindings): | ||||
| raise ValueError( | raise ValueError( | ||||
| @@ -510,7 +510,7 @@ class trace: | |||||
| len(self._output_bindings) | len(self._output_bindings) | ||||
| ) | ) | ||||
| ) | ) | ||||
| if arg_names and not isinstance(arg_names, collections.Sequence): | |||||
| if arg_names and not isinstance(arg_names, collections.abc.Sequence): | |||||
| arg_names = (arg_names,) | arg_names = (arg_names,) | ||||
| if arg_names and len(arg_names) != len(self._arg_bindings): | if arg_names and len(arg_names) != len(self._arg_bindings): | ||||
| raise ValueError( | raise ValueError( | ||||
| @@ -646,9 +646,9 @@ class trace: | |||||
| def _process_outputs(self, outputs): | def _process_outputs(self, outputs): | ||||
| output_names = None | output_names = None | ||||
| if isinstance(outputs, collections.Mapping): | |||||
| if isinstance(outputs, collections.abc.Mapping): | |||||
| output_names, outputs = zip(*sorted(outputs.items())) | output_names, outputs = zip(*sorted(outputs.items())) | ||||
| elif not isinstance(outputs, collections.Sequence): | |||||
| elif not isinstance(outputs, collections.abc.Sequence): | |||||
| outputs = (outputs,) | outputs = (outputs,) | ||||
| if not self._untraced: | if not self._untraced: | ||||
| @@ -18,7 +18,6 @@ from .embedding import Embedding | |||||
| from .identity import Identity | from .identity import Identity | ||||
| from .linear import Linear | from .linear import Linear | ||||
| from .module import Module | from .module import Module | ||||
| from .parampack import ParamPack | |||||
| from .pooling import AvgPool2d, MaxPool2d | from .pooling import AvgPool2d, MaxPool2d | ||||
| from .quant_dequant import DequantStub, QuantStub | from .quant_dequant import DequantStub, QuantStub | ||||
| from .sequential import Sequential | from .sequential import Sequential | ||||
| @@ -9,7 +9,7 @@ | |||||
| import numpy as np | import numpy as np | ||||
| from ..functional import leaky_relu, prelu, relu, sigmoid, softmax | from ..functional import leaky_relu, prelu, relu, sigmoid, softmax | ||||
| from ..tensor_nn import Parameter | |||||
| from ..tensor import Parameter | |||||
| from .module import Module | from .module import Module | ||||
| @@ -12,7 +12,7 @@ import numpy as np | |||||
| from ..distributed.group import WORLD, Group | from ..distributed.group import WORLD, Group | ||||
| from ..functional import batch_norm2d, sync_batch_norm | from ..functional import batch_norm2d, sync_batch_norm | ||||
| from ..tensor_nn import Buffer, Parameter, Tensor | |||||
| from ..tensor import Parameter, Tensor | |||||
| from . import init | from . import init | ||||
| from .module import Module | from .module import Module | ||||
| @@ -45,8 +45,8 @@ class _BatchNorm(Module): | |||||
| tshape = (1, self.num_features, 1, 1) | tshape = (1, self.num_features, 1, 1) | ||||
| if self.track_running_stats: | if self.track_running_stats: | ||||
| self.running_mean = Buffer(np.zeros(tshape, dtype=np.float32)) | |||||
| self.running_var = Buffer(np.ones(tshape, dtype=np.float32)) | |||||
| self.running_mean = Tensor(np.zeros(tshape, dtype=np.float32)) | |||||
| self.running_var = Tensor(np.ones(tshape, dtype=np.float32)) | |||||
| else: | else: | ||||
| self.running_mean = None | self.running_mean = None | ||||
| self.running_var = None | self.running_var = None | ||||
| @@ -13,7 +13,7 @@ import numpy as np | |||||
| from ..core.ops._internal import param_defs as P | from ..core.ops._internal import param_defs as P | ||||
| from ..functional import conv2d, conv_transpose2d, local_conv2d, relu | from ..functional import conv2d, conv_transpose2d, local_conv2d, relu | ||||
| from ..functional.types import _pair, _pair_nonzero | from ..functional.types import _pair, _pair_nonzero | ||||
| from ..tensor_nn import Parameter | |||||
| from ..tensor import Parameter | |||||
| from . import init | from . import init | ||||
| from .module import Module | from .module import Module | ||||
| @@ -11,7 +11,7 @@ from typing import Optional | |||||
| import numpy as np | import numpy as np | ||||
| from ..functional import embedding as embedding_func | from ..functional import embedding as embedding_func | ||||
| from ..tensor_nn import Parameter | |||||
| from ..tensor import Parameter | |||||
| from . import init | from . import init | ||||
| from .module import Module | from .module import Module | ||||
| @@ -72,6 +72,7 @@ class Embedding(Module): | |||||
| max_norm: Optional[float] = None, | max_norm: Optional[float] = None, | ||||
| norm_type: Optional[float] = None, | norm_type: Optional[float] = None, | ||||
| initial_weight: Parameter = None, | initial_weight: Parameter = None, | ||||
| freeze: bool = False, | |||||
| ): | ): | ||||
| super().__init__() | super().__init__() | ||||
| if padding_idx is not None: | if padding_idx is not None: | ||||
| @@ -83,6 +84,7 @@ class Embedding(Module): | |||||
| self.norm_type = norm_type | self.norm_type = norm_type | ||||
| self.num_embeddings = num_embeddings | self.num_embeddings = num_embeddings | ||||
| self.embedding_dim = embedding_dim | self.embedding_dim = embedding_dim | ||||
| self.freeze = freeze | |||||
| if initial_weight is None: | if initial_weight is None: | ||||
| self.weight = Parameter( | self.weight = Parameter( | ||||
| np.random.uniform( | np.random.uniform( | ||||
| @@ -101,7 +103,11 @@ class Embedding(Module): | |||||
| init.normal_(self.weight) | init.normal_(self.weight) | ||||
| def forward(self, inputs): | def forward(self, inputs): | ||||
| return embedding_func(inputs, self.weight) | |||||
| if self.freeze: | |||||
| weight = self.weight.detach() | |||||
| else: | |||||
| weight = self.weight | |||||
| return embedding_func(inputs, weight) | |||||
| @classmethod | @classmethod | ||||
| def from_pretrained( | def from_pretrained( | ||||
| @@ -166,6 +172,6 @@ class Embedding(Module): | |||||
| padding_idx=padding_idx, | padding_idx=padding_idx, | ||||
| max_norm=max_norm, | max_norm=max_norm, | ||||
| norm_type=norm_type, | norm_type=norm_type, | ||||
| freeze=freeze, | |||||
| ) | ) | ||||
| embedding.weight.requires_grad = not freeze | |||||
| return embedding | return embedding | ||||
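With `requires_grad` gone, freezing a pretrained embedding table becomes a constructor/forward concern: the new `freeze` flag makes `forward()` look up through `self.weight.detach()`. A sketch of the intended usage; the exact argument types accepted by `from_pretrained` and the int32 index dtype are assumptions:

```python
import numpy as np
import megengine as mge
import megengine.module as M

table = mge.Parameter(np.random.rand(10, 4).astype("float32"))   # 10 ids, dim 4
emb = M.Embedding.from_pretrained(table, freeze=True)            # freeze=... is forwarded to __init__

ids = mge.tensor(np.array([0, 3, 7], dtype="int32"))
out = emb(ids)    # lookups go through weight.detach(), so no gradient reaches the table
```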
| @@ -23,7 +23,7 @@ def fill_(tensor: Tensor, val: Union[float, int]) -> None: | |||||
| :param tensor: An n-dimentional tensor to be initialized | :param tensor: An n-dimentional tensor to be initialized | ||||
| :param val: The value to be filled throughout the tensor | :param val: The value to be filled throughout the tensor | ||||
| """ | """ | ||||
| tensor.set_value(full(shape=tensor.shape, value=val, dtype=tensor.dtype)) | |||||
| tensor._reset(full(shape=tensor.shape, value=val, dtype=tensor.dtype)) | |||||
| def zeros_(tensor: Tensor) -> None: | def zeros_(tensor: Tensor) -> None: | ||||
| @@ -50,7 +50,7 @@ def uniform_(tensor: Tensor, a: float = 0.0, b: float = 1.0) -> None: | |||||
| :param a: Lower bound of the sampling interval | :param a: Lower bound of the sampling interval | ||||
| :param b: Upper bound of the sampling interval | :param b: Upper bound of the sampling interval | ||||
| """ | """ | ||||
| tensor.set_value(uniform(tensor.shape, low=a, high=b).astype(tensor.dtype)) | |||||
| tensor._reset(uniform(tensor.shape, low=a, high=b).astype(tensor.dtype)) | |||||
| def normal_(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> None: | def normal_(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> None: | ||||
| @@ -61,7 +61,7 @@ def normal_(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> None: | |||||
| :param mean: The mean of the normal distribution | :param mean: The mean of the normal distribution | ||||
| :param std: The standard deviation of the normal distribution | :param std: The standard deviation of the normal distribution | ||||
| """ | """ | ||||
| tensor.set_value(gaussian(tensor.shape, mean=mean, std=std).astype(tensor.dtype)) | |||||
| tensor._reset(gaussian(tensor.shape, mean=mean, std=std).astype(tensor.dtype)) | |||||
| def calculate_gain( | def calculate_gain( | ||||
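The init hunks only swap the now-deprecated `set_value()` for the private `_reset()`; the user-facing initializers behave the same. For instance (import path assumed from the `from . import init` lines in this diff):

```python
import numpy as np
import megengine as mge
from megengine.module import init

w = mge.Parameter(np.zeros((3, 3), dtype="float32"))
init.fill_(w, 0.5)                 # internally w._reset(full(...)) instead of w.set_value(...)
init.uniform_(w, a=-0.1, b=0.1)
init.normal_(w, mean=0.0, std=0.02)
```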
| @@ -8,7 +8,7 @@ | |||||
| import numpy as np | import numpy as np | ||||
| from ..functional import linear | from ..functional import linear | ||||
| from ..tensor_nn import Parameter | |||||
| from ..tensor import Parameter | |||||
| from . import init | from . import init | ||||
| from .module import Module | from .module import Module | ||||
| @@ -5,6 +5,7 @@ | |||||
| # Unless required by applicable law or agreed to in writing, | # Unless required by applicable law or agreed to in writing, | ||||
| # software distributed under the License is distributed on an | # software distributed under the License is distributed on an | ||||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| import warnings | |||||
| from abc import ABCMeta, abstractmethod | from abc import ABCMeta, abstractmethod | ||||
| from collections import OrderedDict | from collections import OrderedDict | ||||
| from typing import Any, Callable, Iterable, Optional, Set, Tuple, Union | from typing import Any, Callable, Iterable, Optional, Set, Tuple, Union | ||||
| @@ -14,8 +15,8 @@ import numpy as np | |||||
| from ..core.tensor.dtype import is_quantize | from ..core.tensor.dtype import is_quantize | ||||
| from ..core.tensor.utils import make_shape_tuple | from ..core.tensor.utils import make_shape_tuple | ||||
| from ..logger import get_logger | from ..logger import get_logger | ||||
| from ..tensor import Tensor | |||||
| from ..tensor_nn import Buffer, Parameter | |||||
| from ..tensor import Parameter, Tensor | |||||
| from ..utils.deprecation import deprecated | |||||
| from ..utils.hook import HookHandler | from ..utils.hook import HookHandler | ||||
| logger = get_logger(__name__) | logger = get_logger(__name__) | ||||
| @@ -48,7 +49,7 @@ def _is_parameter(obj): | |||||
| def _is_buffer(obj): | def _is_buffer(obj): | ||||
| return isinstance(obj, Buffer) | |||||
| return isinstance(obj, Tensor) and not isinstance(obj, Parameter) | |||||
| def _is_module(obj): | def _is_module(obj): | ||||
| @@ -163,49 +164,43 @@ class Module(metaclass=ABCMeta): | |||||
| seen=seen, | seen=seen, | ||||
| ) | ) | ||||
| def parameters( | |||||
| self, requires_grad: Optional[bool] = None, recursive: bool = True, **kwargs | |||||
| ) -> Iterable[Parameter]: | |||||
| def parameters(self, recursive: bool = True, **kwargs) -> Iterable[Parameter]: | |||||
| r"""Returns an iterable for the :class:`~.Parameter` of the module. | r"""Returns an iterable for the :class:`~.Parameter` of the module. | ||||
| :param requires_grad: Limitation over the :attr:`~.Parameter.requires_grad` | |||||
| attribute of returned :class:`.Parameter`. ``None`` for no limitation. | |||||
| :param recursive: If ``True``, returns all :class:`~.Parameter` within this | :param recursive: If ``True``, returns all :class:`~.Parameter` within this | ||||
| module, else only returns :class:`~.Parameter` that are direct attributes | module, else only returns :class:`~.Parameter` that are direct attributes | ||||
| of this module. | of this module. | ||||
| """ | """ | ||||
| if "requires_grad" in kwargs: | |||||
| del kwargs["requires_grad"] | |||||
| warnings.warn("passing requires_grad has no effect currently") | |||||
| def predicate(obj) -> bool: | def predicate(obj) -> bool: | ||||
| return _is_parameter(obj) and ( | |||||
| requires_grad is None or obj.requires_grad == requires_grad | |||||
| ) | |||||
| return _is_parameter(obj) | |||||
| yield from self._flatten( | yield from self._flatten( | ||||
| with_key=False, predicate=predicate, recursive=recursive, **kwargs | with_key=False, predicate=predicate, recursive=recursive, **kwargs | ||||
| ) | ) | ||||
| def named_parameters( | def named_parameters( | ||||
| self, | |||||
| requires_grad: Optional[bool] = None, | |||||
| prefix: Optional[str] = None, | |||||
| recursive: bool = True, | |||||
| **kwargs | |||||
| self, prefix: Optional[str] = None, recursive: bool = True, **kwargs | |||||
| ) -> Iterable[Tuple[str, Parameter]]: | ) -> Iterable[Tuple[str, Parameter]]: | ||||
| """Returns an iterable for key :class:`~.Parameter` pairs of the module, where | """Returns an iterable for key :class:`~.Parameter` pairs of the module, where | ||||
| ``key`` is the dotted path from this module to the :class:`~.Parameter` . | ``key`` is the dotted path from this module to the :class:`~.Parameter` . | ||||
| :param requires_grad: Limitation over the :attr:`~.Parameter.requires_grad` | |||||
| attribute of returned :class:`~.Parameter` . ``None`` for no limitation. | |||||
| :param prefix: The prefix prepended to the keys. | :param prefix: The prefix prepended to the keys. | ||||
| :param recursive: If ``True``, returns all :class:`~.Parameter` within this | :param recursive: If ``True``, returns all :class:`~.Parameter` within this | ||||
| module, else only returns :class:`~.Parameter` that are direct attributes | module, else only returns :class:`~.Parameter` that are direct attributes | ||||
| of this module. | of this module. | ||||
| """ | """ | ||||
| if "requires_grad" in kwargs: | |||||
| del kwargs["requires_grad"] | |||||
| warnings.warn("passing requires_grad has no effect currently") | |||||
| def predicate(obj) -> bool: | def predicate(obj) -> bool: | ||||
| return _is_parameter(obj) and ( | |||||
| requires_grad is None or obj.requires_grad == requires_grad | |||||
| ) | |||||
| return _is_parameter(obj) | |||||
| yield from self._flatten( | yield from self._flatten( | ||||
| with_key=True, | with_key=True, | ||||
| @@ -215,11 +210,13 @@ class Module(metaclass=ABCMeta): | |||||
| **kwargs, | **kwargs, | ||||
| ) | ) | ||||
| def buffers(self, recursive: bool = True, **kwargs) -> Iterable[Buffer]: | |||||
| """Returns an iterable for the :class:`~.Buffer` of the module. | |||||
| def buffers(self, recursive: bool = True, **kwargs) -> Iterable[Tensor]: | |||||
| """Returns an iterable for the buffers of the module. | |||||
| :param recursive: If ``True``, returns all :class:`~.Buffer` within this | |||||
| module, else only returns :class:`~.Buffer` that are direct attributes | |||||
| Buffer is defined to be :class:`~.Tensor` excluding :class:`~.Parameter`. | |||||
| :param recursive: If ``True``, returns all buffers within this | |||||
| module, else only returns buffers that are direct attributes | |||||
| of this module. | of this module. | ||||
| """ | """ | ||||
| yield from self._flatten( | yield from self._flatten( | ||||
| @@ -228,13 +225,15 @@ class Module(metaclass=ABCMeta): | |||||
| def named_buffers( | def named_buffers( | ||||
| self, prefix: Optional[str] = None, recursive: bool = True, **kwargs | self, prefix: Optional[str] = None, recursive: bool = True, **kwargs | ||||
| ) -> Iterable[Tuple[str, Buffer]]: | |||||
| """Returns an iterable for key :class:`~.Buffer` pairs of the module, where | |||||
| ``key`` is the dotted path from this module to the :class:`~.Buffer` . | |||||
| ) -> Iterable[Tuple[str, Tensor]]: | |||||
| """Returns an iterable for key buffer pairs of the module, where | |||||
| ``key`` is the dotted path from this module to the buffer. | |||||
| Buffer is defined to be :class:`~.Tensor` excluding :class:`~.Parameter`. | |||||
| :param prefix: The prefix prepended to the keys. | :param prefix: The prefix prepended to the keys. | ||||
| :param recursive: If ``True``, returns all :class:`~.Buffer` within this | |||||
| module, else only returns :class:`~.Buffer` that are direct attributes | |||||
| :param recursive: If ``True``, returns all buffers within this | |||||
| module, else only returns buffers that are direct attributes | |||||
| of this module. | of this module. | ||||
| """ | """ | ||||
| yield from self._flatten( | yield from self._flatten( | ||||
| @@ -297,6 +296,7 @@ class Module(metaclass=ABCMeta): | |||||
| for it in self.modules(): | for it in self.modules(): | ||||
| fn(it) | fn(it) | ||||
| @deprecated(version="1.0") | |||||
| def zero_grad(self) -> None: | def zero_grad(self) -> None: | ||||
| """Set all parameters' grads to zero | """Set all parameters' grads to zero | ||||
| """ | """ | ||||
| @@ -505,7 +505,7 @@ class Module(metaclass=ABCMeta): | |||||
| # scale/zero_points maybe invalid, use pretrained dtype instead. | # scale/zero_points maybe invalid, use pretrained dtype instead. | ||||
| if is_quantize(to_be_load.dtype) and is_quantize(var.dtype): | if is_quantize(to_be_load.dtype) and is_quantize(var.dtype): | ||||
| var = var.astype(to_be_load.dtype) | var = var.astype(to_be_load.dtype) | ||||
| var.set_value(to_be_load) | |||||
| var._reset(to_be_load) | |||||
| loaded.append(k) | loaded.append(k) | ||||
| return set(loaded), set(skipped) | return set(loaded), set(skipped) | ||||
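Taken together, the Module hunks change introspection as follows: `parameters()` and `named_parameters()` drop the `requires_grad` filter (passing it is still accepted but only emits a warning), and a buffer is now simply any `Tensor` attribute that is not a `Parameter`. A sketch:

```python
import megengine as mge
import megengine.module as M

bn = M.BatchNorm2d(4)

params = list(bn.parameters())   # weight, bias              -> mge.Parameter
bufs = list(bn.buffers())        # running_mean, running_var -> plain mge.Tensor

assert all(isinstance(p, mge.Parameter) for p in params)
assert all(isinstance(b, mge.Tensor) and not isinstance(b, mge.Parameter) for b in bufs)

_ = list(bn.parameters(requires_grad=True))   # warns: "passing requires_grad has no effect currently"
```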
| @@ -1,156 +0,0 @@ | |||||
| # -*- coding: utf-8 -*- | |||||
| # MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| # | |||||
| # Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, | |||||
| # software distributed under the License is distributed on an | |||||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| import collections | |||||
| from typing import Callable, Iterable, Optional, Tuple | |||||
| import numpy as np | |||||
| from ..tensor_nn import Parameter, Tensor | |||||
| from .module import Module | |||||
| class ParamPack(Module): | |||||
| r"""Pack module's parameters by gathering their memory to continuous address. | |||||
| Using (device, dtype, requires_grad) as key, for example ('gpu0', float32, True), | |||||
| parameters with same key will be packed togather. | |||||
| It helps a lot for multimachine training by speeding up allreduce gradients. | |||||
| :param model: the module you want to pack parameters. | |||||
| :param nr_ignore_first: how many parameters will be unpacked at first. | |||||
| :param max_size_per_group: upper bound of packed parameters' size in MB. | |||||
| :param max_nr_params_per_group: upper bound of the number of parameters of each group. | |||||
| """ | |||||
| def __init__( | |||||
| self, | |||||
| model: Module, | |||||
| nr_ignore_first: int = 8, | |||||
| max_size_per_group: int = 10, | |||||
| max_nr_params_per_group: int = 100, | |||||
| group_func: Callable = lambda name, param: 0, | |||||
| ): | |||||
| super().__init__() | |||||
| self._model = model | |||||
| self._nr_ignore_first = nr_ignore_first | |||||
| self._max_size_per_group = max_size_per_group | |||||
| self._max_nr_params_per_group = max_nr_params_per_group | |||||
| self._group_func = group_func | |||||
| self._grouped_params = [] | |||||
| self._packed_params = [] | |||||
| params = model.named_parameters() | |||||
| self._pack_params(params) | |||||
| def parameters(self, requires_grad: Optional[bool] = None) -> Iterable[Parameter]: | |||||
| for param in self._packed_params: | |||||
| if requires_grad is None or param.requires_grad == requires_grad: | |||||
| yield param | |||||
| def named_parameters( | |||||
| self, requires_grad: Optional[bool] = None | |||||
| ) -> Iterable[Tuple[str, Parameter]]: | |||||
| for idx, param in enumerate(self._packed_params): | |||||
| if requires_grad is None or param.requires_grad == requires_grad: | |||||
| yield "packed_param_" + str(idx), param | |||||
| def _pack_params(self, params: Iterable[Tuple[str, Parameter]]): | |||||
| groups = collections.defaultdict(list) | |||||
| ignored = 0 | |||||
| param_id = 0 | |||||
| for name, param in params: | |||||
| if self._nr_ignore_first > ignored: | |||||
| ignored += 1 | |||||
| self._grouped_params.append([{"shape": param.shape, "id": param_id}]) | |||||
| param.pack_group_key = self._group_func(name, param) | |||||
| self._packed_params.append(param) | |||||
| else: | |||||
| key = ( | |||||
| param.dtype, | |||||
| param.device, | |||||
| param.requires_grad, | |||||
| self._group_func(name, param), | |||||
| ) | |||||
| groups[key].append({"tensor": param, "id": param_id}) | |||||
| param_id += 1 | |||||
| for (dtype, device, requires_grad, group_key) in groups.keys(): | |||||
| dtype_sz = np.dtype(dtype).itemsize | |||||
| align = device.mem_align | |||||
| if align < dtype_sz: | |||||
| align = 1 | |||||
| else: | |||||
| assert align % dtype_sz == 0 | |||||
| align //= dtype_sz | |||||
| group = groups[(dtype, device, requires_grad, group_key)] | |||||
| while group: | |||||
| aligned_pos = [] | |||||
| offset = 0 | |||||
| params = [] | |||||
| idx = 0 | |||||
| while idx < len(group): | |||||
| param = group[idx] | |||||
| assert param["tensor"].device == device | |||||
| padding = (align - (offset & (align - 1))) & (align - 1) | |||||
| offset += padding | |||||
| aligned_pos.append(offset) | |||||
| params.append(param) | |||||
| offset += int(np.prod(param["tensor"].shape)) | |||||
| idx += 1 | |||||
| if ( | |||||
| offset * dtype_sz >= self._max_size_per_group * 1024 * 1024 | |||||
| or idx >= self._max_nr_params_per_group | |||||
| ): | |||||
| break | |||||
| group = group[idx:] | |||||
| if idx == 1: | |||||
| # ignore param packs with only one item | |||||
| params[0]["tensor"].pack_group_key = group_key | |||||
| self._packed_params.append(params[0]["tensor"]) | |||||
| self._grouped_params.append( | |||||
| [{"shape": params[0]["tensor"].shape, "id": params[0]["id"]}] | |||||
| ) | |||||
| continue | |||||
| packed_value = np.zeros((offset,), dtype=dtype) | |||||
| for param, pos in zip(params, aligned_pos): | |||||
| val = param["tensor"].numpy() | |||||
| packed_value[pos : pos + val.size] = val.flatten() | |||||
| new_param = Parameter( | |||||
| value=packed_value, | |||||
| device=device, | |||||
| dtype=dtype, | |||||
| requires_grad=requires_grad, | |||||
| ) | |||||
| new_param.pack_group_key = group_key | |||||
| self._packed_params.append(new_param) | |||||
| self._grouped_params.append( | |||||
| [{"shape": i["tensor"].shape, "id": i["id"]} for i in params] | |||||
| ) | |||||
| def forward(self, *args, **kwargs): | |||||
| replace_param = dict() | |||||
| for i in range(len(self._packed_params)): | |||||
| packed_param = self._packed_params[i] | |||||
| grouped_params = self._grouped_params[i] | |||||
| if len(grouped_params) == 1: | |||||
| continue | |||||
| split = param_pack_split( | |||||
| packed_param._symvar, [i["shape"] for i in grouped_params] | |||||
| ) | |||||
| split = [ | |||||
| Parameter(Tensor(i, requires_grad=packed_param.requires_grad)) | |||||
| for i in split | |||||
| ] | |||||
| for j in range(len(split)): | |||||
| replace_param[grouped_params[j]["id"]] = split[j] | |||||
| self._model.replace_param(replace_param, 0) | |||||
| return self._model.forward(*args, **kwargs) | |||||
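`parampack.py` is deleted outright. Its main use case, speeding up gradient allreduce in multi-machine training, is now served by attaching an allreduce callback to the GradManager, as the updated distributed tests near the end of this diff do. Roughly (for a `net` already built as in the first sketch, inside a worker that has joined the process group):

```python
import megengine.autodiff as ad
import megengine.distributed as dist

gm = ad.GradManager().attach(
    net.parameters(),
    callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)],
)
```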
| @@ -12,7 +12,7 @@ import numpy as np | |||||
| from ... import module as Float | from ... import module as Float | ||||
| from ...core.tensor import dtype | from ...core.tensor import dtype | ||||
| from ...functional import conv_bias_activation | from ...functional import conv_bias_activation | ||||
| from ...tensor_nn import Parameter | |||||
| from ...tensor import Parameter | |||||
| from ..qat import conv as QAT | from ..qat import conv as QAT | ||||
| from .module import QuantizedModule | from .module import QuantizedModule | ||||
| @@ -5,7 +5,7 @@ | |||||
| # Unless required by applicable law or agreed to in writing, | # Unless required by applicable law or agreed to in writing, | ||||
| # software distributed under the License is distributed on an | # software distributed under the License is distributed on an | ||||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| from ...tensor_nn import Parameter | |||||
| from ...tensor import Parameter | |||||
| from ..qat import conv_bn as QAT | from ..qat import conv_bn as QAT | ||||
| from .conv import Conv2d | from .conv import Conv2d | ||||
| @@ -9,7 +9,7 @@ import numpy as np | |||||
| from ... import functional as F | from ... import functional as F | ||||
| from ...core.tensor import dtype | from ...core.tensor import dtype | ||||
| from ...tensor_nn import Parameter | |||||
| from ...tensor import Parameter | |||||
| from ..qat import linear as QAT | from ..qat import linear as QAT | ||||
| from .module import QuantizedModule | from .module import QuantizedModule | ||||
| @@ -11,7 +11,7 @@ from typing import Iterable, Union | |||||
| import numpy as np | import numpy as np | ||||
| from ..functional import sqrt | from ..functional import sqrt | ||||
| from ..tensor_nn import Parameter | |||||
| from ..tensor import Parameter | |||||
| from .optimizer import Optimizer | from .optimizer import Optimizer | ||||
| @@ -63,7 +63,7 @@ class Adadelta(Optimizer): | |||||
| for param in param_group["params"]: | for param in param_group["params"]: | ||||
| if not param.requires_grad or "grad" not in param.__dict__: | |||||
| if param.grad is None: | |||||
| continue | continue | ||||
| states = self._state[param] | states = self._state[param] | ||||
| @@ -11,7 +11,7 @@ from typing import Iterable, Union | |||||
| import numpy as np | import numpy as np | ||||
| from ..functional import sqrt | from ..functional import sqrt | ||||
| from ..tensor_nn import Parameter | |||||
| from ..tensor import Parameter | |||||
| from .optimizer import Optimizer | from .optimizer import Optimizer | ||||
| @@ -62,7 +62,7 @@ class Adagrad(Optimizer): | |||||
| for param in param_group["params"]: | for param in param_group["params"]: | ||||
| if not param.requires_grad or "grad" not in param.__dict__: | |||||
| if param.grad is None: | |||||
| continue | continue | ||||
| states = self._state[param] | states = self._state[param] | ||||
| @@ -8,7 +8,7 @@ | |||||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| from typing import Iterable, Tuple, Union | from typing import Iterable, Tuple, Union | ||||
| from ..tensor_nn import Parameter | |||||
| from ..tensor import Parameter | |||||
| from .optimizer import Optimizer | from .optimizer import Optimizer | ||||
| @@ -59,7 +59,7 @@ class Adam(Optimizer): | |||||
| for param in param_group["params"]: | for param in param_group["params"]: | ||||
| if not param.requires_grad or "grad" not in param.__dict__: | |||||
| if param.grad is None: | |||||
| continue | continue | ||||
| grad = param.grad | grad = param.grad | ||||
| @@ -7,7 +7,7 @@ | |||||
| # software distributed under the License is distributed on an | # software distributed under the License is distributed on an | ||||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| from abc import ABCMeta, abstractmethod | from abc import ABCMeta, abstractmethod | ||||
| from collections import Iterable | |||||
| from collections.abc import Iterable | |||||
| from contextlib import contextmanager | from contextlib import contextmanager | ||||
| from typing import Dict | from typing import Dict | ||||
| from typing import Iterable as Iter | from typing import Iterable as Iter | ||||
| @@ -15,8 +15,7 @@ from typing import Union | |||||
| import numpy as np | import numpy as np | ||||
| from ..tensor import Tensor, TensorDict | |||||
| from ..tensor_nn import Buffer, Parameter | |||||
| from ..tensor import Parameter, Tensor | |||||
| class _RequiredParameter: | class _RequiredParameter: | ||||
| @@ -37,7 +36,7 @@ class Optimizer(metaclass=ABCMeta): | |||||
| def __init__( # pylint: disable=too-many-branches | def __init__( # pylint: disable=too-many-branches | ||||
| self, params: Union[Iter[Parameter], dict], defaults: dict, | self, params: Union[Iter[Parameter], dict], defaults: dict, | ||||
| ): | ): | ||||
| self._state = TensorDict() | |||||
| self._state = dict() | |||||
| self._defaults = defaults | self._defaults = defaults | ||||
| if isinstance(params, (Parameter, dict)): | if isinstance(params, (Parameter, dict)): | ||||
| @@ -93,10 +92,6 @@ class Optimizer(metaclass=ABCMeta): | |||||
| "optimizer can only optimize Parameters, but one of the params is " | "optimizer can only optimize Parameters, but one of the params is " | ||||
| + type(param) | + type(param) | ||||
| ) | ) | ||||
| if not param.requires_grad: | |||||
| raise ValueError( | |||||
| "optimizer can only optimize Parameters with requires_grad=True" | |||||
| ) | |||||
| for name, default in self._defaults.items(): | for name, default in self._defaults.items(): | ||||
| if default is required and name not in param_group: | if default is required and name not in param_group: | ||||
| @@ -122,7 +117,7 @@ class Optimizer(metaclass=ABCMeta): | |||||
| initializer = np.zeros(param.shape, dtype=np.float32) | initializer = np.zeros(param.shape, dtype=np.float32) | ||||
| state_dict = self._state.setdefault(param, {}) | state_dict = self._state.setdefault(param, {}) | ||||
| assert state_name not in state_dict | assert state_name not in state_dict | ||||
| state = Buffer(initializer) | |||||
| state = Tensor(initializer) | |||||
| state_dict[state_name] = state | state_dict[state_name] = state | ||||
| @abstractmethod | @abstractmethod | ||||
| @@ -140,7 +135,7 @@ class Optimizer(metaclass=ABCMeta): | |||||
| params.append(param) | params.append(param) | ||||
| return params | return params | ||||
| def step(self, clear_grad=False): | |||||
| def step(self): | |||||
| r"""Performs a single optimization step. | r"""Performs a single optimization step. | ||||
| """ | """ | ||||
| @@ -152,8 +147,7 @@ class Optimizer(metaclass=ABCMeta): | |||||
| "Please use a list instead." | "Please use a list instead." | ||||
| ) | ) | ||||
| self._updates(group) | self._updates(group) | ||||
| if clear_grad: | |||||
| self.clear_grad() | |||||
| return self | |||||
| def clear_grad(self): | def clear_grad(self): | ||||
| r"""Clear the grad buffer. | r"""Clear the grad buffer. | ||||
| @@ -161,8 +155,7 @@ class Optimizer(metaclass=ABCMeta): | |||||
| """ | """ | ||||
| for param_group in self.param_groups: | for param_group in self.param_groups: | ||||
| for param in param_group["params"]: | for param in param_group["params"]: | ||||
| if getattr(param, "grad", None) is not None: | |||||
| param.grad = None | |||||
| param.grad = None | |||||
| def state_dict(self) -> Dict: | def state_dict(self) -> Dict: | ||||
| r"""Export the optimizer state. | r"""Export the optimizer state. | ||||
| @@ -171,7 +164,7 @@ class Optimizer(metaclass=ABCMeta): | |||||
| """ | """ | ||||
| param_groups = [] | param_groups = [] | ||||
| state = dict() | state = dict() | ||||
| param2id = TensorDict() | |||||
| param2id = dict() | |||||
| cur_id = 0 | cur_id = 0 | ||||
| for group in self.param_groups: | for group in self.param_groups: | ||||
| @@ -213,8 +206,9 @@ class Optimizer(metaclass=ABCMeta): | |||||
| p = param_new | p = param_new | ||||
| self._state[p] = state["state"][param_saved].copy() | self._state[p] = state["state"][param_saved].copy() | ||||
| for k, v in self._state[p].items(): | for k, v in self._state[p].items(): | ||||
| if isinstance(v, Buffer): | |||||
| self._state[p][k] = Buffer(v.numpy()) | |||||
| if isinstance(v, Tensor): | |||||
| # TODO: maybe a more efficient way? | |||||
| self._state[p][k] = Tensor(v.numpy()) | |||||
| if set(group_new.keys()) != set(group_saved.keys()): | if set(group_new.keys()) != set(group_saved.keys()): | ||||
| raise ValueError( | raise ValueError( | ||||
| @@ -8,7 +8,7 @@ | |||||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| from typing import Iterable, Union | from typing import Iterable, Union | ||||
| from ..tensor_nn import Parameter | |||||
| from ..tensor import Parameter | |||||
| from .optimizer import Optimizer | from .optimizer import Optimizer | ||||
| @@ -52,7 +52,7 @@ class SGD(Optimizer): | |||||
| momentum = param_group["momentum"] | momentum = param_group["momentum"] | ||||
| for param in param_group["params"]: | for param in param_group["params"]: | ||||
| if not param.requires_grad or "grad" not in param.__dict__: | |||||
| if param.grad is None: | |||||
| continue | continue | ||||
| grad = param.grad | grad = param.grad | ||||
| @@ -14,8 +14,7 @@ from .. import functional as F | |||||
| from ..core.tensor.dtype import _metadata_dict, get_quantized_dtype | from ..core.tensor.dtype import _metadata_dict, get_quantized_dtype | ||||
| from ..core.tensor.function import Function | from ..core.tensor.function import Function | ||||
| from ..module import Module | from ..module import Module | ||||
| from ..tensor import Tensor | |||||
| from ..tensor_nn import Parameter | |||||
| from ..tensor import Parameter, Tensor | |||||
| from .utils import QuantMode, fake_quant_tensor, get_qparam_dict | from .utils import QuantMode, fake_quant_tensor, get_qparam_dict | ||||
| @@ -13,7 +13,7 @@ import numpy as np | |||||
| from .. import functional as F | from .. import functional as F | ||||
| from ..core.tensor.dtype import _metadata_dict, get_quantized_dtype | from ..core.tensor.dtype import _metadata_dict, get_quantized_dtype | ||||
| from ..module import Module | from ..module import Module | ||||
| from ..tensor_nn import Buffer | |||||
| from ..tensor import Tensor | |||||
| from .utils import QuantMode, Round, get_qparam_dict | from .utils import QuantMode, Round, get_qparam_dict | ||||
| @@ -82,8 +82,8 @@ class MinMaxObserver(Observer): | |||||
| ): | ): | ||||
| super().__init__(dtype, narrow_range) | super().__init__(dtype, narrow_range) | ||||
| self.mode = mode | self.mode = mode | ||||
| self.min_val = Buffer(np.finfo(np.float32).max, dtype=np.float32) | |||||
| self.max_val = Buffer(np.finfo(np.float32).min, dtype=np.float32) | |||||
| self.min_val = Tensor(np.finfo(np.float32).max, dtype=np.float32) | |||||
| self.max_val = Tensor(np.finfo(np.float32).min, dtype=np.float32) | |||||
| self.scale_limit = eps | self.scale_limit = eps | ||||
| def _calculate_qparams(self, inp_min_val, inp_max_val): | def _calculate_qparams(self, inp_min_val, inp_max_val): | ||||
| @@ -118,8 +118,8 @@ class MinMaxObserver(Observer): | |||||
| # stop gradient | # stop gradient | ||||
| x = x_orig.detach() | x = x_orig.detach() | ||||
| # find max and min | # find max and min | ||||
| self.min_val.set_value(F.minimum(self.min_val, x.min())) | |||||
| self.max_val.set_value(F.maximum(self.max_val, x.max())) | |||||
| self.min_val._reset(F.minimum(self.min_val, x.min())) | |||||
| self.max_val._reset(F.maximum(self.max_val, x.max())) | |||||
| return x_orig | return x_orig | ||||
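The observers keep their running statistics as ordinary Tensors and overwrite them with `_reset()`, the in-place replacement for which `set_value()` is now a deprecated alias. A rough illustration of the semantics, assuming `_reset` accepts another Tensor as these hunks use it:

```python
import numpy as np
import megengine as mge

stat = mge.Tensor(np.zeros((4,), dtype="float32"))
stat._reset(mge.Tensor(np.ones((4,), dtype="float32")))   # same Python object, new contents
print(stat.numpy())                                       # [1. 1. 1. 1.]
```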
| @@ -133,22 +133,22 @@ class ExponentialMovingAverageObserver(MinMaxObserver): | |||||
| narrow_range: bool = False, | narrow_range: bool = False, | ||||
| ): | ): | ||||
| super().__init__(mode, eps, dtype, narrow_range) | super().__init__(mode, eps, dtype, narrow_range) | ||||
| self.momentum = Buffer(momentum) | |||||
| self.runtime_momentum = Buffer(0.0) | |||||
| self.momentum = Tensor(momentum) | |||||
| self.runtime_momentum = Tensor(0.0) | |||||
| def set_momentum(self, momentum): | def set_momentum(self, momentum): | ||||
| self.momentum.set_value(momentum) | |||||
| self.momentum._reset(momentum) | |||||
| def forward(self, x_orig): | def forward(self, x_orig): | ||||
| if self.enabled: | if self.enabled: | ||||
| # stop gradient | # stop gradient | ||||
| x = x_orig.detach() | x = x_orig.detach() | ||||
| # Exponential Moving Average | # Exponential Moving Average | ||||
| self.min_val.set_value( | |||||
| self.min_val._reset( | |||||
| self.min_val * self.runtime_momentum | self.min_val * self.runtime_momentum | ||||
| + (1 - self.runtime_momentum) * x.min() | + (1 - self.runtime_momentum) * x.min() | ||||
| ) | ) | ||||
| self.max_val.set_value( | |||||
| self.max_val._reset( | |||||
| self.max_val * self.runtime_momentum | self.max_val * self.runtime_momentum | ||||
| + (1 - self.runtime_momentum) * x.max() | + (1 - self.runtime_momentum) * x.max() | ||||
| ) | ) | ||||
| @@ -171,7 +171,7 @@ class HistogramObserver(MinMaxObserver): | |||||
| self.bins = bins | self.bins = bins | ||||
| self.upsample_rate = upsample_rate | self.upsample_rate = upsample_rate | ||||
| self.dst_nbins = _metadata_dict[dtype].qmax - _metadata_dict[dtype].qmin + 1 | self.dst_nbins = _metadata_dict[dtype].qmax - _metadata_dict[dtype].qmin + 1 | ||||
| self.histogram = Buffer([-1] + [0.0] * (bins - 1)) | |||||
| self.histogram = Tensor([-1] + [0.0] * (bins - 1)) | |||||
| def _non_linear_param_search(self): | def _non_linear_param_search(self): | ||||
| r"""Non-linear parameter search. | r"""Non-linear parameter search. | ||||
| @@ -395,9 +395,9 @@ class HistogramObserver(MinMaxObserver): | |||||
| self.bins, | self.bins, | ||||
| ) | ) | ||||
| self.histogram.set_value(new_histogram) | |||||
| self.min_val.set_value(new_min) | |||||
| self.max_val.set_value(new_max) | |||||
| self.histogram._reset(new_histogram) | |||||
| self.min_val._reset(new_min) | |||||
| self.max_val._reset(new_max) | |||||
| def forward(self, x_orig): | def forward(self, x_orig): | ||||
| self.sideeffect_forward(x_orig) | self.sideeffect_forward(x_orig) | ||||
| @@ -14,10 +14,11 @@ from .core import Tensor as _Tensor | |||||
| from .core.ops.builtin import Copy | from .core.ops.builtin import Copy | ||||
| from .core.tensor.core import apply | from .core.tensor.core import apply | ||||
| from .device import get_default_device | from .device import get_default_device | ||||
| from .utils.deprecation import deprecated | |||||
| class Tensor(_Tensor): | class Tensor(_Tensor): | ||||
| requires_grad = False | |||||
| grad = None | |||||
| dmap_callback = None | dmap_callback = None | ||||
| def __init__(self, data, dtype=None, device=None): | def __init__(self, data, dtype=None, device=None): | ||||
| @@ -26,15 +27,32 @@ class Tensor(_Tensor): | |||||
| self.q_dict = {"mode": None, "scale": None, "zero_point": None} | self.q_dict = {"mode": None, "scale": None, "zero_point": None} | ||||
| super().__init__(data, dtype=dtype, device=device) | super().__init__(data, dtype=dtype, device=device) | ||||
| @deprecated(version="1.0", reason="no need to reuse an existing tensor since 1.0") | |||||
| def set_value(self, value): | def set_value(self, value): | ||||
| self._reset(value) | self._reset(value) | ||||
| @deprecated(version="1.0", reason="use *= 0 instead") | |||||
| def reset_zero(self): | def reset_zero(self): | ||||
| self *= 0 | self *= 0 | ||||
| def to(self, cn): | def to(self, cn): | ||||
| return apply(Copy(comp_node=cn), self)[0] | return apply(Copy(comp_node=cn), self)[0] | ||||
| @property | |||||
| def requires_grad(self): | |||||
| raise AttributeError("requires_grad is reserved for future use") | |||||
| @requires_grad.setter | |||||
| def requires_grad(self, value): | |||||
| raise AttributeError("requires_grad is reserved for future use") | |||||
| @requires_grad.deleter | |||||
| def requires_grad(self): | |||||
| raise AttributeError("requires_grad is reserved for future use") | |||||
| def __hash__(self): | |||||
| return id(self) | |||||
| def __getstate__(self): | def __getstate__(self): | ||||
| r""" __getstate__ will be called for pickle serialization or deep copy | r""" __getstate__ will be called for pickle serialization or deep copy | ||||
| """ | """ | ||||
| @@ -73,53 +91,6 @@ class Tensor(_Tensor): | |||||
| tensor = Tensor | tensor = Tensor | ||||
| class Dict(collections.MutableMapping): | |||||
| def __init__(self, *args, key=None, **kwargs): | |||||
| self.data = {} | |||||
| if key: | |||||
| self.keyfn = key | |||||
| for i in args: | |||||
| self.update(i) | |||||
| self.update(**kwargs) | |||||
| @staticmethod | |||||
| def keyfn(key): # pylint: disable=method-hidden | |||||
| return key | |||||
| def __getitem__(self, key): | |||||
| _, v = self.data[self.keyfn(key)] | |||||
| return v | |||||
| def __setitem__(self, key, value): | |||||
| self.data[self.keyfn(key)] = key, value | |||||
| def __delitem__(self, key): | |||||
| del self.data[self.keyfn(key)] | |||||
| def __iter__(self): | |||||
| for _, (k, _) in self.data.items(): | |||||
| yield k | |||||
| def __len__(self): | |||||
| return len(self.data) | |||||
| class TensorDict(Dict): # pylint: disable=too-many-ancestors | |||||
| class keyfn: | |||||
| def __new__(cls, x: Tensor): | |||||
| if not isinstance(x, Tensor): | |||||
| return x | |||||
| return super().__new__(cls) | |||||
| def __init__(self, x: Tensor): | |||||
| self._data = x # do not save id directly to make pickle work | |||||
| def __hash__(self): | |||||
| return id(self._data) | |||||
| def __eq__(self, other): | |||||
| # pylint: disable=undefined-variable | |||||
| return isinstance(other, __class__) and id(self._data) == id(other._data) | |||||
| def __init__(self, *args): | |||||
| super().__init__(*args) | |||||
| class Parameter(Tensor): | |||||
| r"""A kind of Tensor that is to be considered a module parameter. | |||||
| """ | |||||
| @@ -1,20 +0,0 @@ | |||||
| # -*- coding: utf-8 -*- | |||||
| # MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| # | |||||
| # Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, | |||||
| # software distributed under the License is distributed on an | |||||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| from . import Tensor, tensor | |||||
| class Buffer(Tensor): | |||||
| r"""A kind of Tensor with ``requires_grad=False``. | |||||
| """ | |||||
| class Parameter(Tensor): | |||||
| r"""A kind of Tensor that is to be considered a module parameter. | |||||
| """ | |||||
| requires_grad = True | |||||
| @@ -0,0 +1 @@ | |||||
| from deprecated.sphinx import deprecated | |||||
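That one-line module re-exports the decorator from the third-party Deprecated package (added to the requirements below); it is what `@deprecated(version="1.0")` on `Module.zero_grad` and the `Tensor.set_value`/`reset_zero` wrappers resolve to. Roughly, it behaves as follows (the reason text is illustrative):

```python
from deprecated.sphinx import deprecated

@deprecated(version="1.0", reason="kept only for backward compatibility")
def legacy_helper():
    return 42

legacy_helper()   # still works, but emits a DeprecationWarning and the docstring
                  # gains a ".. deprecated:: 1.0" directive for the Sphinx docs
```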
| @@ -15,7 +15,7 @@ def get_ndtuple(value, *, n, allow_zero=True): | |||||
| :type allow_zero: bool | :type allow_zero: bool | ||||
| :param allow_zero: whether to allow zero tuple value""" | :param allow_zero: whether to allow zero tuple value""" | ||||
| if not isinstance(value, collections.Iterable): | |||||
| if not isinstance(value, collections.abc.Iterable): | |||||
| value = int(value) | value = int(value) | ||||
| value = tuple([value for i in range(n)]) | value = tuple([value for i in range(n)]) | ||||
| else: | else: | ||||
| @@ -5,3 +5,4 @@ requests | |||||
| tabulate | tabulate | ||||
| tqdm | tqdm | ||||
| redispy | redispy | ||||
| deprecated | |||||
| @@ -38,7 +38,7 @@ class Simple2(Module): | |||||
| def test_advance_indexing(): | def test_advance_indexing(): | ||||
| net = Simple() | net = Simple() | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| optim = optimizer.SGD(net.parameters(), lr=1.0) | optim = optimizer.SGD(net.parameters(), lr=1.0) | ||||
| optim.clear_grad() | optim.clear_grad() | ||||
| @@ -48,7 +48,7 @@ def test_advance_indexing(): | |||||
| data = tensor(raw_data) | data = tensor(raw_data) | ||||
| mask = tensor(raw_mask) | mask = tensor(raw_mask) | ||||
| answer = 1.0 - raw_data[raw_mask].sum() | answer = 1.0 - raw_data[raw_mask].sum() | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = net(data, mask).sum() | loss = net(data, mask).sum() | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| optim.step() | optim.step() | ||||
| @@ -58,7 +58,7 @@ def test_advance_indexing(): | |||||
| def test_advance_indexing_with_subtensor(): | def test_advance_indexing_with_subtensor(): | ||||
| net = Simple2() | net = Simple2() | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| optim = optimizer.SGD(net.parameters(), lr=1.0) | optim = optimizer.SGD(net.parameters(), lr=1.0) | ||||
| optim.clear_grad() | optim.clear_grad() | ||||
| @@ -66,7 +66,7 @@ def test_advance_indexing_with_subtensor(): | |||||
| raw_data = np.arange(576).reshape(dshape).astype(np.float32) | raw_data = np.arange(576).reshape(dshape).astype(np.float32) | ||||
| data = tensor(raw_data) | data = tensor(raw_data) | ||||
| answer = 1.0 - raw_data[1, ..., :, 0:4:2, 0:2].sum() | answer = 1.0 - raw_data[1, ..., :, 0:4:2, 0:2].sum() | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = net(data).sum() | loss = net(data).sum() | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| optim.step() | optim.step() | ||||
| @@ -28,13 +28,13 @@ class Simple(Module): | |||||
| def test_ai(): | def test_ai(): | ||||
| net = Simple() | net = Simple() | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| optim = optimizer.SGD(net.parameters(), lr=1.0) | optim = optimizer.SGD(net.parameters(), lr=1.0) | ||||
| optim.clear_grad() | optim.clear_grad() | ||||
| dshape = (10, 10) | dshape = (10, 10) | ||||
| data = tensor(np.ones(dshape).astype(np.float32)) | data = tensor(np.ones(dshape).astype(np.float32)) | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = net(data).sum() | loss = net(data).sum() | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| optim.step() | optim.step() | ||||
| @@ -25,12 +25,12 @@ def test_frozen_bn(): | |||||
| saved_wt = m.weight.numpy() | saved_wt = m.weight.numpy() | ||||
| saved_bias = m.bias.numpy() | saved_bias = m.bias.numpy() | ||||
| gm = ad.GradManager().register(m.parameters()) | |||||
| gm = ad.GradManager().attach(m.parameters()) | |||||
| optim = optimizer.SGD(m.parameters(), lr=1.0) | optim = optimizer.SGD(m.parameters(), lr=1.0) | ||||
| optim.clear_grad() | optim.clear_grad() | ||||
| data = np.random.random((6, nchannel, 2, 2)).astype("float32") | data = np.random.random((6, nchannel, 2, 2)).astype("float32") | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = m(data).mean() | loss = m(data).mean() | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| optim.step() | optim.step() | ||||
| @@ -46,12 +46,12 @@ def test_bn_no_track_stat(): | |||||
| nchannel = 3 | nchannel = 3 | ||||
| m = BatchNorm2d(nchannel, track_running_stats=False) | m = BatchNorm2d(nchannel, track_running_stats=False) | ||||
| gm = ad.GradManager().register(m.parameters()) | |||||
| gm = ad.GradManager().attach(m.parameters()) | |||||
| optim = optimizer.SGD(m.parameters(), lr=1.0) | optim = optimizer.SGD(m.parameters(), lr=1.0) | ||||
| optim.clear_grad() | optim.clear_grad() | ||||
| data = np.random.random((6, nchannel, 2, 2)).astype("float32") | data = np.random.random((6, nchannel, 2, 2)).astype("float32") | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = m(data).sum() | loss = m(data).sum() | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| optim.step() | optim.step() | ||||
| @@ -68,12 +68,12 @@ def test_bn_no_track_stat2(): | |||||
| saved_mean = m.running_mean.numpy() | saved_mean = m.running_mean.numpy() | ||||
| assert saved_mean is not None | assert saved_mean is not None | ||||
| gm = ad.GradManager().register(m.parameters()) | |||||
| gm = ad.GradManager().attach(m.parameters()) | |||||
| optim = optimizer.SGD(m.parameters(), lr=1.0) | optim = optimizer.SGD(m.parameters(), lr=1.0) | ||||
| optim.clear_grad() | optim.clear_grad() | ||||
| data = np.random.random((6, nchannel, 2, 2)).astype("float32") | data = np.random.random((6, nchannel, 2, 2)).astype("float32") | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = m(data).sum() | loss = m(data).sum() | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| optim.step() | optim.step() | ||||
| @@ -74,13 +74,11 @@ class XORNet(Module): | |||||
| def test_training_converge(): | def test_training_converge(): | ||||
| net = XORNet() | net = XORNet() | ||||
| opt = SGD( | |||||
| net.parameters(requires_grad=True), lr=0.01, momentum=0.9, weight_decay=5e-4 | |||||
| ) | |||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| opt = SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| def train(data, label): | def train(data, label): | ||||
| with gm.record(): | |||||
| with gm: | |||||
| pred = net(data) | pred = net(data) | ||||
| loss = F.cross_entropy_with_softmax(pred, label) | loss = F.cross_entropy_with_softmax(pred, label) | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| @@ -91,7 +91,7 @@ class MnistNet(Module): | |||||
| def train(data, label, net, opt, gm): | def train(data, label, net, opt, gm): | ||||
| with gm.record(): | |||||
| with gm: | |||||
| pred = net(data) | pred = net(data) | ||||
| loss = F.cross_entropy_with_softmax(pred, label) | loss = F.cross_entropy_with_softmax(pred, label) | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| @@ -117,7 +117,7 @@ def update_model(model_path): | |||||
| net.load_state_dict(checkpoint["net_init"]) | net.load_state_dict(checkpoint["net_init"]) | ||||
| lr = checkpoint["sgd_lr"] | lr = checkpoint["sgd_lr"] | ||||
| opt = SGD(net.parameters(), lr=lr) | opt = SGD(net.parameters(), lr=lr) | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| data = Tensor(checkpoint["data"], dtype=np.float32) | data = Tensor(checkpoint["data"], dtype=np.float32) | ||||
| label = Tensor(checkpoint["label"], dtype=np.int32) | label = Tensor(checkpoint["label"], dtype=np.int32) | ||||
| @@ -152,7 +152,7 @@ def run_train( | |||||
| net.load_state_dict(checkpoint["net_init"]) | net.load_state_dict(checkpoint["net_init"]) | ||||
| lr = checkpoint["sgd_lr"] | lr = checkpoint["sgd_lr"] | ||||
| opt = SGD(net.parameters(), lr=lr) | opt = SGD(net.parameters(), lr=lr) | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| data = Tensor(checkpoint["data"], dtype=np.float32) | data = Tensor(checkpoint["data"], dtype=np.float32) | ||||
| label = Tensor(checkpoint["label"], dtype=np.int32) | label = Tensor(checkpoint["label"], dtype=np.int32) | ||||
| @@ -32,11 +32,11 @@ def test_detach(): | |||||
| optim = optimizer.SGD(net.parameters(), lr=1.0) | optim = optimizer.SGD(net.parameters(), lr=1.0) | ||||
| optim.clear_grad() | optim.clear_grad() | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| dshape = (10, 10) | dshape = (10, 10) | ||||
| data = tensor(np.ones(dshape).astype(np.float32)) | data = tensor(np.ones(dshape).astype(np.float32)) | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = net(data).sum() | loss = net(data).sum() | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| optim.step() | optim.step() | ||||
| @@ -97,7 +97,7 @@ class MnistNet(Module): | |||||
| def train(data, label, net, opt, gm): | def train(data, label, net, opt, gm): | ||||
| opt.clear_grad() | opt.clear_grad() | ||||
| with gm.record(): | |||||
| with gm: | |||||
| pred = net(data) | pred = net(data) | ||||
| loss = F.cross_entropy_with_softmax(pred, label) | loss = F.cross_entropy_with_softmax(pred, label) | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| @@ -125,8 +125,7 @@ def update_model(model_path): | |||||
| lr = checkpoint["sgd_lr"] | lr = checkpoint["sgd_lr"] | ||||
| opt = SGD(net.parameters(), lr=lr) | opt = SGD(net.parameters(), lr=lr) | ||||
| gm = ad.GradManager() | |||||
| gm.register( | |||||
| gm = ad.GradManager().attach( | |||||
| net.parameters(), callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)] | net.parameters(), callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)] | ||||
| ) | ) | ||||
| @@ -171,8 +170,7 @@ def run_test( | |||||
| lr = checkpoint["sgd_lr"] | lr = checkpoint["sgd_lr"] | ||||
| opt = SGD(net.parameters(), lr=lr) | opt = SGD(net.parameters(), lr=lr) | ||||
| gm = ad.GradManager() | |||||
| gm.register( | |||||
| gm = ad.GradManager().attach( | |||||
| net.parameters(), callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)] | net.parameters(), callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)] | ||||
| ) | ) | ||||
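attach() keeps register()'s callbacks argument, so the data-parallel gradient all-reduce hook folds into the same fluent call. A sketch of the distributed wiring, assuming dist.init_process_group(...) has already run in each worker as it does in these tests:

    import megengine.autodiff as ad              # assumed import path for GradManager
    import megengine.distributed as dist

    def make_data_parallel_gm(net):              # net: the replicated module (placeholder)
        return ad.GradManager().attach(
            net.parameters(),
            callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)],
        )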
| @@ -33,10 +33,10 @@ def test_hello_world(): | |||||
| optim = optimizer.SGD(net.parameters(), lr=1.0) | optim = optimizer.SGD(net.parameters(), lr=1.0) | ||||
| optim.clear_grad() | optim.clear_grad() | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| data = tensor([2.34]) | data = tensor([2.34]) | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = net(data) | loss = net(data) | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| optim.step() | optim.step() | ||||
| @@ -13,7 +13,7 @@ import megengine.functional as F | |||||
| from megengine import Parameter, optimizer | from megengine import Parameter, optimizer | ||||
| from megengine.jit import trace | from megengine.jit import trace | ||||
| from megengine.module import Linear, Module | from megengine.module import Linear, Module | ||||
| from megengine.tensor import TensorDict, tensor | |||||
| from megengine.tensor import tensor | |||||
| class MLP(Module): | class MLP(Module): | ||||
| @@ -44,7 +44,7 @@ def _test_optimizer(opt_str, test_case, check_class, update_lr=False): | |||||
| net = Simple() | net = Simple() | ||||
| opt = getattr(optimizer, opt_str)(net.parameters(), **test_case) | opt = getattr(optimizer, opt_str)(net.parameters(), **test_case) | ||||
| check_func = check_class(net, **test_case) | check_func = check_class(net, **test_case) | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| step = 0 | step = 0 | ||||
| data_shape = (2, 28) | data_shape = (2, 28) | ||||
| @@ -57,12 +57,12 @@ def _test_optimizer(opt_str, test_case, check_class, update_lr=False): | |||||
| data = tensor(np.random.random(data_shape).astype(np.float32)) | data = tensor(np.random.random(data_shape).astype(np.float32)) | ||||
| opt.clear_grad() | opt.clear_grad() | ||||
| with gm.record(): | |||||
| with gm: | |||||
| pred = net(data) | pred = net(data) | ||||
| loss = pred.sum() | loss = pred.sum() | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| ori_params = TensorDict() | |||||
| ori_params = {} | |||||
| for param in net.parameters(): | for param in net.parameters(): | ||||
| ori_params[param] = np.copy(param.numpy()) | ori_params[param] = np.copy(param.numpy()) | ||||
| opt.step() | opt.step() | ||||
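TensorDict disappears because Parameter and Tensor objects can now key an ordinary dict directly (presumably hashed by object identity), so the per-step parameter snapshots become plain dictionaries. A minimal sketch of the pattern used in these optimizer tests:

    import numpy as np

    def snapshot(net):                           # net: placeholder Module
        """Copy current parameter values, keyed by the Parameter objects."""
        params = {}                              # was: TensorDict()
        for param in net.parameters():
            params[param] = np.copy(param.numpy())
        return params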
| @@ -75,7 +75,7 @@ def _test_optimizer(opt_str, test_case, check_class, update_lr=False): | |||||
| @trace(symbolic=symbolic) | @trace(symbolic=symbolic) | ||||
| def train_func(data, *, opt=None, gm=None): | def train_func(data, *, opt=None, gm=None): | ||||
| opt.clear_grad() | opt.clear_grad() | ||||
| with gm.record(): | |||||
| with gm: | |||||
| pred = net(data) | pred = net(data) | ||||
| loss = pred.sum() | loss = pred.sum() | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| @@ -84,7 +84,7 @@ def _test_optimizer(opt_str, test_case, check_class, update_lr=False): | |||||
| # reset net and opt | # reset net and opt | ||||
| net = Simple() | net = Simple() | ||||
| opt = getattr(optimizer, opt_str)(net.parameters(), **test_case) | opt = getattr(optimizer, opt_str)(net.parameters(), **test_case) | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| check_func = check_class(net, **test_case) | check_func = check_class(net, **test_case) | ||||
| step = 0 | step = 0 | ||||
| for i in range(iter_num): | for i in range(iter_num): | ||||
| @@ -93,7 +93,7 @@ def _test_optimizer(opt_str, test_case, check_class, update_lr=False): | |||||
| group["lr"] += 0.01 | group["lr"] += 0.01 | ||||
| check_func.lr += 0.01 | check_func.lr += 0.01 | ||||
| ori_params = TensorDict() | |||||
| ori_params = {} | |||||
| for param in net.parameters(): | for param in net.parameters(): | ||||
| ori_params[param] = np.copy(param.numpy()) | ori_params[param] = np.copy(param.numpy()) | ||||
| @@ -105,7 +105,7 @@ def _test_optimizer(opt_str, test_case, check_class, update_lr=False): | |||||
| def test_sgd(): | def test_sgd(): | ||||
| class CheckValue: | class CheckValue: | ||||
| def __init__(self, net, **kwarg): | def __init__(self, net, **kwarg): | ||||
| self.slots = TensorDict() | |||||
| self.slots = {} | |||||
| for param in net.parameters(): | for param in net.parameters(): | ||||
| self.slots[param] = np.zeros(param.shape).astype(np.float32) | self.slots[param] = np.zeros(param.shape).astype(np.float32) | ||||
| for k, v in kwarg.items(): | for k, v in kwarg.items(): | ||||
| @@ -134,8 +134,8 @@ def test_sgd(): | |||||
| def test_adam(): | def test_adam(): | ||||
| class CheckValue: | class CheckValue: | ||||
| def __init__(self, net, **kwarg): | def __init__(self, net, **kwarg): | ||||
| self.m_slots = TensorDict() | |||||
| self.v_slots = TensorDict() | |||||
| self.m_slots = {} | |||||
| self.v_slots = {} | |||||
| for param in net.parameters(): | for param in net.parameters(): | ||||
| self.m_slots[param] = np.zeros(param.shape).astype(np.float32) | self.m_slots[param] = np.zeros(param.shape).astype(np.float32) | ||||
| self.v_slots[param] = np.zeros(param.shape).astype(np.float32) | self.v_slots[param] = np.zeros(param.shape).astype(np.float32) | ||||
| @@ -175,7 +175,7 @@ def test_adam(): | |||||
| def test_adagrad(): | def test_adagrad(): | ||||
| class CheckValue: | class CheckValue: | ||||
| def __init__(self, net, **kwarg): | def __init__(self, net, **kwarg): | ||||
| self.s_slots = TensorDict() | |||||
| self.s_slots = {} | |||||
| for param in net.parameters(): | for param in net.parameters(): | ||||
| self.s_slots[param] = np.zeros(param.shape).astype(np.float32) | self.s_slots[param] = np.zeros(param.shape).astype(np.float32) | ||||
| for k, v in kwarg.items(): | for k, v in kwarg.items(): | ||||
| @@ -207,8 +207,8 @@ def test_adagrad(): | |||||
| def test_adadelta(): | def test_adadelta(): | ||||
| class CheckValue: | class CheckValue: | ||||
| def __init__(self, net, **kwarg): | def __init__(self, net, **kwarg): | ||||
| self.s_slots = TensorDict() | |||||
| self.a_slots = TensorDict() | |||||
| self.s_slots = {} | |||||
| self.a_slots = {} | |||||
| for param in net.parameters(): | for param in net.parameters(): | ||||
| self.s_slots[param] = np.zeros(param.shape).astype(np.float32) | self.s_slots[param] = np.zeros(param.shape).astype(np.float32) | ||||
| self.a_slots[param] = np.zeros(param.shape).astype(np.float32) | self.a_slots[param] = np.zeros(param.shape).astype(np.float32) | ||||
| @@ -23,11 +23,11 @@ def test_save_load(): | |||||
| optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9) | optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9) | ||||
| optim.clear_grad() | optim.clear_grad() | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| data = tensor([2.34]) | data = tensor([2.34]) | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = net(data) | loss = net(data) | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| @@ -55,7 +55,7 @@ def test_save_load(): | |||||
| optim.load_state_dict(checkpoint["opt_state"]) | optim.load_state_dict(checkpoint["opt_state"]) | ||||
| print("load done") | print("load done") | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = net([1.23]) | loss = net([1.23]) | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| @@ -31,12 +31,12 @@ def test_sgd_momentum(): | |||||
| optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9) | optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9) | ||||
| optim.clear_grad() | optim.clear_grad() | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| data = tensor([2.34]) | data = tensor([2.34]) | ||||
| # do a step of train | # do a step of train | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = net(data) | loss = net(data) | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| optim.step() | optim.step() | ||||
| @@ -51,7 +51,7 @@ def test_sgd_momentum(): | |||||
| # do a step of train | # do a step of train | ||||
| optim.clear_grad() | optim.clear_grad() | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = net(data) | loss = net(data) | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| optim.step() | optim.step() | ||||
| @@ -69,7 +69,7 @@ def test_sgd_momentum_trace(): | |||||
| @trace(symbolic=symbolic) | @trace(symbolic=symbolic) | ||||
| def train_func(data, *, model=None, optim=None, gm=None): | def train_func(data, *, model=None, optim=None, gm=None): | ||||
| optim.clear_grad() | optim.clear_grad() | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = net(data) | loss = net(data) | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| optim.step() | optim.step() | ||||
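The recording context composes with jit.trace exactly as gm.record() did: the forward, backward and optimizer step all live inside the traced function, with the optimizer and manager passed as keyword-only arguments as in the test above. A sketch, with the symbolic flag fixed for brevity:

    from megengine.jit import trace

    @trace(symbolic=True)                        # the test parametrizes this flag
    def train_func(data, *, model=None, optim=None, gm=None):
        optim.clear_grad()
        with gm:                                 # was: with gm.record():
            loss = model(data)
            gm.backward(loss)
        optim.step()
        return loss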
| @@ -82,7 +82,7 @@ def test_sgd_momentum_trace(): | |||||
| net = Simple() | net = Simple() | ||||
| optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9) | optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9) | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| data = tensor([2.34]) | data = tensor([2.34]) | ||||
| train_func(data, model=net, optim=optim, gm=gm) | train_func(data, model=net, optim=optim, gm=gm) | ||||
| np.testing.assert_almost_equal( | np.testing.assert_almost_equal( | ||||
| @@ -61,15 +61,15 @@ class XORNet(M.Module): | |||||
| def test_xornet_trace_dump(): | def test_xornet_trace_dump(): | ||||
| net = XORNet() | net = XORNet() | ||||
| opt = optim.SGD(net.parameters(requires_grad=True), lr=0.01, momentum=0.9) | |||||
| gm = GradManager().register(net.parameters(requires_grad=True)) | |||||
| opt = optim.SGD(net.parameters(), lr=0.01, momentum=0.9) | |||||
| gm = GradManager().attach(net.parameters()) | |||||
| batch_size = 64 | batch_size = 64 | ||||
| train_dataset = minibatch_generator(batch_size) | train_dataset = minibatch_generator(batch_size) | ||||
| val_dataset = minibatch_generator(batch_size) | val_dataset = minibatch_generator(batch_size) | ||||
| @trace | @trace | ||||
| def train_fun(data, label): | def train_fun(data, label): | ||||
| with gm.record(): | |||||
| with gm: | |||||
| net.train() | net.train() | ||||
| pred = net(data) | pred = net(data) | ||||
| loss = F.cross_entropy_with_softmax(pred, label) | loss = F.cross_entropy_with_softmax(pred, label) | ||||
| @@ -14,7 +14,7 @@ import pytest | |||||
| import megengine.core.ops.builtin as builtin | import megengine.core.ops.builtin as builtin | ||||
| import megengine.core.tensor.dtype as dtype | import megengine.core.tensor.dtype as dtype | ||||
| import megengine.functional as F | import megengine.functional as F | ||||
| from megengine import Buffer, Parameter, is_cuda_available, tensor | |||||
| from megengine import Parameter, Tensor, is_cuda_available, tensor | |||||
| from megengine.core._trace_option import use_tensor_shape | from megengine.core._trace_option import use_tensor_shape | ||||
| from megengine.core.autodiff.grad import Grad | from megengine.core.autodiff.grad import Grad | ||||
| from megengine.core.tensor.utils import make_shape_tuple | from megengine.core.tensor.utils import make_shape_tuple | ||||
| @@ -330,7 +330,7 @@ def test_roi_pooling(): | |||||
| def test_add_update(): | def test_add_update(): | ||||
| shape = (2, 3) | shape = (2, 3) | ||||
| v = np.random.random(shape).astype(np.float32) | v = np.random.random(shape).astype(np.float32) | ||||
| b = Buffer(v) | |||||
| b = Tensor(v) | |||||
| u = F.add_update(b, 1) | u = F.add_update(b, 1) | ||||
| assertTensorClose(u.numpy(), v + 1) | assertTensorClose(u.numpy(), v + 1) | ||||
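With Buffer removed, anything that used to be a Buffer is now a plain Tensor, and update helpers such as F.add_update accept it unchanged. A minimal sketch built from the calls in the hunk above:

    import numpy as np
    import megengine.functional as F
    from megengine import Tensor

    v = np.random.random((2, 3)).astype(np.float32)
    b = Tensor(v)                                # was: Buffer(v)
    u = F.add_update(b, 1)                       # b <- b + 1, returns the updated tensor
    np.testing.assert_allclose(u.numpy(), v + 1, rtol=1e-6)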
| @@ -347,7 +347,7 @@ def test_add_update(): | |||||
| def test_add_update_params(): | def test_add_update_params(): | ||||
| b = np.random.random((2, 3)).astype(np.float32) | b = np.random.random((2, 3)).astype(np.float32) | ||||
| y = Buffer(b) | |||||
| y = Tensor(b) | |||||
| # @jit.trace | # @jit.trace | ||||
| def f(x): | def f(x): | ||||
| @@ -355,7 +355,7 @@ def test_add_update_params(): | |||||
| f(np.zeros((2, 3)).astype(np.float32)) | f(np.zeros((2, 3)).astype(np.float32)) | ||||
| z = Buffer(np.zeros((2, 3)).astype(np.float32)) | |||||
| z = Tensor(np.zeros((2, 3)).astype(np.float32)) | |||||
| F.add_update(y, z, beta=0.1) | F.add_update(y, z, beta=0.1) | ||||
| res = f(np.ones((2, 3)).astype(np.float32)) | res = f(np.ones((2, 3)).astype(np.float32)) | ||||
| @@ -12,7 +12,7 @@ import numpy as np | |||||
| import pytest | import pytest | ||||
| import megengine.functional as F | import megengine.functional as F | ||||
| from megengine import Buffer, Parameter, is_cuda_available, tensor | |||||
| from megengine import tensor | |||||
| from megengine.core._trace_option import use_tensor_shape | from megengine.core._trace_option import use_tensor_shape | ||||
| from megengine.core.tensor.utils import astensor1d | from megengine.core.tensor.utils import astensor1d | ||||
| from megengine.distributed.helper import get_device_count_by_fork | from megengine.distributed.helper import get_device_count_by_fork | ||||
| @@ -14,10 +14,9 @@ import pytest | |||||
| import megengine as mge | import megengine as mge | ||||
| import megengine.distributed as dist | import megengine.distributed as dist | ||||
| from megengine import tensor | |||||
| from megengine import Tensor | |||||
| from megengine.core._trace_option import use_tensor_shape | from megengine.core._trace_option import use_tensor_shape | ||||
| from megengine.module import BatchNorm1d, BatchNorm2d, SyncBatchNorm | from megengine.module import BatchNorm1d, BatchNorm2d, SyncBatchNorm | ||||
| from megengine.tensor import Tensor | |||||
| from megengine.test import assertTensorClose | from megengine.test import assertTensorClose | ||||
| @@ -45,10 +44,8 @@ def test_syncbn(): | |||||
| return | return | ||||
| dist.init_process_group("localhost", port, nr_ranks, rank, rank) | dist.init_process_group("localhost", port, nr_ranks, rank, rank) | ||||
| bn = SyncBatchNorm(nr_chan, momentum=momentum, eps=eps) | bn = SyncBatchNorm(nr_chan, momentum=momentum, eps=eps) | ||||
| data_tensor = tensor([]) | |||||
| for i in range(steps): | for i in range(steps): | ||||
| data_tensor.set_value(data[i]) | |||||
| yv = bn(data_tensor) | |||||
| yv = bn(Tensor(data[i])) | |||||
| assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | ||||
| assertTensorClose(running_mean, bn.running_mean.numpy(), max_err=5e-6) | assertTensorClose(running_mean, bn.running_mean.numpy(), max_err=5e-6) | ||||
| @@ -105,7 +102,6 @@ def test_batchnorm(): | |||||
| bn = BatchNorm1d(nr_chan, momentum=momentum) | bn = BatchNorm1d(nr_chan, momentum=momentum) | ||||
| running_mean = np.zeros((1, nr_chan, 1), dtype=np.float32) | running_mean = np.zeros((1, nr_chan, 1), dtype=np.float32) | ||||
| running_var = np.ones((1, nr_chan, 1), dtype=np.float32) | running_var = np.ones((1, nr_chan, 1), dtype=np.float32) | ||||
| data = tensor([]) | |||||
| for i in range(3): | for i in range(3): | ||||
| xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) | xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) | ||||
| mean = np.mean(np.mean(xv, axis=0, keepdims=True), axis=2, keepdims=True) | mean = np.mean(np.mean(xv, axis=0, keepdims=True), axis=2, keepdims=True) | ||||
| @@ -120,8 +116,7 @@ def test_batchnorm(): | |||||
| running_mean = running_mean * momentum + mean * (1 - momentum) | running_mean = running_mean * momentum + mean * (1 - momentum) | ||||
| running_var = running_var * momentum + var_unbiased * (1 - momentum) | running_var = running_var * momentum + var_unbiased * (1 - momentum) | ||||
| data.set_value(xv) | |||||
| yv = bn(data) | |||||
| yv = bn(Tensor(xv)) | |||||
| yv_expect = (xv - mean) / sd | yv_expect = (xv - mean) / sd | ||||
| assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | ||||
| @@ -137,7 +132,7 @@ def test_batchnorm(): | |||||
| var_backup = bn.running_var.numpy() | var_backup = bn.running_var.numpy() | ||||
| bn.training = False | bn.training = False | ||||
| xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) | xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) | ||||
| data.set_value(xv) | |||||
| data = Tensor(xv) | |||||
| yv1 = bn(data) | yv1 = bn(data) | ||||
| yv2 = bn(data) | yv2 = bn(data) | ||||
| assertTensorClose(yv1.numpy(), yv2.numpy(), max_err=0) | assertTensorClose(yv1.numpy(), yv2.numpy(), max_err=0) | ||||
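The tensor([]) placeholder plus set_value pattern also goes away: each iteration now wraps the fresh NumPy array in a new Tensor and feeds it straight to the module. A sketch of the replacement, with BatchNorm2d standing in for any module under test:

    import numpy as np
    from megengine import Tensor
    from megengine.module import BatchNorm2d

    bn = BatchNorm2d(8)
    for _ in range(3):
        xv = np.random.normal(loc=2.3, size=(3, 8, 16, 16)).astype(np.float32)
        yv = bn(Tensor(xv))                      # was: data.set_value(xv); yv = bn(data)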
| @@ -161,7 +156,6 @@ def test_syncbn1d(): | |||||
| bn = SyncBatchNorm(nr_chan, momentum=momentum) | bn = SyncBatchNorm(nr_chan, momentum=momentum) | ||||
| running_mean = np.zeros((1, nr_chan, 1), dtype=np.float32) | running_mean = np.zeros((1, nr_chan, 1), dtype=np.float32) | ||||
| running_var = np.ones((1, nr_chan, 1), dtype=np.float32) | running_var = np.ones((1, nr_chan, 1), dtype=np.float32) | ||||
| data = tensor([]) | |||||
| for i in range(3): | for i in range(3): | ||||
| xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) | xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) | ||||
| mean = np.mean(np.mean(xv, axis=0, keepdims=True), axis=2, keepdims=True) | mean = np.mean(np.mean(xv, axis=0, keepdims=True), axis=2, keepdims=True) | ||||
| @@ -176,8 +170,7 @@ def test_syncbn1d(): | |||||
| running_mean = running_mean * momentum + mean * (1 - momentum) | running_mean = running_mean * momentum + mean * (1 - momentum) | ||||
| running_var = running_var * momentum + var_unbiased * (1 - momentum) | running_var = running_var * momentum + var_unbiased * (1 - momentum) | ||||
| data.set_value(xv) | |||||
| yv = bn(data) | |||||
| yv = bn(Tensor(xv)) | |||||
| yv_expect = (xv - mean) / sd | yv_expect = (xv - mean) / sd | ||||
| assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | ||||
| @@ -193,7 +186,7 @@ def test_syncbn1d(): | |||||
| var_backup = bn.running_var.numpy() | var_backup = bn.running_var.numpy() | ||||
| bn.training = False | bn.training = False | ||||
| xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) | xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) | ||||
| data.set_value(xv) | |||||
| data = Tensor(xv) | |||||
| yv1 = bn(data) | yv1 = bn(data) | ||||
| yv2 = bn(data) | yv2 = bn(data) | ||||
| assertTensorClose(yv1.numpy(), yv2.numpy(), max_err=0) | assertTensorClose(yv1.numpy(), yv2.numpy(), max_err=0) | ||||
| @@ -210,7 +203,6 @@ def test_batchnorm2d(): | |||||
| bn = BatchNorm2d(nr_chan, momentum=momentum) | bn = BatchNorm2d(nr_chan, momentum=momentum) | ||||
| running_mean = np.zeros((1, nr_chan, 1, 1), dtype=np.float32) | running_mean = np.zeros((1, nr_chan, 1, 1), dtype=np.float32) | ||||
| running_var = np.ones((1, nr_chan, 1, 1), dtype=np.float32) | running_var = np.ones((1, nr_chan, 1, 1), dtype=np.float32) | ||||
| data = tensor([]) | |||||
| for i in range(3): | for i in range(3): | ||||
| xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) | xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) | ||||
| xv_transposed = np.transpose(xv, [0, 2, 3, 1]).reshape( | xv_transposed = np.transpose(xv, [0, 2, 3, 1]).reshape( | ||||
| @@ -226,8 +218,7 @@ def test_batchnorm2d(): | |||||
| running_mean = running_mean * momentum + mean * (1 - momentum) | running_mean = running_mean * momentum + mean * (1 - momentum) | ||||
| running_var = running_var * momentum + var_unbiased * (1 - momentum) | running_var = running_var * momentum + var_unbiased * (1 - momentum) | ||||
| data.set_value(xv) | |||||
| yv = bn(data) | |||||
| yv = bn(Tensor(xv)) | |||||
| yv_expect = (xv - mean) / sd | yv_expect = (xv - mean) / sd | ||||
| assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | ||||
| @@ -239,7 +230,7 @@ def test_batchnorm2d(): | |||||
| var_backup = bn.running_var.numpy() | var_backup = bn.running_var.numpy() | ||||
| bn.training = False | bn.training = False | ||||
| xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) | xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) | ||||
| data.set_value(xv) | |||||
| data = Tensor(xv) | |||||
| yv1 = bn(data) | yv1 = bn(data) | ||||
| yv2 = bn(data) | yv2 = bn(data) | ||||
| assertTensorClose(yv1.numpy(), yv2.numpy(), max_err=0) | assertTensorClose(yv1.numpy(), yv2.numpy(), max_err=0) | ||||
| @@ -263,7 +254,6 @@ def test_syncbn2d(): | |||||
| bn = SyncBatchNorm(nr_chan, momentum=momentum) | bn = SyncBatchNorm(nr_chan, momentum=momentum) | ||||
| running_mean = np.zeros((1, nr_chan, 1, 1), dtype=np.float32) | running_mean = np.zeros((1, nr_chan, 1, 1), dtype=np.float32) | ||||
| running_var = np.ones((1, nr_chan, 1, 1), dtype=np.float32) | running_var = np.ones((1, nr_chan, 1, 1), dtype=np.float32) | ||||
| data = tensor([]) | |||||
| for i in range(3): | for i in range(3): | ||||
| xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) | xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) | ||||
| xv_transposed = np.transpose(xv, [0, 2, 3, 1]).reshape( | xv_transposed = np.transpose(xv, [0, 2, 3, 1]).reshape( | ||||
| @@ -279,8 +269,7 @@ def test_syncbn2d(): | |||||
| running_mean = running_mean * momentum + mean * (1 - momentum) | running_mean = running_mean * momentum + mean * (1 - momentum) | ||||
| running_var = running_var * momentum + var_unbiased * (1 - momentum) | running_var = running_var * momentum + var_unbiased * (1 - momentum) | ||||
| data.set_value(xv) | |||||
| yv = bn(data) | |||||
| yv = bn(Tensor(xv)) | |||||
| yv_expect = (xv - mean) / sd | yv_expect = (xv - mean) / sd | ||||
| assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | ||||
| @@ -292,7 +281,7 @@ def test_syncbn2d(): | |||||
| var_backup = bn.running_var.numpy() | var_backup = bn.running_var.numpy() | ||||
| bn.training = False | bn.training = False | ||||
| xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) | xv = np.random.normal(loc=2.3, size=data_shape).astype(np.float32) | ||||
| data.set_value(xv) | |||||
| data = Tensor(xv) | |||||
| yv1 = bn(data) | yv1 = bn(data) | ||||
| yv2 = bn(data) | yv2 = bn(data) | ||||
| assertTensorClose(yv1.numpy(), yv2.numpy(), max_err=0) | assertTensorClose(yv1.numpy(), yv2.numpy(), max_err=0) | ||||
| @@ -306,7 +295,6 @@ def test_batchnorm_no_stats(): | |||||
| nr_chan = 8 | nr_chan = 8 | ||||
| data_shape = (3, nr_chan, 4) | data_shape = (3, nr_chan, 4) | ||||
| bn = BatchNorm1d(8, track_running_stats=False) | bn = BatchNorm1d(8, track_running_stats=False) | ||||
| data = tensor([]) | |||||
| for i in range(4): | for i in range(4): | ||||
| if i == 2: | if i == 2: | ||||
| bn.training = False | bn.training = False | ||||
| @@ -320,8 +308,7 @@ def test_batchnorm_no_stats(): | |||||
| ).reshape((1, nr_chan, 1)) | ).reshape((1, nr_chan, 1)) | ||||
| sd = np.sqrt(var + bn.eps) | sd = np.sqrt(var + bn.eps) | ||||
| data.set_value(xv) | |||||
| yv = bn(data) | |||||
| yv = bn(Tensor(xv)) | |||||
| yv_expect = (xv - mean) / sd | yv_expect = (xv - mean) / sd | ||||
| assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | ||||
| @@ -338,7 +325,6 @@ def test_syncbn_no_stats(): | |||||
| nr_chan = 8 | nr_chan = 8 | ||||
| data_shape = (3, nr_chan, 4) | data_shape = (3, nr_chan, 4) | ||||
| bn = SyncBatchNorm(8, track_running_stats=False) | bn = SyncBatchNorm(8, track_running_stats=False) | ||||
| data = tensor([]) | |||||
| for i in range(4): | for i in range(4): | ||||
| if i == 2: | if i == 2: | ||||
| bn.training = False | bn.training = False | ||||
| @@ -352,8 +338,7 @@ def test_syncbn_no_stats(): | |||||
| ).reshape((1, nr_chan, 1)) | ).reshape((1, nr_chan, 1)) | ||||
| sd = np.sqrt(var + bn.eps) | sd = np.sqrt(var + bn.eps) | ||||
| data.set_value(xv) | |||||
| yv = bn(data) | |||||
| yv = bn(Tensor(xv)) | |||||
| yv_expect = (xv - mean) / sd | yv_expect = (xv - mean) / sd | ||||
| assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | ||||
| @@ -363,7 +348,6 @@ def test_batchnorm2d_no_stats(): | |||||
| nr_chan = 8 | nr_chan = 8 | ||||
| data_shape = (3, nr_chan, 16, 16) | data_shape = (3, nr_chan, 16, 16) | ||||
| bn = BatchNorm2d(8, track_running_stats=False) | bn = BatchNorm2d(8, track_running_stats=False) | ||||
| data = tensor([]) | |||||
| for i in range(4): | for i in range(4): | ||||
| if i == 2: | if i == 2: | ||||
| bn.training = False | bn.training = False | ||||
| @@ -376,8 +360,7 @@ def test_batchnorm2d_no_stats(): | |||||
| var = np.var(xv_transposed, axis=0).reshape((1, nr_chan, 1, 1)) | var = np.var(xv_transposed, axis=0).reshape((1, nr_chan, 1, 1)) | ||||
| sd = np.sqrt(var + bn.eps) | sd = np.sqrt(var + bn.eps) | ||||
| data.set_value(xv) | |||||
| yv = bn(data) | |||||
| yv = bn(Tensor(xv)) | |||||
| yv_expect = (xv - mean) / sd | yv_expect = (xv - mean) / sd | ||||
| assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | ||||
| @@ -394,7 +377,6 @@ def test_syncbn2d_no_stats(): | |||||
| nr_chan = 8 | nr_chan = 8 | ||||
| data_shape = (3, nr_chan, 16, 16) | data_shape = (3, nr_chan, 16, 16) | ||||
| bn = SyncBatchNorm(8, track_running_stats=False) | bn = SyncBatchNorm(8, track_running_stats=False) | ||||
| data = tensor([]) | |||||
| for i in range(4): | for i in range(4): | ||||
| if i == 2: | if i == 2: | ||||
| bn.training = False | bn.training = False | ||||
| @@ -407,8 +389,7 @@ def test_syncbn2d_no_stats(): | |||||
| var = np.var(xv_transposed, axis=0).reshape((1, nr_chan, 1, 1)) | var = np.var(xv_transposed, axis=0).reshape((1, nr_chan, 1, 1)) | ||||
| sd = np.sqrt(var + bn.eps) | sd = np.sqrt(var + bn.eps) | ||||
| data.set_value(xv) | |||||
| yv = bn(data) | |||||
| yv = bn(Tensor(xv)) | |||||
| yv_expect = (xv - mean) / sd | yv_expect = (xv - mean) / sd | ||||
| assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) | ||||
| @@ -12,7 +12,7 @@ import numpy as np | |||||
| import pytest | import pytest | ||||
| import megengine as mge | import megengine as mge | ||||
| from megengine import tensor | |||||
| from megengine import Tensor | |||||
| from megengine.module import Module | from megengine.module import Module | ||||
| @@ -35,12 +35,12 @@ def test_cambricon_module(): | |||||
| with open(model, "rb") as f: | with open(model, "rb") as f: | ||||
| data = f.read() | data = f.read() | ||||
| m = MyModule(data) | m = MyModule(data) | ||||
| inputs = [] | |||||
| inputs.append(tensor(data=[], dtype=np.float16, device="cambricon0")) | |||||
| inputs[0].set_value(np.random.normal(size=(1, 64, 32, 32)).astype(np.float16)) | |||||
| inp = Tensor( | |||||
| np.random.normal(size=(1, 64, 32, 32)).astype(np.float16), device="cambricon0" | |||||
| ) | |||||
| def inference(inps): | def inference(inps): | ||||
| pred = m(inps) | pred = m(inps) | ||||
| return pred | return pred | ||||
| pred = inference(inputs) | |||||
| pred = inference([inp]) | |||||
| @@ -16,7 +16,7 @@ import pytest | |||||
| import megengine as mge | import megengine as mge | ||||
| import megengine.functional as F | import megengine.functional as F | ||||
| from megengine import Buffer, Parameter, Tensor, tensor | |||||
| from megengine import Parameter, Tensor, tensor | |||||
| from megengine.module import ( | from megengine.module import ( | ||||
| BatchNorm1d, | BatchNorm1d, | ||||
| BatchNorm2d, | BatchNorm2d, | ||||
| @@ -196,7 +196,7 @@ class MyModule(Module): | |||||
| self.i = self.InnerModule() | self.i = self.InnerModule() | ||||
| self.bn = BatchNorm2d(4) | self.bn = BatchNorm2d(4) | ||||
| self.param = Parameter(np.ones(1, dtype=np.float32)) | self.param = Parameter(np.ones(1, dtype=np.float32)) | ||||
| self.buff = Buffer(np.ones(1, dtype=np.float32)) | |||||
| self.buff = Tensor(np.ones(1, dtype=np.float32)) | |||||
| def forward(self, x): | def forward(self, x): | ||||
| x = self.i(x) | x = self.i(x) | ||||
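Inside module definitions the same substitution applies: non-trainable state that used to be a Buffer attribute is stored as a plain Tensor, while Parameter alone marks trainable state. A simplified sketch of the module above (the inner module is dropped and the forward body condensed):

    import numpy as np
    from megengine import Parameter, Tensor
    from megengine.module import BatchNorm2d, Module

    class MyModule(Module):
        def __init__(self):
            super().__init__()
            self.bn = BatchNorm2d(4)
            self.param = Parameter(np.ones(1, dtype=np.float32))   # trainable
            self.buff = Tensor(np.ones(1, dtype=np.float32))       # was: Buffer(...)

        def forward(self, x):
            return self.bn(x) + self.param + self.buff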
| @@ -464,8 +464,7 @@ def test_sequential_named_children(): | |||||
| def test_state_dict(): | def test_state_dict(): | ||||
| data_shape = (2, 28) | data_shape = (2, 28) | ||||
| data = tensor([]) | |||||
| data.set_value(np.random.random(data_shape)) | |||||
| data = tensor(np.random.random(data_shape)) | |||||
| mlp = MLP() | mlp = MLP() | ||||
| pred0 = mlp(data) | pred0 = mlp(data) | ||||
| @@ -542,8 +541,7 @@ def test_shared_param(): | |||||
| def test_pickle_module(): | def test_pickle_module(): | ||||
| data_shape = (2, 28) | data_shape = (2, 28) | ||||
| data = tensor([]) | |||||
| data.set_value(np.random.random(data_shape)) | |||||
| data = tensor(np.random.random(data_shape)) | |||||
| mlp = MLP() | mlp = MLP() | ||||
| # pickle before forward | # pickle before forward | ||||
| with BytesIO() as fout: | with BytesIO() as fout: | ||||
| @@ -568,8 +566,7 @@ def test_pickle_module(): | |||||
| @pytest.mark.skip(reason="under development") | @pytest.mark.skip(reason="under development") | ||||
| def test_dump_model(): | def test_dump_model(): | ||||
| data_shape = (2, 28) | data_shape = (2, 28) | ||||
| data = tensor([]) | |||||
| data.set_value(np.random.random(data_shape)) | |||||
| data = Tensor(np.random.random(data_shape)) | |||||
| mlp = MLP() | mlp = MLP() | ||||
| pred = mlp(data) | pred = mlp(data) | ||||
| f = tempfile.NamedTemporaryFile(delete=False) | f = tempfile.NamedTemporaryFile(delete=False) | ||||
| @@ -13,7 +13,7 @@ import pytest | |||||
| import megengine as mge | import megengine as mge | ||||
| import megengine.functional as F | import megengine.functional as F | ||||
| from megengine import Buffer, Parameter | |||||
| from megengine import Parameter, Tensor | |||||
| from megengine.module import Conv2d | from megengine.module import Conv2d | ||||
| from megengine.test import assertTensorClose | from megengine.test import assertTensorClose | ||||
| @@ -33,7 +33,7 @@ def test_set_value(): | |||||
| @pytest.mark.skip(reason="fill unsupported") | @pytest.mark.skip(reason="fill unsupported") | ||||
| def test_fill(): | def test_fill(): | ||||
| a = Buffer(np.zeros((2, 3), dtype=np.float32)) | |||||
| a = Tensor(np.zeros((2, 3), dtype=np.float32)) | |||||
| a.fill(3) | a.fill(3) | ||||
| assertTensorClose(a.numpy(), np.full((2, 3), 3, dtype=np.float32)) | assertTensorClose(a.numpy(), np.full((2, 3), 3, dtype=np.float32)) | ||||
| a.fill(124.568) | a.fill(124.568) | ||||
| @@ -80,7 +80,7 @@ def test_fill(): | |||||
| # def test_shape_warning(): | # def test_shape_warning(): | ||||
| # with Graph() as cg: | # with Graph() as cg: | ||||
| # cg.set_option("eager_evaluation", False) | # cg.set_option("eager_evaluation", False) | ||||
| # b = Buffer(np.ones((2, 3)).astype(np.float32)) | |||||
| # b = Tensor(np.ones((2, 3)).astype(np.float32)) | |||||
| # with pytest.warns(None) as record: | # with pytest.warns(None) as record: | ||||
| # print(b.shape) | # print(b.shape) | ||||
| # if len(record) != 0: | # if len(record) != 0: | ||||
| @@ -42,11 +42,11 @@ def test_single_input(): | |||||
| return x | return x | ||||
| net = Simple(av) | net = Simple(av) | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| opt = optimizer.SGD(net.parameters(), lr=1.0) | opt = optimizer.SGD(net.parameters(), lr=1.0) | ||||
| opt.clear_grad() | opt.clear_grad() | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = net() | loss = net() | ||||
| gm.backward(loss.sum()) | gm.backward(loss.sum()) | ||||
| opt.step() | opt.step() | ||||
| @@ -81,11 +81,11 @@ def test_multi_input(): | |||||
| return x | return x | ||||
| net = Simple(av, bv) | net = Simple(av, bv) | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| opt = optimizer.SGD(net.parameters(), lr=1.0) | opt = optimizer.SGD(net.parameters(), lr=1.0) | ||||
| opt.clear_grad() | opt.clear_grad() | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = net() | loss = net() | ||||
| gm.backward(loss.sum()) | gm.backward(loss.sum()) | ||||
| opt.step() | opt.step() | ||||
| @@ -121,11 +121,11 @@ def test_multi_output(): | |||||
| return x + y | return x + y | ||||
| net = Simple(av, bv) | net = Simple(av, bv) | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| opt = optimizer.SGD(net.parameters(), lr=1.0) | opt = optimizer.SGD(net.parameters(), lr=1.0) | ||||
| opt.clear_grad() | opt.clear_grad() | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = net() | loss = net() | ||||
| gm.backward(loss.sum()) | gm.backward(loss.sum()) | ||||
| opt.step() | opt.step() | ||||
| @@ -163,9 +163,9 @@ def test_skip_invalid_grad(): | |||||
| net = Simple(av, bv) | net = Simple(av, bv) | ||||
| optim = optimizer.SGD(net.parameters(), lr=1.0) | optim = optimizer.SGD(net.parameters(), lr=1.0) | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| optim.clear_grad() | optim.clear_grad() | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = net().sum() | loss = net().sum() | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| optim.step() | optim.step() | ||||
| @@ -198,10 +198,10 @@ def test_ste(): | |||||
| av = np.random.random(data_shape).astype(np.float32) | av = np.random.random(data_shape).astype(np.float32) | ||||
| net = Simple(av) | net = Simple(av) | ||||
| optim = optimizer.SGD(net.parameters(), lr=1.0) | optim = optimizer.SGD(net.parameters(), lr=1.0) | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| optim.clear_grad() | optim.clear_grad() | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = net() | loss = net() | ||||
| gm.backward(loss.sum()) | gm.backward(loss.sum()) | ||||
| optim.step() | optim.step() | ||||
| @@ -256,9 +256,9 @@ def test_none_in_out_grad(): | |||||
| b = tensor(np.array([2.0], dtype=np.float32)) | b = tensor(np.array([2.0], dtype=np.float32)) | ||||
| net = Simple(a, b) | net = Simple(a, b) | ||||
| optim = optimizer.SGD(net.parameters(), lr=1.0) | optim = optimizer.SGD(net.parameters(), lr=1.0) | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| optim.clear_grad() | optim.clear_grad() | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss, _ = net() | loss, _ = net() | ||||
| gm.backward(loss) | gm.backward(loss) | ||||
| optim.step() | optim.step() | ||||
| @@ -293,10 +293,10 @@ def test_zero_grad(): | |||||
| a = tensor(np.array([1.0], dtype=np.float32)) | a = tensor(np.array([1.0], dtype=np.float32)) | ||||
| net = Simple(a) | net = Simple(a) | ||||
| optim = optimizer.SGD(net.parameters(), lr=1.0) | optim = optimizer.SGD(net.parameters(), lr=1.0) | ||||
| gm = ad.GradManager().register(net.parameters()) | |||||
| gm = ad.GradManager().attach(net.parameters()) | |||||
| optim.clear_grad() | optim.clear_grad() | ||||
| with gm.record(): | |||||
| with gm: | |||||
| loss = net() | loss = net() | ||||
| gm.backward(loss.sum()) | gm.backward(loss.sum()) | ||||
| optim.step() | optim.step() | ||||
| @@ -38,7 +38,7 @@ def cvt_to_shape_desc(val, inpvar, config=None): | |||||
| if isinstance(val, RawTensor): | if isinstance(val, RawTensor): | ||||
| return as_tensor(val, device) | return as_tensor(val, device) | ||||
| if not isinstance(val, collections.Iterable): | |||||
| if not isinstance(val, collections.abc.Iterable): | |||||
| val = [val] | val = [val] | ||||
| components = [] | components = [] | ||||
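The isinstance check moves to collections.abc because the bare collections.Iterable alias has been deprecated since Python 3.3 and is removed in Python 3.10. The scalar-to-sequence promotion it guards looks like this:

    import collections.abc

    def as_components(val):                      # hypothetical helper name
        """Promote a scalar to a one-element list; leave iterables alone."""
        if not isinstance(val, collections.abc.Iterable):
            val = [val]
        return list(val)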
| @@ -12,19 +12,18 @@ from tempfile import TemporaryFile | |||||
| import numpy as np | import numpy as np | ||||
| import megengine as mge | import megengine as mge | ||||
| from megengine import Buffer, Parameter, tensor | |||||
| from megengine import Parameter, Tensor | |||||
| def test_tensor_serialization(): | def test_tensor_serialization(): | ||||
| def tensor_eq(a, b): | def tensor_eq(a, b): | ||||
| assert a.dtype == b.dtype | assert a.dtype == b.dtype | ||||
| assert a.device == b.device | assert a.device == b.device | ||||
| assert a.requires_grad == b.requires_grad | |||||
| np.testing.assert_equal(a.numpy(), b.numpy()) | np.testing.assert_equal(a.numpy(), b.numpy()) | ||||
| with TemporaryFile() as f: | with TemporaryFile() as f: | ||||
| data = np.random.randint(low=0, high=7, size=[233]) | data = np.random.randint(low=0, high=7, size=[233]) | ||||
| a = tensor(data, device="xpux", dtype=np.int32) | |||||
| a = Tensor(data, device="xpux", dtype=np.int32) | |||||
| pickle.dump(a, f) | pickle.dump(a, f) | ||||
| f.seek(0) | f.seek(0) | ||||
| b = pickle.load(f) | b = pickle.load(f) | ||||
| @@ -39,19 +38,19 @@ def test_tensor_serialization(): | |||||
| np.testing.assert_equal(a.numpy(), b.numpy()) | np.testing.assert_equal(a.numpy(), b.numpy()) | ||||
| with TemporaryFile() as f: | with TemporaryFile() as f: | ||||
| a = Buffer(np.random.random(size=(2, 233)).astype(np.float32)) | |||||
| a = Tensor(np.random.random(size=(2, 233)).astype(np.float32)) | |||||
| pickle.dump(a, f) | pickle.dump(a, f) | ||||
| f.seek(0) | f.seek(0) | ||||
| b = pickle.load(f) | b = pickle.load(f) | ||||
| assert isinstance(b, Buffer) | |||||
| assert type(b) is Tensor | |||||
| np.testing.assert_equal(a.numpy(), b.numpy()) | np.testing.assert_equal(a.numpy(), b.numpy()) | ||||
| with TemporaryFile() as f: | with TemporaryFile() as f: | ||||
| a = Buffer(np.random.random(size=(2, 233)).astype(np.float32)) | |||||
| a = Tensor(np.random.random(size=(2, 233)).astype(np.float32)) | |||||
| mge.save(a, f) | mge.save(a, f) | ||||
| f.seek(0) | f.seek(0) | ||||
| b = mge.load(f, map_location="cpux") | b = mge.load(f, map_location="cpux") | ||||
| assert isinstance(b, Buffer) | |||||
| assert type(b) is Tensor | |||||
| assert "cpu" in str(b.device) | assert "cpu" in str(b.device) | ||||
| np.testing.assert_equal(a.numpy(), b.numpy()) | np.testing.assert_equal(a.numpy(), b.numpy()) | ||||
| @@ -59,12 +58,12 @@ def test_tensor_serialization(): | |||||
| if mge.is_cuda_available(): | if mge.is_cuda_available(): | ||||
| device_org = mge.get_default_device() | device_org = mge.get_default_device() | ||||
| mge.set_default_device("gpu0") | mge.set_default_device("gpu0") | ||||
| a = Buffer(np.random.random(size=(2, 233)).astype(np.float32)) | |||||
| a = Tensor(np.random.random(size=(2, 233)).astype(np.float32)) | |||||
| mge.save(a, f) | mge.save(a, f) | ||||
| f.seek(0) | f.seek(0) | ||||
| mge.set_default_device("cpux") | mge.set_default_device("cpux") | ||||
| b = mge.load(f, map_location={"gpu0": "cpu0"}) | b = mge.load(f, map_location={"gpu0": "cpu0"}) | ||||
| assert isinstance(b, Buffer) | |||||
| assert type(b) is Tensor | |||||
| assert "cpu0" in str(b.device) | assert "cpu0" in str(b.device) | ||||
| np.testing.assert_equal(a.numpy(), b.numpy()) | np.testing.assert_equal(a.numpy(), b.numpy()) | ||||
| mge.set_default_device(device_org) | mge.set_default_device(device_org) | ||||
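With Buffer gone and the requires_grad comparison dropped, the serialization round-trip checks reduce to type, device and value. A sketch of the pickle path, using only calls that appear in the hunks above:

    import pickle
    from tempfile import TemporaryFile

    import numpy as np
    from megengine import Tensor

    a = Tensor(np.random.random((2, 233)).astype(np.float32))
    with TemporaryFile() as f:
        pickle.dump(a, f)
        f.seek(0)
        b = pickle.load(f)
    assert type(b) is Tensor                     # was: isinstance(b, Buffer)
    np.testing.assert_equal(a.numpy(), b.numpy())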
| @@ -66,7 +66,7 @@ def main(): | |||||
| mge.set_default_device("cpux") | mge.set_default_device("cpux") | ||||
| net = XORNet() | net = XORNet() | ||||
| opt = optim.SGD(net.parameters(requires_grad=True), lr=0.01, momentum=0.9) | |||||
| opt = optim.SGD(net.parameters(), lr=0.01, momentum=0.9) | |||||
| batch_size = 64 | batch_size = 64 | ||||
| train_dataset = minibatch_generator(batch_size) | train_dataset = minibatch_generator(batch_size) | ||||
| val_dataset = minibatch_generator(batch_size) | val_dataset = minibatch_generator(batch_size) | ||||