@@ -20,42 +20,42 @@ class GradManager:
 the forward operations start and when all resources should be released. A typical usage of
 GradManager is as follows:

 .. code-block::

     gm = GradManager()
     gm.attach(model.parameters())

     with gm:
         # forward operations
         ...
         # backward gradients
         gm.backward(loss)

-You can also use `record()` and `release()` method instead of `with` context:
+You can also use the ``record()`` and ``release()`` methods instead of the ``with`` context:

 .. code-block::

     gm = GradManager()
     gm.attach(model.parameters())

     gm.record()

     # forward operations
     ...
     # backward gradients
     gm.backward(loss)

     gm.release()

 Typically, in data parallel, we would like to average the gradients across
 processes. Users will get the averaged gradients if an "AllReduce"
 callback is registered as follows:

 .. code-block::

     import megengine.distributed as dist

     gm = GradManager()
     gm.attach(model.parameters(), callback=dist.make_allreduce_cb("MEAN"))
 """
@@ -50,7 +50,6 @@ class DataLoader:
 :param dataset: dataset from which to load the minibatch.
 :type sampler: Sampler
 :param sampler: defines the strategy to sample data from the dataset.
-    If specified, :attr:`shuffle` must be ``False``.
 :type transform: Transform
 :param transform: defines the transforming strategy for a sampled batch.
     Default: None
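A short, hedged sketch of how these parameters fit together; ``ArrayDataset``, ``RandomSampler``, ``ToMode``, and their import paths are assumptions used only for illustration.

.. code-block:: python

    import numpy as np

    from megengine.data import DataLoader, RandomSampler   # assumed import path
    from megengine.data.dataset import ArrayDataset        # assumed helper dataset
    from megengine.data.transform import ToMode             # assumed transform

    images = np.random.rand(100, 28, 28, 1).astype("float32")
    labels = np.random.randint(10, size=(100,)).astype("int32")
    dataset = ArrayDataset(images, labels)

    # The sampler decides which indices form each minibatch; the transform is
    # applied to every sampled batch (here: HWC -> CHW layout).
    sampler = RandomSampler(dataset, batch_size=16)
    dataloader = DataLoader(dataset, sampler=sampler, transform=ToMode("CHW"))

    for batch_images, batch_labels in dataloader:
        pass  # feed the minibatch to the model here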
@@ -17,4 +17,4 @@ from . import distributed  # isort:skip
 # delete namespace
 # pylint: disable=undefined-variable
-# del elemwise, graph, loss, math, nn, tensor  # type: ignore[name-defined]
+del elemwise, graph, loss, math, nn, quantized, tensor, utils  # type: ignore[name-defined]
@@ -127,9 +127,10 @@ def cross_entropy(
     with_logits: bool = True,
     label_smooth: float = 0,
 ) -> Tensor:
-    r"""Compute the multi-class cross entropy loss (using logits by default).
+    r"""Computes the multi-class cross entropy loss (using logits by default).

-    By default, prediction is assumed to be logits, whose softmax gives probabilities.
+    By default (``with_logits`` is True), ``pred`` is assumed to be logits;
+    class probabilities are given by softmax.

 It has better numerical stability compared with sequential calls to :func:`~.softmax` and :func:`~.cross_entropy`.
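A brief sketch of the default-logits behaviour described above; the ``megengine.functional`` top-level exports used here (``cross_entropy``, ``softmax``) are assumptions for illustration.

.. code-block:: python

    import numpy as np

    import megengine as mge
    import megengine.functional as F

    logits = mge.tensor(np.random.randn(4, 10).astype("float32"))
    label = mge.tensor(np.random.randint(10, size=(4,)).astype("int32"))

    # Default: pred holds raw logits and softmax is applied internally.
    loss = F.cross_entropy(logits, label)                    # with_logits=True

    # Equivalent, but less numerically stable, two-step form on probabilities.
    probs = F.softmax(logits)
    loss_from_probs = F.cross_entropy(probs, label, with_logits=False)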
@@ -194,9 +195,10 @@ def cross_entropy(
 def binary_cross_entropy(
     pred: Tensor, label: Tensor, with_logits: bool = True
 ) -> Tensor:
-    r"""Compute the binary cross entropy loss (using logits by default).
+    r"""Computes the binary cross entropy loss (using logits by default).

-    By default, prediction is assumed to be logits, whose sigmoid gives probabilities.
+    By default (``with_logits`` is True), ``pred`` is assumed to be logits;
+    class probabilities are given by sigmoid.

 :param pred: `(N, *)`, where `*` means any number of additional dimensions.
 :param label: `(N, *)`, same shape as the input.
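A matching sketch for the binary case, under the same assumptions about the ``megengine.functional`` exports.

.. code-block:: python

    import numpy as np

    import megengine as mge
    import megengine.functional as F

    pred = mge.tensor(np.random.randn(4, 1).astype("float32"))               # raw logits
    label = mge.tensor(np.random.randint(2, size=(4, 1)).astype("float32"))  # 0/1 targets

    loss = F.binary_cross_entropy(pred, label)                 # with_logits=True

    # Passing probabilities instead of logits requires with_logits=False.
    probs = F.sigmoid(pred)
    loss_from_probs = F.binary_cross_entropy(probs, label, with_logits=False)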
@@ -335,8 +335,8 @@ def adaptive_max_pool2d(
 Refer to :class:`~.MaxAdaptivePool2d` for more information.

-:param inp: The input tensor.
-:param oshp: (OH, OW) size of the output shape.
+:param inp: input tensor.
+:param oshp: `(OH, OW)` size of the output shape.
 :return: output tensor.
 """
 assert isinstance(inp, (Tensor, megbrain_graph.VarNode)), "inp must be Tensor type"
@@ -356,8 +356,8 @@ def adaptive_avg_pool2d(
 Refer to :class:`~.AvgAdaptivePool2d` for more information.

-:param inp: The input tensor.
-:param oshp: (OH, OW) size of the output shape.
+:param inp: input tensor.
+:param oshp: `(OH, OW)` size of the output shape.
 :return: output tensor.
 """
 assert isinstance(inp, (Tensor, megbrain_graph.VarNode)), "inp must be Tensor type"
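Both functions take only the input tensor and the requested output size. A quick, hedged sketch, assuming both are exported at the ``megengine.functional`` top level:

.. code-block:: python

    import numpy as np

    import megengine as mge
    import megengine.functional as F

    inp = mge.tensor(np.random.randn(1, 3, 32, 32).astype("float32"))  # (N, C, IH, IW)

    out_max = F.adaptive_max_pool2d(inp, (7, 7))   # oshp = (OH, OW)
    out_avg = F.adaptive_avg_pool2d(inp, (7, 7))
    print(out_max.shape, out_avg.shape)            # both (1, 3, 7, 7)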
@@ -40,10 +40,10 @@ class AdaptiveMaxPool2d(_AdaptivePoolNd):
         \text{stride[1]} \times w + n)
     \end{aligned}

-Kernel_size and stride can be inferred from input shape and out shape:
-    padding: (0, 0)
-    stride: (floor(IH / OH), floor(IW / OW))
-    kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)
+``kernel_size`` and ``stride`` can be inferred from the input shape and the output shape:
+
+* padding: (0, 0)
+* stride: (floor(IH / OH), floor(IW / OW))
+* kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)

 Examples:
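To make the inference rule above concrete, here is a plain-arithmetic sketch for a 32x32 input adaptively pooled to 7x7 (no MegEngine calls involved):

.. code-block:: python

    IH, IW = 32, 32          # input spatial size
    OH, OW = 7, 7            # requested output size

    stride_h, stride_w = IH // OH, IW // OW                      # (4, 4)
    kernel_h = IH - (OH - 1) * stride_h                          # 32 - 6 * 4 = 8
    kernel_w = IW - (OW - 1) * stride_w                          # 32 - 6 * 4 = 8
    padding = (0, 0)

    print(padding, (stride_h, stride_w), (kernel_h, kernel_w))   # (0, 0) (4, 4) (8, 8)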
@@ -83,10 +83,10 @@ class AdaptiveAvgPool2d(_AdaptivePoolNd):
     out(N_i, C_j, h, w) = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1}
         input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)

-Kernel_size and stride can be inferred from input shape and out shape:
-    padding: (0, 0)
-    stride: (floor(IH / OH), floor(IW / OW))
-    kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)
+``kernel_size`` and ``stride`` can be inferred from the input shape and the output shape:
+
+* padding: (0, 0)
+* stride: (floor(IH / OH), floor(IW / OW))
+* kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)

 Examples:
@@ -351,7 +351,7 @@ class Module(metaclass=ABCMeta):
 def replace_param(
     self, params: dict, start_pos: int, seen: Optional[Set[int]] = None
 ):
-    """Replaces module's parameters with `params`, used by :class:`~.ParamPack` to
+    """Replaces the module's parameters with ``params``; used by :class:`~.ParamPack` to
     speed up multi-machine training.
     """
     offset = 0
@@ -411,7 +411,7 @@ class Module(metaclass=ABCMeta):
 If ``strict`` is ``True``, the keys in the given ``state_dict`` must exactly match the keys
 returned by this module's :func:`state_dict`.

-Users can also pass a closure: `Function[key: str, var: Tensor] -> Optional[np.ndarray]`
+Users can also pass a closure: ``Function[key: str, var: Tensor] -> Optional[np.ndarray]``
 as a ``state_dict``, in order to handle complex situations. For example, load everything
 except for the final linear classifier:
@@ -423,7 +423,7 @@ class Module(metaclass=ABCMeta):
         for k, v in state_dict.items()
     }, strict=False)

-Here returning `None` means skipping parameter `k`.
+Here returning ``None`` means skipping parameter ``k``.

 To prevent shape mismatch (e.g. when loading PyTorch weights), we can reshape before loading:
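As a hedged sketch of one way such a reshaping closure can look under the documented ``Function[key: str, var: Tensor] -> Optional[np.ndarray]`` signature; ``model`` and ``pretrained`` are illustrative placeholders, not part of this patch.

.. code-block:: python

    import numpy as np

    # ``pretrained`` is an assumed, already-loaded mapping of parameter names to
    # arrays (e.g. exported from PyTorch). Reshape each array to the shape the
    # target variable expects; returning None skips that parameter.
    def reshape_or_skip(key, var):
        if key not in pretrained:
            return None
        return np.asarray(pretrained[key]).reshape(var.shape)

    model.load_state_dict(reshape_or_skip, strict=False)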
@@ -485,9 +485,8 @@ class Module(metaclass=ABCMeta):
     )

 def _load_state_dict_with_closure(self, closure):
-    """Advance state_dict load through callable `closure` whose signature is
-    `closure(key: str, var: Tensor) -> Union[np.ndarry, None]`
+    """Advances state_dict loading through the callable ``closure``, whose signature is
+    ``closure(key: str, var: Tensor) -> Union[np.ndarray, None]``
     """
     assert callable(closure), "closure must be a function"