diff --git a/imperative/python/megengine/autodiff/grad_manager.py b/imperative/python/megengine/autodiff/grad_manager.py
index 609dc2ac..743ef1be 100644
--- a/imperative/python/megengine/autodiff/grad_manager.py
+++ b/imperative/python/megengine/autodiff/grad_manager.py
@@ -23,21 +23,51 @@ class AttachSpec:
 
 class GradManager:
     r"""
-    GradManager manages auto differentiation and all resources required to perform it.
+    GradManager computes gradients or, more generally, vector-Jacobian products, by reverse mode
+    automatic differentiation (a.k.a. back propagation).
 
-    Our auto differentiation framework requires that the user explicitly indicates when
-    the forward operations start and when all resources should be released. A typical usage of
-    GradManager is as follows:
+    Reverse mode autodiff normally reuses many intermediate tensors for best computation efficiency.
+    In a read-eval-print-loop (REPL) environment, however, it is impossible to know how the user
+    will take gradients later, and thus which tensors to keep. To solve this problem, the user must
+    somehow declare beforehand which gradients could possibly be taken. With GradManager, users are
+    required to call the :meth:`attach` method on a tensor if they want to take gradients with
+    respect to it later. Furthermore, any computation on a tensor before it is attached is
+    completely ignored from the autodiff perspective, so :meth:`attach` must be called before any
+    computation that needs differentiation.
+
+    For example, the following symbolic differentiation code
+
+    .. code-block::
+
+        x = get_x()
+        y = f(x)
+        dy = ones_like(y)
+        dx = vjp(y, x, dy) # vector-Jacobian product
+
+    can be rewritten using GradManager for a REPL environment as
+
+    .. code-block::
+
+        with GradManager() as gm:
+            x = get_x()
+            gm.attach(x) # must be placed before any computation on x that needs differentiation
+            y = f(x)
+            dy = ones_like(y)
+            gm.backward(y, dy) # doesn't need x, already known via attach()
+            dx = x.grad # backward() saves result to .grad attribute
+
+    A more realistic example of training a neural network would look like this:
 
     .. code-block::
 
         gm = GradManager()
         gm.attach(model.parameters())
-        with gm:
-            # forward operations
-            ...
-            # backward gradients
-            gm.backward(loss)
+
+        for data in dataset:
+            with gm:
+                loss = model(data)
+                gm.backward(loss)
+            # gradients w.r.t. parameters are accumulated into their .grad attributes
 
     You can also use ``record()`` and ``release()`` method instead of ``with`` context:
 
@@ -46,14 +76,29 @@ class GradManager:
         gm = GradManager()
         gm.attach(model.parameters())
 
-        gm.record()
-
-        # forward operations
-        ...
-        # backward gradients
-        gm.backward(loss)
-
-        gm.release()
+        for data in dataset:
+            gm.record()
+            loss = model(data)
+            gm.backward(loss)
+            # backward() will clear recorded history and free resources
+            # call release() if backward() is not called
+            # gm.release()
+
+    For your convenience, a GradManager may (but need not) be reused. As shown in the examples,
+    you only need to attach a tensor once and GradManager will remember it afterwards.
+    However, a single GradManager can record only one computation history at a time. To run
+    multiple differentiations simultaneously or perform higher order differentiation, create
+    as many GradManagers as you need.
+
+    .. note::
+
+        Mutable tensors introduce ambiguities when doing symbolic differentiation: which version
+        of the tensor are we referring to? For attached tensors, GradManager resolves this
+        ambiguity by "snapshotting" them on first encounter, either on :meth:`record` (or on
+        entering the ``with`` statement) if the tensor is attached before :meth:`record`, or on
+        :meth:`attach` if the GradManager is already recording. Attached tensors will then be
+        interpreted as their snapshotted version for differentiation purposes. The same ambiguity
+        on the first parameter of :meth:`backward` is simply resolved by using the latest version.
 
     Typically, in data parallel, we would like to average the gradients across
     processes. Users will finally get the averaged gradients if an "AllReduce"
@@ -77,17 +122,59 @@ class GradManager:
 
     def attach(self, tensors: list, callbacks=None):
         r"""
-        Registers parameters that gradients should be calculated with respect to.
-        Callback Functions should have a signature like this:
+        Instruct GradManager to track operations on tensors, so that gradients with respect
+        to those tensors can be evaluated later.
+
+        :meth:`attach` also accepts a list of callbacks, which will be called with the tensor and
+        its gradient during :meth:`backward`. The signature of callbacks should look like:
 
         .. code-block::
 
-            def cb(param: Tensor, grad: Tensor) -> Tensor:
-                # do something
+            def callback(tensor: Tensor, grad: Tensor) -> Tensor:
+                ...
+                # returned grad is passed to subsequent callbacks
+                # and finally accumulated into the .grad attribute of tensor
                 return grad
 
-        :param params: to be registered parameters
-        :param callbacks: list of callback functions
+        :meth:`attach` calls with overlapping tensors will result in their callbacks being
+        concatenated, independently for each tensor. For example,
+
+        .. code-block::
+
+            gm.attach([x, y], callbacks=[f])
+            gm.attach([y], callbacks=[g])
+
+        is equivalent to
+
+        .. code-block::
+
+            gm.attach([x], callbacks=[f])
+            gm.attach([y], callbacks=[f, g])
+
+        The effect of :meth:`attach` will persist across multiple uses of the GradManager. When
+        reusing a GradManager, it is likely a mistake to call :meth:`attach` on the same set of
+        tensors and callbacks repeatedly, which may grow the callback list indefinitely.
+
+        .. note::
+
+            When reusing a GradManager, it is sometimes desirable to attach temporary tensors each
+            time, e.g. for computing gradients of the inputs of a neural network. GradManager tries
+            to accommodate such usages by holding weak references to attached tensors. Most of the
+            time, this should be enough to prevent resource leaks. Unfortunately, there are still
+            some pitfalls left:
+
+            - Callbacks should not hold strong references, directly or indirectly, to attached
+              tensors. Any strong reference, including those from callbacks, will prevent
+              garbage collection (even by the cycle collector!) of an attached tensor, until
+              the GradManager object is garbage collected.
+
+            Please also note that GradManager might hold additional strong references to attached
+            tensors when it is in use. This note only covers potential resource leaks across
+            multiple uses of a GradManager, which is unrelated to whether resources are timely
+            released within a single use.
+
+        :param tensors: tensor or list of tensors to track
+        :param callbacks: callback or list of callbacks
        """
        if callbacks is None:
            callbacks = []
@@ -127,10 +214,30 @@ class GradManager:
 
     def backward(self, y=None, dy=None):
         r"""
-        Performs back-propagation and computes gradients.
-
-        :param ys: outputs of forward operators, e.g., the loss tensor
-        :param dys: derivatives of ys
+        Compute gradients (or vector-Jacobian products) for all attached tensors, accumulate them
+        into the corresponding .grad attributes, and release resources along the way.
+
+        :meth:`backward` computes the vector-Jacobian product :math:`dx_j = \sum_{i} dy_i J_{ij}`
+        where :math:`J_{ij} = \partial y_i / \partial x_j` is the Jacobian matrix between vector
+        variables :math:`y` and :math:`x`, with all vectors involved represented as a list of
+        tensors, in the sense of direct sums (or flatten-and-concatenate). :math:`y` and :math:`dy`
+        are passed as the first and second parameter respectively, whereas :math:`x` is directly
+        taken from the list of all attached tensors. The result :math:`dx` is not returned.
+        Instead, it is directly accumulated into the .grad attribute of each matching attached
+        tensor (i.e. :math:`x`). This can be done unambiguously since :math:`dx` as a list of
+        tensors has the same structure as :math:`x`.
+
+        If :math:`y` is a scalar and :math:`dy` is chosen to be 1, the vector-Jacobian product
+        yields the gradient of :math:`y` with respect to :math:`x` as a special case. In that
+        case, you may omit the :math:`dy` parameter and :meth:`backward` will automatically
+        use 1 for it and compute the gradient.
+
+        :meth:`backward` consumes all resources held by this GradManager and releases them in the
+        process of this call. When the call successfully finishes, the GradManager will be put
+        back into an inactive state.
+
+        :param y: tensor or list of tensors
+        :param dy: tensor or list of tensors. Defaults to 1 if y is scalar
         """
         from ..functional import ones_like
 
@@ -144,14 +251,18 @@ class GradManager:
                 "call a method that clears the history?"
             )
         assert self._grad is not None
-        if ys is None:
+        if y is None:
             ys = []
-        if not isinstance(ys, (tuple, list)):
-            ys = [ys]
-        if dys is None:
+        elif isinstance(y, (tuple, list)):
+            ys = y
+        else:
+            ys = [y]
+        if dy is None:
             dys = [ones_like(y) for y in ys]
-        if not isinstance(dys, (tuple, list)):
-            dys = [dys]
+        elif isinstance(dy, (tuple, list)):
+            dys = dy
+        else:
+            dys = [dy]
         try:
             self._grad(ys, dys)
             for callback in self._after_backward_callback:
@@ -172,7 +283,9 @@ class GradManager:
 
     def record(self):
         r"""
-        Starts recording forward operations.
+        Start recording operations.
+
+        After this call, you will be able to call :meth:`backward`.
         """
         if self._recording:
             raise RuntimeError("already recording")
@@ -198,7 +311,9 @@ class GradManager:
 
     def release(self):
         r"""
-        Stops recording and releases resources for gradients calculation.
+        Stop recording operations and release resources kept for gradient computation.
+
+        After this call, you will not be able to call :meth:`backward`.
         """
         if self._grad is not None:
             self._grad.__exit__(None, None, None)
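
To make the callback mechanism documented in the new :meth:`attach` docstring concrete, here is a
minimal sketch. It is not part of the patch: the ``scale_grad`` callback and the tensor values are
made up, and only the ``GradManager``, ``attach``, ``backward`` and ``.grad`` API shown in the diff
is assumed. Whatever the callback returns, rather than the raw gradient, is what ends up in ``.grad``.

.. code-block::

    import megengine as mge
    from megengine.autodiff import GradManager

    def scale_grad(tensor, grad):
        # the returned value is passed on to later callbacks
        # and finally accumulated into tensor.grad
        return grad * 0.5

    gm = GradManager()
    x = mge.tensor([1.0, 2.0, 3.0])
    gm.attach([x], callbacks=[scale_grad])

    with gm:
        y = (x * x).sum()
        gm.backward(y)  # y is a scalar, so dy defaults to 1

    print(x.grad)  # 2 * x * 0.5 == x, i.e. [1. 2. 3.]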
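
The vector-Jacobian product wording in the new :meth:`backward` docstring can likewise be exercised
with a non-scalar ``y`` and an explicit ``dy``. Again a hypothetical sketch with made-up values,
relying only on the API described in the patch:

.. code-block::

    import megengine as mge
    from megengine.autodiff import GradManager

    gm = GradManager()
    x = mge.tensor([1.0, 2.0, 3.0])
    gm.attach([x])

    with gm:
        y = x * x                         # non-scalar y, Jacobian J = diag(2 * x)
        dy = mge.tensor([1.0, 0.0, 1.0])  # the "vector" in the vector-Jacobian product
        gm.backward(y, dy)                # accumulates dx_j = sum_i dy_i * J_ij into x.grad

    print(x.grad)  # [2. 0. 6.]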
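
The resource-leak caveat in the :meth:`attach` note is easiest to see with a closure. In this
hypothetical sketch, ``attach_input`` and its callback are illustrations only; the BAD variant
captures the attached tensor and therefore defeats the weak reference that GradManager keeps:

.. code-block::

    import megengine as mge
    from megengine.autodiff import GradManager

    gm = GradManager()

    def attach_input(inp):
        def cb(tensor, grad):
            # BAD: cb closes over inp, a strong reference that defeats
            # GradManager's weak reference; inp stays alive as long as gm does.
            return grad / inp.shape[0]

        gm.attach([inp], callbacks=[cb])
        # BETTER: use the `tensor` argument the callback receives instead,
        # e.g. `return grad / tensor.shape[0]`, so nothing is captured.

    for step in range(3):
        data = mge.tensor([1.0, 2.0])
        attach_input(data)  # with the BAD callback, one tensor is kept alive per iteration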