Browse Source

perf(mge/optimizer): close conver_inputs for optimizer step

GitOrigin-RevId: c710530d93
tags/v1.1.0
Megvii Engine Team 5 years ago
parent
commit
1fed59293b
6 changed files with 94 additions and 29 deletions
  1. +22
    -0
      imperative/python/megengine/core/tensor/utils.py
  2. +19
    -9
      imperative/python/megengine/optimizer/adadelta.py
  3. +17
    -7
      imperative/python/megengine/optimizer/adagrad.py
  4. +18
    -8
      imperative/python/megengine/optimizer/adam.py
  5. +6
    -0
      imperative/python/megengine/optimizer/optimizer.py
  6. +12
    -5
      imperative/python/megengine/optimizer/sgd.py

+ 22
- 0
imperative/python/megengine/core/tensor/utils.py View File

@@ -16,6 +16,25 @@ from ..ops.special import Const
from ..tensor.core import OpBase, TensorBase, TensorWrapperBase, apply from ..tensor.core import OpBase, TensorBase, TensorWrapperBase, apply
from .dtype import is_equal, is_quantize from .dtype import is_equal, is_quantize


# Global flag: when False, `convert_inputs` becomes a no-op so optimizer
# param updates skip the (relatively expensive) dtype/device promotion.
_enable_convert_inputs = True


def get_convert_inputs():
    """Return the current state of the global ``_enable_convert_inputs`` flag."""
    return _enable_convert_inputs


def set_convert_inputs(flag):
    """Set the global ``_enable_convert_inputs`` flag and return its previous value.

    This is a temporary workaround for reducing the overhead of operator
    invocations: while the flag is ``False``, ``convert_inputs`` is disabled
    and returns its arguments unchanged. For internal use only; it should be
    removed when the tensor-like system is refactored.

    :param flag: new value for the flag.
    :return: the previous value, so callers can restore it afterwards.
    """
    global _enable_convert_inputs
    backup = _enable_convert_inputs
    _enable_convert_inputs = flag
    return backup



def dtype_promotion(inputs): def dtype_promotion(inputs):
""" """
@@ -129,6 +148,9 @@ def convert_single_value(v, inputs, *, dtype=None, device=None):




def convert_inputs(*args: TensorBase): def convert_inputs(*args: TensorBase):
if not _enable_convert_inputs:
return args

dtype = dtype_promotion(args) dtype = dtype_promotion(args)
device = get_device(args) device = get_device(args)




+ 19
- 9
imperative/python/megengine/optimizer/adadelta.py View File

@@ -10,8 +10,8 @@ from typing import Iterable, Union


import numpy as np import numpy as np


from ..functional import sqrt
from ..tensor import Parameter
from ..core.tensor.tensor import Tensor
from ..tensor import Parameter, tensor
from .optimizer import Optimizer from .optimizer import Optimizer




@@ -62,6 +62,16 @@ class Adadelta(Optimizer):
rho = param_group["rho"] rho = param_group["rho"]
eps = param_group["eps"] eps = param_group["eps"]


# since `convert_inputs` is disabled for param updates,
# scalar should be explicitly transferred to tensor
_lr = tensor([lr])
_weight_decay = tensor([weight_decay])
_rho = tensor([rho])
_eps = tensor([eps])

c05 = tensor([0.5])
c1 = tensor([1.0])
c2 = tensor([2.0])
for param in param_group["params"]: for param in param_group["params"]:


if param.grad is None: if param.grad is None:
@@ -69,17 +79,17 @@ class Adadelta(Optimizer):


states = self._state[param] states = self._state[param]
step = states["step"] step = states["step"]
step += 1.0
step += c1
grad = param.grad grad = param.grad
if weight_decay != 0.0: if weight_decay != 0.0:
grad += param * weight_decay
grad += param * _weight_decay


square_avg = states["square_avg"] square_avg = states["square_avg"]
acc_delta = states["acc_delta"] acc_delta = states["acc_delta"]
square_avg = rho * square_avg + (1 - rho) * grad ** 2
std = sqrt(square_avg + eps)
delta = sqrt(acc_delta + eps) / std * grad
param -= lr * delta
acc_delta = rho * acc_delta + (1 - rho) * delta ** 2
square_avg = _rho * square_avg + (c1 - _rho) * grad ** c2
std = (square_avg + _eps) ** c05
delta = (acc_delta + _eps) ** c05 / std * grad
param -= _lr * delta
acc_delta = _rho * acc_delta + (c1 - _rho) * delta ** c2
states["square_avg"]._reset(square_avg) states["square_avg"]._reset(square_avg)
states["acc_delta"]._reset(acc_delta) states["acc_delta"]._reset(acc_delta)

+ 17
- 7
imperative/python/megengine/optimizer/adagrad.py View File

@@ -10,8 +10,8 @@ from typing import Iterable, Union


import numpy as np import numpy as np


from ..functional import sqrt
from ..tensor import Parameter
from ..core.tensor.tensor import Tensor
from ..tensor import Parameter, tensor
from .optimizer import Optimizer from .optimizer import Optimizer




@@ -61,6 +61,16 @@ class Adagrad(Optimizer):
weight_decay = param_group["weight_decay"] weight_decay = param_group["weight_decay"]
eps = param_group["eps"] eps = param_group["eps"]


# since `convert_inputs` is disabled for param updates,
# scalar should be explicitly transferred to tensor
_lr = tensor([lr])
_lr_decay = tensor([lr_decay])
_weight_decay = tensor([weight_decay])
_eps = tensor([eps])

c05 = tensor([0.5])
c1 = tensor([1.0])
c2 = tensor([2.0])
for param in param_group["params"]: for param in param_group["params"]:


if param.grad is None: if param.grad is None:
@@ -68,14 +78,14 @@ class Adagrad(Optimizer):


states = self._state[param] states = self._state[param]
step = states["step"] step = states["step"]
step += 1.0
step += c1
grad = param.grad grad = param.grad
if weight_decay != 0.0: if weight_decay != 0.0:
grad += param * weight_decay
grad += param * _weight_decay


square_avg = states["square_avg"] square_avg = states["square_avg"]
square_avg += grad ** 2
delta = grad / sqrt(square_avg + eps)
clr = lr / (1 + (step - 1) * lr_decay)
square_avg += grad ** c2
delta = grad / (square_avg + _eps) ** c05
clr = _lr / (c1 + (step - c1) * _lr_decay)


param -= clr * delta param -= clr * delta

+ 18
- 8
imperative/python/megengine/optimizer/adam.py View File

@@ -8,7 +8,8 @@
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from typing import Iterable, Tuple, Union from typing import Iterable, Tuple, Union


from ..tensor import Parameter
from ..core.tensor.tensor import Tensor
from ..tensor import Parameter, tensor
from .optimizer import Optimizer from .optimizer import Optimizer




@@ -58,6 +59,15 @@ class Adam(Optimizer):
eps = param_group["eps"] eps = param_group["eps"]
beta0, beta1 = param_group["betas"] beta0, beta1 = param_group["betas"]


# since `convert_inputs` is disabled for param updates,
# scalar should be explicitly transferred to tensor
_lr = tensor([lr])
_weight_decay = tensor([weight_decay])
_eps = tensor([eps])
_beta0, _beta1 = tensor([beta0]), tensor([beta1])

c1 = tensor([1.0])
c05 = tensor([0.5])
for param in param_group["params"]: for param in param_group["params"]:


if param.grad is None: if param.grad is None:
@@ -65,20 +75,20 @@ class Adam(Optimizer):


grad = param.grad grad = param.grad
if weight_decay != 0.0: if weight_decay != 0.0:
grad += param * weight_decay
grad += param * _weight_decay


states = self._state[param] states = self._state[param]
step = states["step"] step = states["step"]
step += 1.0
step += c1
exp_avg = states["exp_avg"] exp_avg = states["exp_avg"]
exp_avg_sq = states["exp_avg_sq"] exp_avg_sq = states["exp_avg_sq"]
exp_avg = beta0 * exp_avg + grad * (1 - beta0)
exp_avg_sq = beta1 * exp_avg_sq + (1 - beta1) * (grad * grad)
exp_avg = _beta0 * exp_avg + grad * (c1 - _beta0)
exp_avg_sq = _beta1 * exp_avg_sq + (c1 - _beta1) * (grad * grad)


delta = (exp_avg / (1 - beta0 ** step)) / (
(exp_avg_sq / (1 - beta1 ** step)) ** 0.5 + eps
delta = (exp_avg / (c1 - _beta0 ** step)) / (
(exp_avg_sq / (c1 - _beta1 ** step)) ** c05 + _eps
) )
param -= lr * delta
param -= _lr * delta


# not inplace change, need to update underlying tensor handler in state # not inplace change, need to update underlying tensor handler in state
states["exp_avg"]._reset(exp_avg) states["exp_avg"]._reset(exp_avg)


+ 6
- 0
imperative/python/megengine/optimizer/optimizer.py View File

@@ -15,6 +15,7 @@ from typing import Union


import numpy as np import numpy as np


from ..core.tensor.utils import set_convert_inputs
from ..tensor import Parameter, Tensor from ..tensor import Parameter, Tensor
from ..utils.deprecation import deprecated from ..utils.deprecation import deprecated


@@ -143,6 +144,9 @@ class Optimizer(metaclass=ABCMeta):
Performs a single optimization step. Performs a single optimization step.


""" """
# set the global state `_enable_convert_inputs` to `False` to disable
# the `convert_inputs` for param updates
backup = set_convert_inputs(False)
for group in self.param_groups: for group in self.param_groups:
if isinstance(group["params"], set): if isinstance(group["params"], set):
raise TypeError( raise TypeError(
@@ -151,6 +155,8 @@ class Optimizer(metaclass=ABCMeta):
"Please use a list instead." "Please use a list instead."
) )
self._updates(group) self._updates(group)
# restore the global state `_enable_convert_inputs`
set_convert_inputs(backup)
return self return self


@deprecated(version="1.0", reason="use clear_grad instead") @deprecated(version="1.0", reason="use clear_grad instead")


+ 12
- 5
imperative/python/megengine/optimizer/sgd.py View File

@@ -8,7 +8,8 @@
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from typing import Iterable, Union from typing import Iterable, Union


from ..tensor import Parameter
from ..core.tensor.tensor import Tensor
from ..tensor import Parameter, tensor
from .optimizer import Optimizer from .optimizer import Optimizer




@@ -52,18 +53,24 @@ class SGD(Optimizer):
weight_decay = param_group["weight_decay"] weight_decay = param_group["weight_decay"]
momentum = param_group["momentum"] momentum = param_group["momentum"]


# since `convert_inputs` is disabled for param updates,
# scalar should be explicitly transferred to tensor
_lr = tensor([lr])
_weight_decay = tensor([weight_decay])
_momentum = tensor([momentum])

for param in param_group["params"]: for param in param_group["params"]:
if param.grad is None: if param.grad is None:
continue continue


grad = param.grad grad = param.grad
if weight_decay != 0.0: if weight_decay != 0.0:
grad += param * weight_decay
grad += param * _weight_decay


if momentum: if momentum:
v = self._state[param]["momentum_buffer"] v = self._state[param]["momentum_buffer"]
v = momentum * v + grad
param -= lr * v
v = _momentum * v + grad
param -= _lr * v
self._state[param]["momentum_buffer"]._reset(v) self._state[param]["momentum_buffer"]._reset(v)
else: else:
param -= lr * grad
param -= _lr * grad

Loading…
Cancel
Save