GitOrigin-RevId: 52ddd805b4
tags/v1.10.0
| @@ -175,13 +175,13 @@ struct MaxOp<src_ctype, dst_ctype, dt_float32> { | |||||
| : INIT(wtype(DTypeTrait<wtype>::min())), src(src), dst(dst), B(B) {} | : INIT(wtype(DTypeTrait<wtype>::min())), src(src), dst(dst), B(B) {} | ||||
| }; | }; | ||||
| template <typename src_ctype, typename index_ctype, typename dst_ctype, typename wtype_> | |||||
| template <typename src_ctype, typename dst_ctype, typename wtype_> | |||||
| struct CheckNonFiniteOp { | struct CheckNonFiniteOp { | ||||
| typedef wtype_ wtype; | typedef wtype_ wtype; | ||||
| const wtype INIT; | const wtype INIT; | ||||
| src_ctype** srcs; | src_ctype** srcs; | ||||
| index_ctype* srcs_total_nr_elems; | |||||
| size_t* srcs_total_nr_elems; | |||||
| dst_ctype* dst; | dst_ctype* dst; | ||||
| const size_t B; | const size_t B; | ||||
| const src_ctype scale; | const src_ctype scale; | ||||
| @@ -206,7 +206,7 @@ struct CheckNonFiniteOp { | |||||
| return lhs | rhs; | return lhs | rhs; | ||||
| } | } | ||||
| MEGDNN_HOST MEGDNN_DEVICE CheckNonFiniteOp( | MEGDNN_HOST MEGDNN_DEVICE CheckNonFiniteOp( | ||||
| src_ctype** srcs, index_ctype* srcs_total_nr_elems, dst_ctype* dst, | |||||
| src_ctype** srcs, size_t* srcs_total_nr_elems, dst_ctype* dst, | |||||
| size_t B, src_ctype scale) | size_t B, src_ctype scale) | ||||
| : INIT(wtype(0)), | : INIT(wtype(0)), | ||||
| srcs(srcs), | srcs(srcs), | ||||
| @@ -8,10 +8,10 @@ namespace cuda { | |||||
| #define COMMA , | #define COMMA , | ||||
| #define cb(_dtype) \ | |||||
| INST_REDUCE( \ | |||||
| device_reduce::CheckNonFiniteOp< \ | |||||
| _dtype COMMA size_t COMMA dt_int32 COMMA dt_int32>, \ | |||||
| #define cb(_dtype) \ | |||||
| INST_REDUCE( \ | |||||
| device_reduce::CheckNonFiniteOp< \ | |||||
| _dtype COMMA dt_float32 COMMA dt_int32 COMMA dt_int32>, \ | |||||
| false); | false); | ||||
| cb(dt_float32); | cb(dt_float32); | ||||
| @@ -10,11 +10,11 @@ namespace megdnn { | |||||
| namespace cuda { | namespace cuda { | ||||
| using device_reduce::CheckNonFiniteOp; | using device_reduce::CheckNonFiniteOp; | ||||
| #define total_nr_elems_max 2048 | |||||
| #define total_nr_elems_max 8192 | |||||
| template <typename T> | template <typename T> | ||||
| size_t CheckNonFiniteImpl::_get_workspace_in_bytes() { | size_t CheckNonFiniteImpl::_get_workspace_in_bytes() { | ||||
| // Call the _get_workspace_in_bytes to reduce the loop fetch workspace bytes | // Call the _get_workspace_in_bytes to reduce the loop fetch workspace bytes | ||||
| typedef CheckNonFiniteOp<T, size_t, dt_int32, dt_int32> Op; | |||||
| typedef CheckNonFiniteOp<T, dt_float32, dt_int32, dt_int32> Op; | |||||
| megdnn_assert(m_size > 0); | megdnn_assert(m_size > 0); | ||||
| WorkspaceBundle bundle( | WorkspaceBundle bundle( | ||||
| nullptr, { | nullptr, { | ||||
| @@ -59,7 +59,7 @@ void CheckNonFiniteImpl::_exec( | |||||
| _megdnn_in const TensorNDArray& srcs, _megdnn_tensor_out dst, | _megdnn_in const TensorNDArray& srcs, _megdnn_tensor_out dst, | ||||
| _megdnn_workspace workspace) { | _megdnn_workspace workspace) { | ||||
| check_exec(srcs, dst, workspace.size); | check_exec(srcs, dst, workspace.size); | ||||
| typedef CheckNonFiniteOp<T, size_t, dt_int32, dt_int32> Op; | |||||
| typedef CheckNonFiniteOp<T, dt_float32, dt_int32, dt_int32> Op; | |||||
| auto stream = cuda_stream(this->handle()); | auto stream = cuda_stream(this->handle()); | ||||
| SmallVector<size_t> workspace_sizes{ | SmallVector<size_t> workspace_sizes{ | ||||
| sizeof(T*) * m_size, | sizeof(T*) * m_size, | ||||
| @@ -102,7 +102,7 @@ void CheckNonFiniteImpl::_exec( | |||||
| cuda_check(cudaStreamAddCallback( | cuda_check(cudaStreamAddCallback( | ||||
| stream, callback_free, static_cast<void*>(workspace_cpu_raw), 0)); | stream, callback_free, static_cast<void*>(workspace_cpu_raw), 0)); | ||||
| return run_reduce<Op, false>( | |||||
| run_reduce<Op, false>( | |||||
| static_cast<dt_int32*>( | static_cast<dt_int32*>( | ||||
| (void*)((char*)workspace_gpu_raw + | (void*)((char*)workspace_gpu_raw + | ||||
| workspace_gpu.total_size_in_bytes())), | workspace_gpu.total_size_in_bytes())), | ||||
| @@ -141,8 +141,10 @@ class GradScaler: | |||||
| tensor.grad = None | tensor.grad = None | ||||
| return self | return self | ||||
| def _check_gradients(self, grad, scale): | |||||
| return _check_non_finite(grad, scale) | |||||
| def _check_gradients(self, grads, scale): | |||||
| if len(grads) == 0: | |||||
| return False | |||||
| return _check_non_finite(grads, scale) | |||||
| def update(self, new_scale: float = None): | def update(self, new_scale: float = None): | ||||
| r"""Update the scale factor according to whether encountered overflow grad. | r"""Update the scale factor according to whether encountered overflow grad. | ||||
| @@ -691,11 +691,13 @@ def _check_non_finite(inps: Iterable[Tensor], scale=1.0) -> Tensor: | |||||
| r"""Check whether input contains infinite or nan value. | r"""Check whether input contains infinite or nan value. | ||||
| Args: | Args: | ||||
| inp: a tensor to be checked. | |||||
| inps: tensors to be checked. | |||||
| Returns: | Returns: | ||||
| a int32 scalar tensor, 0 for False and 1 for True. | a int32 scalar tensor, 0 for False and 1 for True. | ||||
| """ | """ | ||||
| if isinstance(inps, Tensor): | |||||
| inps = [inps] | |||||
| op = builtin.CheckNonFinite(scale=scale) | op = builtin.CheckNonFinite(scale=scale) | ||||
| oups = apply(op, *inps) | oups = apply(op, *inps) | ||||
| out = oups[-1] | out = oups[-1] | ||||
| @@ -1,4 +1,5 @@ | |||||
| import numpy as np | import numpy as np | ||||
| import pytest | |||||
| import megengine as mge | import megengine as mge | ||||
| from megengine.amp import GradScaler | from megengine.amp import GradScaler | ||||
| @@ -6,23 +7,46 @@ from megengine.autodiff import GradManager | |||||
| from megengine.jit import trace | from megengine.jit import trace | ||||
| def test_grad_scaler(): | |||||
| def f(): | |||||
| gm = GradManager() | |||||
| scaler = GradScaler() | |||||
| x = mge.tensor(1.0) | |||||
| for _ in range(3): | |||||
| with gm: | |||||
| y = x + 1 | |||||
| gm.attach(y) | |||||
| loss = y + 1 | |||||
| scaler.backward(gm, loss, unscale_grad=False) | |||||
| np.testing.assert_equal(y.grad.numpy(), scaler.scale_factor) | |||||
| scaler.unscale(gm.attached_tensors()) | |||||
| np.testing.assert_equal(y.grad.numpy(), 1) | |||||
| # test handle None elements | |||||
| scaler.unscale(gm.attached_tensors()) | |||||
| f() | |||||
| trace(f)() | |||||
| @pytest.mark.parametrize( | |||||
| "is_trace", [False, True], | |||||
| ) | |||||
| def test_grad_scaler(is_trace): | |||||
| gm = GradManager() | |||||
| scaler = GradScaler() | |||||
| def f(idx, data, calc): | |||||
| x = mge.tensor(data, no_cache=True) | |||||
| y = mge.tensor(data, no_cache=True) | |||||
| if is_trace: | |||||
| calc = trace(calc) | |||||
| gm.attach([x, y]) | |||||
| with gm: | |||||
| loss = calc(x, y) | |||||
| scaler.backward(gm, loss, unscale_grad=False) | |||||
| np.testing.assert_equal(x.grad.numpy(), 2 * scaler.scale_factor) | |||||
| scaler.unscale(filter(lambda t: t.grad is not None, gm.attached_tensors())) | |||||
| # scaler.unscale(gm.attached_tensors()) | |||||
| np.testing.assert_equal(x.grad.numpy(), 2) | |||||
| def double_variables(x, y): | |||||
| z = x + 2 * y | |||||
| loss = 2 * z + 1 | |||||
| return loss | |||||
| def single_variable(x, y): | |||||
| z = x + 1 | |||||
| loss = 2 * z + 1 | |||||
| return loss | |||||
| # need grad being unique storage or not inplace modifying grad | |||||
| def double_variables_with_same_grad(x, y): | |||||
| z = x + y | |||||
| loss = 2 * z + 1 | |||||
| return loss | |||||
| for data in [np.random.random((1, 2, 3, 4)), 1.0]: | |||||
| for calc in [double_variables, single_variable, double_variables_with_same_grad]: | |||||
| for idx in range(3): | |||||
| f(idx, data, calc) | |||||