| @@ -76,7 +76,7 @@ class Trainer(TrainerEventTrigger): | |||||
| .. note:: | .. note:: | ||||
| 如果您选择使用 :mod:`deepspeed` 或 :mod:`fairscale` 进行训练,请不要将 ``driver`` 的值设为 ``'auto'`` 。 | |||||
| 如果您选择使用 :mod:`deepspeed` 、:mod:`fairscale` 或 :mod:`torch.distributed.fsdp` 进行训练,请不要将 ``driver`` 的值设为 ``'auto'`` 。 | |||||
| :param train_dataloader: 训练数据集,注意其必须是单独的一个数据集,不能是 :class:`List` 或者 :class:`Dict`; | :param train_dataloader: 训练数据集,注意其必须是单独的一个数据集,不能是 :class:`List` 或者 :class:`Dict`; | ||||
| @@ -3,6 +3,8 @@ __all__ = [ | |||||
| 'TorchDriver', | 'TorchDriver', | ||||
| "TorchSingleDriver", | "TorchSingleDriver", | ||||
| "TorchDDPDriver", | "TorchDDPDriver", | ||||
| "FairScaleDriver", | |||||
| "TorchFSDPDriver", | |||||
| "DeepSpeedDriver", | "DeepSpeedDriver", | ||||
| "PaddleDriver", | "PaddleDriver", | ||||
| "PaddleSingleDriver", | "PaddleSingleDriver", | ||||
| @@ -10,8 +12,6 @@ __all__ = [ | |||||
| "JittorDriver", | "JittorDriver", | ||||
| "JittorSingleDriver", | "JittorSingleDriver", | ||||
| "JittorMPIDriver", | "JittorMPIDriver", | ||||
| 'TorchSingleDriver', | |||||
| 'TorchDDPDriver', | |||||
| 'PaddleDriver', | 'PaddleDriver', | ||||
| 'PaddleSingleDriver', | 'PaddleSingleDriver', | ||||
| 'PaddleFleetDriver', | 'PaddleFleetDriver', | ||||
| @@ -27,7 +27,8 @@ __all__ = [ | |||||
| 'optimizer_state_to_device' | 'optimizer_state_to_device' | ||||
| ] | ] | ||||
| from .torch_driver import TorchDriver, TorchSingleDriver, TorchDDPDriver, DeepSpeedDriver, torch_seed_everything, optimizer_state_to_device | |||||
| from .torch_driver import TorchDriver, TorchSingleDriver, TorchDDPDriver, DeepSpeedDriver, FairScaleDriver, \ | |||||
| TorchFSDPDriver, torch_seed_everything, optimizer_state_to_device | |||||
| from .jittor_driver import JittorDriver, JittorMPIDriver, JittorSingleDriver | from .jittor_driver import JittorDriver, JittorMPIDriver, JittorSingleDriver | ||||
| from .paddle_driver import PaddleDriver, PaddleFleetDriver, PaddleSingleDriver, paddle_seed_everything | from .paddle_driver import PaddleDriver, PaddleFleetDriver, PaddleSingleDriver, paddle_seed_everything | ||||
| from .oneflow_driver import OneflowDriver, OneflowSingleDriver, OneflowDDPDriver, oneflow_seed_everything | from .oneflow_driver import OneflowDriver, OneflowSingleDriver, OneflowDDPDriver, oneflow_seed_everything | ||||
| @@ -10,7 +10,7 @@ def choose_driver(model, driver: Union[str, Driver], device: Optional[Union[int, | |||||
| 根据输入的参数 ``driver`` 和 ``device`` 的格式来决定具体的工作模式。 | 根据输入的参数 ``driver`` 和 ``device`` 的格式来决定具体的工作模式。 | ||||
| :param model: 运行过程中使用的具体的最原始的模型。 | :param model: 运行过程中使用的具体的最原始的模型。 | ||||
| :param driver: 训练模型所使用的具体的驱动模式,应当为以下选择中的一个:``["auto", "torch", "paddle", "jittor", "fairscale", "deepspeed", "oneflow"]``,分别对应 | |||||
| :param driver: 训练模型所使用的具体的驱动模式,应当为以下选择中的一个:``["auto", "torch", "paddle", "jittor", "fairscale", "deepspeed", "oneflow", "torch_fsdp"]``,分别对应 | |||||
| 各种框架。值为 ``'auto'`` 时,将会根据模型的类型进行选择。 | 各种框架。值为 ``'auto'`` 时,将会根据模型的类型进行选择。 | ||||
| :param device: 训练使用的设备。详细的格式可以查阅 :class:`~fastNLP.core.controllers.Trainer` 中的说明。 | :param device: 训练使用的设备。详细的格式可以查阅 :class:`~fastNLP.core.controllers.Trainer` 中的说明。 | ||||
| :param kwargs: 其余的传给 `Driver` 的参数。 | :param kwargs: 其余的传给 `Driver` 的参数。 | ||||
| @@ -2,16 +2,20 @@ __all__ = [ | |||||
| 'TorchDriver', | 'TorchDriver', | ||||
| 'TorchSingleDriver', | 'TorchSingleDriver', | ||||
| 'TorchDDPDriver', | 'TorchDDPDriver', | ||||
| 'FairScaleDriver', | |||||
| 'DeepSpeedDriver', | 'DeepSpeedDriver', | ||||
| 'TorchFSDPDriver', | |||||
| 'torch_seed_everything', | 'torch_seed_everything', | ||||
| 'optimizer_state_to_device' | 'optimizer_state_to_device' | ||||
| ] | ] | ||||
| from .ddp import TorchDDPDriver | from .ddp import TorchDDPDriver | ||||
| # todo 实现 fairscale 后再将 fairscale 导入到这里; | # fairscale 已实现,在此直接导入; | ||||
| from .fairscale import FairScaleDriver | |||||
| from .single_device import TorchSingleDriver | from .single_device import TorchSingleDriver | ||||
| from .torch_driver import TorchDriver | from .torch_driver import TorchDriver | ||||
| from .deepspeed import DeepSpeedDriver | from .deepspeed import DeepSpeedDriver | ||||
| from .torch_fsdp import TorchFSDPDriver | |||||
| from .utils import torch_seed_everything, optimizer_state_to_device | from .utils import torch_seed_everything, optimizer_state_to_device | ||||
| @@ -29,6 +29,29 @@ from .utils import optimizer_state_to_device | |||||
| class FairScaleDriver(TorchDDPDriver): | class FairScaleDriver(TorchDDPDriver): | ||||
| """ | |||||
| 实现 ``fairscale`` 功能的 ``Driver`` 。 | |||||
| :param model: 传入给 ``Trainer`` 的 ``model`` 参数。 | |||||
| :param parallel_device: 用于分布式训练的 ``gpu`` 设备。 | |||||
| :param is_pull_by_torch_run: 标志当前的脚本的启动是否由 ``python -m torch.distributed.launch`` 启动的。 | |||||
| :param fp16: 是否开启 fp16 训练。 | |||||
| :param fairscale_kwargs: | |||||
| * *oss_kwargs* -- | |||||
| * *sdp_kwargs* -- | |||||
| * *fsdp_kwargs* -- | |||||
| * *ddp_kwargs* -- | |||||
| * *set_grad_to_none* -- 是否在训练过程中在每一次 optimizer 更新后将 grad 置为 ``None`` | |||||
| * *non_blocking* -- 表示用于 :meth:`torch.Tensor.to` 方法的参数 non_blocking | |||||
| * *gradscaler_kwargs* -- 用于 ``fp16=True`` 时,提供给 :class:`torch.cuda.amp.GradScaler` 的参数 | |||||
| :kwargs: | |||||
| * *wo_auto_param_call* (``bool``) -- 是否关闭在训练时调用我们的 ``auto_param_call`` 函数来自动匹配 batch 和前向函数的参数的行为 | |||||
| .. note:: | |||||
| 关于该参数的详细说明,请参见 :class:`~fastNLP.core.controllers.Trainer` 中的描述;函数 ``auto_param_call`` 详见 :func:`fastNLP.core.utils.auto_param_call`。 | |||||
| """ | |||||
| def __init__( | def __init__( | ||||
| self, | self, | ||||
| model, | model, | ||||
| @@ -22,7 +22,7 @@ def initialize_torch_driver(driver: str, device: Optional[Union[str, "torch.devi | |||||
| r""" | r""" | ||||
| 用来根据参数 ``driver`` 和 ``device`` 来确定并且初始化一个具体的 ``Driver`` 实例然后返回回去; | 用来根据参数 ``driver`` 和 ``device`` 来确定并且初始化一个具体的 ``Driver`` 实例然后返回回去; | ||||
| :param driver: 该参数的值应为以下之一:``["torch", "fairscale", "deepspeed"]`` | |||||
| :param driver: 该参数的值应为以下之一:``["torch", "fairscale", "deepspeed", "torch_fsdp"]`` | |||||
| :param device: 该参数的格式与 ``Trainer`` 对参数 ``device`` 的要求一致 | :param device: 该参数的格式与 ``Trainer`` 对参数 ``device`` 的要求一致 | ||||
| :param model: 训练或者评测的具体的模型 | :param model: 训练或者评测的具体的模型 | ||||
| @@ -31,6 +31,7 @@ def initialize_torch_driver(driver: str, device: Optional[Union[str, "torch.devi | |||||
| * :class:`~fastNLP.core.drivers.torch_driver.TorchDDPDriver` | * :class:`~fastNLP.core.drivers.torch_driver.TorchDDPDriver` | ||||
| * :class:`~fastNLP.core.drivers.torch_driver.DeepSpeedDriver` | * :class:`~fastNLP.core.drivers.torch_driver.DeepSpeedDriver` | ||||
| * :class:`~fastNLP.core.drivers.torch_driver.FairScaleDriver` | * :class:`~fastNLP.core.drivers.torch_driver.FairScaleDriver` | ||||
| * :class:`~fastNLP.core.drivers.torch_driver.TorchFSDPDriver` | |||||
| """ | """ | ||||
| if parse_version(torch.__version__) < parse_version('1.6'): | if parse_version(torch.__version__) < parse_version('1.6'): | ||||
| raise RuntimeError(f"Pytorch(current version:{torch.__version__}) need to be older than 1.6.") | raise RuntimeError(f"Pytorch(current version:{torch.__version__}) need to be older than 1.6.") | ||||
| @@ -36,15 +36,16 @@ from .utils import optimizer_state_to_device | |||||
| class TorchFSDPDriver(TorchDDPDriver): | class TorchFSDPDriver(TorchDDPDriver): | ||||
| r""" | r""" | ||||
| 实现对于 pytorch 自己实现的 fully sharded data parallel;请阅读该文档了解更多: | |||||
| https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.FullyShardedDataParallel.full_optim_state_dict; | |||||
| 实现对于 pytorch 自己实现的 fully sharded data parallel;请阅读 | |||||
| `该文档 <https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.FullyShardedDataParallel.full_optim_state_dict>`_ | |||||
| 了解更多: | |||||
| ..note:: | |||||
| .. note:: | |||||
| ``TorchFSDPDriver`` 大部分行为与 ``TorchDDPDriver`` 相同,如果您不了解 ``TorchDDPDriver``, | ``TorchFSDPDriver`` 大部分行为与 ``TorchDDPDriver`` 相同,如果您不了解 ``TorchDDPDriver``, | ||||
| 您可以先阅读 :class:`~fastNLP.core.drivers.TorchDDPDriver`; | 您可以先阅读 :class:`~fastNLP.core.drivers.TorchDDPDriver`; | ||||
| ..warning:: | |||||
| .. warning:: | |||||
| ``TorchFSDPDriver`` 现在还不支持断点重训功能,但是支持保存模型和加载模型; | ``TorchFSDPDriver`` 现在还不支持断点重训功能,但是支持保存模型和加载模型; | ||||
| @@ -54,6 +55,23 @@ class TorchFSDPDriver(TorchDDPDriver): | |||||
| 1. save/load_on_rank0 = True:表示在加载和保存模型时将所有 rank 上的模型参数全部聚合到 rank0 上,注意这样可能会造成 OOM; | 1. save/load_on_rank0 = True:表示在加载和保存模型时将所有 rank 上的模型参数全部聚合到 rank0 上,注意这样可能会造成 OOM; | ||||
| 2. save/load_on_rank0 = False:表示每个 rank 分别保存加载自己独有的模型参数; | 2. save/load_on_rank0 = False:表示每个 rank 分别保存加载自己独有的模型参数; | ||||
| :param model: 传入给 ``Trainer`` 的 ``model`` 参数 | |||||
| :param parallel_device: 用于分布式训练的 ``gpu`` 设备 | |||||
| :param is_pull_by_torch_run: 标志当前的脚本的启动是否由 ``python -m torch.distributed.launch`` 启动的 | |||||
| :param fp16: 是否开启 fp16 训练 | |||||
| :param torch_kwargs: | |||||
| * *fsdp_kwargs* -- | |||||
| * *set_grad_to_none* -- 是否在训练过程中在每一次 optimizer 更新后将 grad 置为 ``None`` | |||||
| * *non_blocking* -- 表示用于 :meth:`torch.Tensor.to` 方法的参数 non_blocking | |||||
| * *gradscaler_kwargs* -- 用于 ``fp16=True`` 时,提供给 :class:`torch.cuda.amp.GradScaler` 的参数 | |||||
| :kwargs: | |||||
| * *wo_auto_param_call* (``bool``) -- 是否关闭在训练时调用我们的 ``auto_param_call`` 函数来自动匹配 batch 和前向函数的参数的行为 | |||||
| .. note:: | |||||
| 关于该参数的详细说明,请参见 :class:`~fastNLP.core.controllers.Trainer` 中的描述;函数 ``auto_param_call`` 详见 :func:`fastNLP.core.utils.auto_param_call`。 | |||||
| """ | """ | ||||
| def __init__( | def __init__( | ||||
| @@ -182,6 +200,14 @@ class TorchFSDPDriver(TorchDDPDriver): | |||||
| return _module | return _module | ||||
| def save_model(self, filepath: Union[str, Path], only_state_dict: bool = True, **kwargs): | def save_model(self, filepath: Union[str, Path], only_state_dict: bool = True, **kwargs): | ||||
| """ | |||||
| 保存的模型到 ``filepath`` 中。 | |||||
| :param filepath: 文件路径 | |||||
| :param only_state_dict: 是否只保存权重;在 ``TorchFSDPDriver`` 中只能为 ``True`` 。 | |||||
| :param kwargs: | |||||
| :return: | |||||
| """ | |||||
| filepath = Path(filepath) | filepath = Path(filepath) | ||||
| prefix = filepath.parent | prefix = filepath.parent | ||||
| filename = filepath.name | filename = filepath.name | ||||
| @@ -205,6 +231,14 @@ class TorchFSDPDriver(TorchDDPDriver): | |||||
| raise RuntimeError("When using `TorchFSDPDriver`, only `only_state_dict=True` is allowed.") | raise RuntimeError("When using `TorchFSDPDriver`, only `only_state_dict=True` is allowed.") | ||||
| def load_model(self, filepath: Union[Path, str], only_state_dict: bool = True, **kwargs): | def load_model(self, filepath: Union[Path, str], only_state_dict: bool = True, **kwargs): | ||||
| """ | |||||
| 从 ``filepath`` 中加载权重并赋值到当前 driver 的模型上。 | |||||
| :param filepath: 加载权重或模型的路径 | |||||
| :param only_state_dict: 保存的内容是否只是权重;在 ``TorchFSDPDriver`` 中只能为 ``True`` 。 | |||||
| :param kwargs: | |||||
| :return: | |||||
| """ | |||||
| if only_state_dict is False: | if only_state_dict is False: | ||||
| raise RuntimeError("When using `TorchFSDPDriver`, only `only_state_dict=True` is allowed.") | raise RuntimeError("When using `TorchFSDPDriver`, only `only_state_dict=True` is allowed.") | ||||
| filepath = Path(filepath) | filepath = Path(filepath) | ||||