From: @zhao_ting_v Reviewed-by: @c_34,@wuxuejian Signed-off-by: @c_34 tags/v1.1.0
| @@ -91,6 +91,12 @@ You can start training using python or shell scripts. The usage of shell scripts | |||||
| - GPU: sh run_trian.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] | - GPU: sh run_trian.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] | ||||
| - CPU: sh run_trian.sh CPU [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] | - CPU: sh run_trian.sh CPU [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] | ||||
| > RANK_TABLE_FILE is the HCCL configuration file used when running on Ascend. | |||||
| > The common restrictions on using the distributed service are as follows. For details, see the HCCL documentation. | |||||
| > | |||||
| > - In a single-node system, a cluster of 1, 2, 4, or 8 devices is supported. In a multi-node system, a cluster of 8 x N devices is supported. | |||||
| > - Each host has four devices numbered 0 to 3 and four devices numbered 4 to 7 deployed on two different networks. When training with 2 or 4 devices, the devices must be connected and clusters cannot be created across networks. | |||||
| ### Launch | ### Launch | ||||
| ```shell | ```shell | ||||
| @@ -100,6 +100,12 @@ MobileNetV2总体网络架构如下: | |||||
| - GPU: sh run_trian.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] | - GPU: sh run_trian.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] | ||||
| - CPU: sh run_trian.sh CPU [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] | - CPU: sh run_trian.sh CPU [DATASET_PATH] [CKPT_PATH] [FREEZE_LAYER] | ||||
| > RANK_TABLE_FILE 是在Ascend上运行分布式任务时HCCL的配置文件 | |||||
| > 我们列出使用分布式服务常见的使用限制,详细的可以查看HCCL对应的使用文档。 | |||||
| > | |||||
| > - 单机场景下支持1、2、4、8卡设备集群,多机场景下支持8*n卡设备集群。 | |||||
| > - 每台机器的0-3卡和4-7卡各为1个组网,2卡和4卡训练时卡必须相连且不支持跨组网创建集群。 | |||||
| ### 启动 | ### 启动 | ||||
| ```shell | ```shell | ||||
| @@ -29,6 +29,13 @@ run_ascend() | |||||
| fi | fi | ||||
| BASEPATH=$(cd "`dirname $0`" || exit; pwd) | BASEPATH=$(cd "`dirname $0`" || exit; pwd) | ||||
| VISIABLE_DEVICES=$3 | |||||
| IFS="," read -r -a CANDIDATE_DEVICE <<< "$VISIABLE_DEVICES" | |||||
| if [ ${#CANDIDATE_DEVICE[@]} -ne $2 ] | |||||
| then | |||||
| echo "error: DEVICE_NUM=$2 is not equal to the length of VISIABLE_DEVICES=$3" | |||||
| exit 1 | |||||
| fi | |||||
| export PYTHONPATH=${BASEPATH}:$PYTHONPATH | export PYTHONPATH=${BASEPATH}:$PYTHONPATH | ||||
| export RANK_TABLE_FILE=$4 | export RANK_TABLE_FILE=$4 | ||||
| export RANK_SIZE=$2 | export RANK_SIZE=$2 | ||||
| @@ -40,7 +47,7 @@ run_ascend() | |||||
| cd ../train || exit | cd ../train || exit | ||||
| for((i=0; i<${RANK_SIZE}; i++)) | for((i=0; i<${RANK_SIZE}; i++)) | ||||
| do | do | ||||
| export DEVICE_ID=$i | |||||
| export DEVICE_ID=${CANDIDATE_DEVICE[i]} | |||||
| export RANK_ID=$i | export RANK_ID=$i | ||||
| rm -rf ./rank$i | rm -rf ./rank$i | ||||
| mkdir ./rank$i | mkdir ./rank$i | ||||
| @@ -16,26 +16,6 @@ | |||||
| import argparse | import argparse | ||||
| import ast | import ast | ||||
| def launch_parse_args(): | |||||
| launch_parser = argparse.ArgumentParser(description="mindspore distributed training launch helper utilty \ | |||||
| that will spawn up multiple distributed processes") | |||||
| launch_parser.add_argument('--platform', type=str, default="Ascend", choices=("Ascend", "GPU", "CPU"), \ | |||||
| help='run platform, only support GPU, CPU and Ascend') | |||||
| launch_parser.add_argument("--nproc_per_node", type=int, default=1, choices=(1, 2, 3, 4, 5, 6, 7, 8), \ | |||||
| help="The number of processes to launch on each node, for D training, this is recommended to be set \ | |||||
| to the number of D in your system so that each process can be bound to a single D.") | |||||
| launch_parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7", help="will use the \ | |||||
| visible devices sequentially") | |||||
| launch_parser.add_argument("--training_script", type=str, default="./train.py", help="The full path to \ | |||||
| the single D training program/script to be launched in parallel, followed by all the arguments for \ | |||||
| the training script") | |||||
| launch_args, unknown = launch_parser.parse_known_args() | |||||
| launch_args.training_script_args = unknown | |||||
| launch_args.training_script_args += ["--platform", launch_args.platform] | |||||
| return launch_args | |||||
| def train_parse_args(): | def train_parse_args(): | ||||
| train_parser = argparse.ArgumentParser(description='Image classification trian') | train_parser = argparse.ArgumentParser(description='Image classification trian') | ||||
| train_parser.add_argument('--platform', type=str, default="Ascend", choices=("CPU", "GPU", "Ascend"), \ | train_parser.add_argument('--platform', type=str, default="Ascend", choices=("CPU", "GPU", "Ascend"), \ | ||||
| @@ -48,6 +28,8 @@ def train_parse_args(): | |||||
| train_parser.add_argument('--run_distribute', type=ast.literal_eval, default=True, help='Run distribute') | train_parser.add_argument('--run_distribute', type=ast.literal_eval, default=True, help='Run distribute') | ||||
| train_args = train_parser.parse_args() | train_args = train_parser.parse_args() | ||||
| train_args.is_training = True | train_args.is_training = True | ||||
| if train_args.platform == "CPU": | |||||
| train_args.run_distribute = False | |||||
| return train_args | return train_args | ||||
| def eval_parse_args(): | def eval_parse_args(): | ||||
| @@ -40,6 +40,7 @@ def set_config(args): | |||||
| "keep_checkpoint_max": 20, | "keep_checkpoint_max": 20, | ||||
| "save_checkpoint_path": "./", | "save_checkpoint_path": "./", | ||||
| "platform": args.platform, | "platform": args.platform, | ||||
| "run_distribute": args.run_distribute, | |||||
| "activation": "Softmax", | "activation": "Softmax", | ||||
| "export_format": "MINDIR", | "export_format": "MINDIR", | ||||
| "export_file": "mobilenetv2" | "export_file": "mobilenetv2" | ||||
| @@ -331,7 +331,7 @@ class MobileNetV2Combine(nn.Cell): | |||||
| Tensor, output tensor. | Tensor, output tensor. | ||||
| Examples: | Examples: | ||||
| >>> MobileNetV2(num_classes=1000) | |||||
| >>> MobileNetV2Combine(backbone, head) | |||||
| """ | """ | ||||
| def __init__(self, backbone, head): | def __init__(self, backbone, head): | ||||
| @@ -114,6 +114,13 @@ def load_ckpt(network, pretrain_ckpt_path, trainable=True): | |||||
| incremental_learning or not | incremental_learning or not | ||||
| """ | """ | ||||
| param_dict = load_checkpoint(pretrain_ckpt_path) | param_dict = load_checkpoint(pretrain_ckpt_path) | ||||
| if hasattr(network, "head"): | |||||
| head_param = network.head.parameters_dict() | |||||
| for k, v in head_param.items(): | |||||
| if param_dict[k].shape != v.shape: | |||||
| param_dict.pop(k) | |||||
| param_dict.pop(f"moments.{k}") | |||||
| print(f"Filter {k} don't load weights from checkpoint.") | |||||
| load_param_into_net(network, param_dict) | load_param_into_net(network, param_dict) | ||||
| if not trainable: | if not trainable: | ||||
| for param in network.get_parameters(): | for param in network.get_parameters(): | ||||
| @@ -53,21 +53,14 @@ if __name__ == '__main__': | |||||
| # define network | # define network | ||||
| backbone_net, head_net, net = define_net(config, args_opt.is_training) | backbone_net, head_net, net = define_net(config, args_opt.is_training) | ||||
| if args_opt.pretrain_ckpt != "" and args_opt.freeze_layer == "backbone": | |||||
| load_ckpt(backbone_net, args_opt.pretrain_ckpt, trainable=False) | |||||
| step_size = extract_features(backbone_net, args_opt.dataset_path, config) | |||||
| else: | |||||
| if args_opt.platform == "CPU": | |||||
| raise ValueError("CPU only support fine tune the head net, doesn't support fine tune the all net") | |||||
| if args_opt.pretrain_ckpt: | |||||
| load_ckpt(backbone_net, args_opt.pretrain_ckpt) | |||||
| dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, config=config) | |||||
| step_size = dataset.get_dataset_size() | |||||
| dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, config=config) | |||||
| step_size = dataset.get_dataset_size() | |||||
| if args_opt.pretrain_ckpt: | |||||
| if args_opt.freeze_layer == "backbone": | |||||
| load_ckpt(backbone_net, args_opt.pretrain_ckpt, trainable=False) | |||||
| step_size = extract_features(backbone_net, args_opt.dataset_path, config) | |||||
| else: | |||||
| load_ckpt(net, args_opt.pretrain_ckpt) | |||||
| if step_size == 0: | if step_size == 0: | ||||
| raise ValueError("The step_size of dataset is zero. Check if the images' count of train dataset is more \ | raise ValueError("The step_size of dataset is zero. Check if the images' count of train dataset is more \ | ||||
| than batch_size in config.py") | than batch_size in config.py") | ||||
| @@ -93,7 +86,7 @@ if __name__ == '__main__': | |||||
| total_epochs=epoch_size, | total_epochs=epoch_size, | ||||
| steps_per_epoch=step_size)) | steps_per_epoch=step_size)) | ||||
| if args_opt.pretrain_ckpt == "" or args_opt.freeze_layer == "none": | |||||
| if args_opt.pretrain_ckpt == "" or args_opt.freeze_layer != "backbone": | |||||
| loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) | loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) | ||||
| opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, \ | opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, \ | ||||
| config.weight_decay, config.loss_scale) | config.weight_decay, config.loss_scale) | ||||