From: @zhanghuiyao Reviewed-by: @c_34,@wuxuejian Signed-off-by: @c_34tags/v1.3.0
| @@ -106,13 +106,13 @@ After installing MindSpore via the official website, you can start training and | |||
| ```python | |||
| # run training example | |||
| python train.py --data_path=[DATA_PATH] --device_id=[DEVICE_ID] --dataset=[DATASET_TYPE] > output.train.log 2>&1 & | |||
| python train.py --config_path=[YAML_CONFIG_PATH] --data_dir=[DATA_PATH] --dataset=[DATASET_TYPE] > output.train.log 2>&1 & | |||
| # run distributed training example | |||
| sh run_distribute_train.sh [RANL_TABLE_JSON] [DATA_PATH] --dataset=[DATASET_TYPE] | |||
| sh scripts/run_distribute_train.sh [RANK_TABLE_JSON] [DATA_PATH] --dataset=[DATASET_TYPE] | |||
| # run evaluation example | |||
| python eval.py --data_path=[DATA_PATH] --pre_trained=[PRE_TRAINED] --dataset=[DATASET_TYPE] > output.eval.log 2>&1 & | |||
| python eval.py --config_path=[YAML_CONFIG_PATH] --data_dir=[DATA_PATH] --pre_trained=[PRE_TRAINED] --dataset=[DATASET_TYPE] > output.eval.log 2>&1 & | |||
| ``` | |||
| For distributed training, a hccl configuration file with JSON format needs to be created in advance. | |||
| @@ -123,13 +123,118 @@ Please follow the instructions in the link below: | |||
| ```bash | |||
| # run training example | |||
| python train.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_TYPE] --data_path=[DATA_PATH] > output.train.log 2>&1 & | |||
| python train.py --config_path=[YAML_CONFIG_PATH] --device_target="GPU" --dataset=[DATASET_TYPE] --data_dir=[DATA_PATH] > output.train.log 2>&1 & | |||
| # run distributed training example | |||
| sh run_distribute_train_gpu.sh [DATA_PATH] --dataset=[DATASET_TYPE] | |||
| sh scripts/run_distribute_train_gpu.sh [DATA_PATH] --dataset=[DATASET_TYPE] | |||
| # run evaluation example | |||
| python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_TYPE] --data_path=[DATA_PATH] --pre_trained=[PRE_TRAINED] > output.eval.log 2>&1 & | |||
| python eval.py --config_path=[YAML_CONFIG_PATH] --device_target="GPU" --dataset=[DATASET_TYPE] --data_dir=[DATA_PATH] --pre_trained=[PRE_TRAINED] > output.eval.log 2>&1 & | |||
| ``` | |||
| - Running on [ModelArts](https://support.huaweicloud.com/modelarts/) | |||
| ```bash | |||
| # Train Cifar10 1p on ModelArts | |||
| # (1) Add "config_path=/path_to_code/cifar10_config.yaml" on the website UI interface. | |||
| # (2) Perform a or b. | |||
| # a. Set "enable_modelarts=True" on cifar10_config.yaml file. | |||
| # Set "data_dir='/cache/data/cifar10'" on cifar10_config.yaml file. | |||
| # Set "is_distributed=0" on cifar10_config.yaml file. | |||
| # Set "dataset='cifar10'" on cifar10_config.yaml file. | |||
| # Set other parameters on cifar10_config.yaml file you need. | |||
| # b. Add "enable_modelarts=True" on the website UI interface. | |||
| # Add "data_dir=/cache/data/cifar10" on the website UI interface. | |||
| # Add "is_distributed=0" on the website UI interface. | |||
| # Add "dataset=cifar10" on the website UI interface. | |||
| # Add other parameters on the website UI interface. | |||
| # (3) Upload a zip dataset to S3 bucket. (you could also upload the original dataset, but that can be very slow.) | |||
| # (4) Set the code directory to "/path/vgg16" on the website UI interface. | |||
| # (5) Set the startup file to "train.py" on the website UI interface. | |||
| # (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface. | |||
| # (7) Create your job. | |||
| # | |||
| # Train Cifar10 8p on ModelArts | |||
| # (1) Add "config_path=/path_to_code/cifar10_config.yaml" on the website UI interface. | |||
| # (2) Perform a or b. | |||
| # a. Set "enable_modelarts=True" on cifar10_config.yaml file. | |||
| # Set "data_dir='/cache/data/cifar10'" on cifar10_config.yaml file. | |||
| # Set "is_distributed=1" on cifar10_config.yaml file. | |||
| # Set "dataset='cifar10'" on cifar10_config.yaml file. | |||
| # Set other parameters on cifar10_config.yaml file you need. | |||
| # b. Add "enable_modelarts=True" on the website UI interface. | |||
| # Add "data_dir=/cache/data/cifar10" on the website UI interface. | |||
| # Add "is_distributed=1" on the website UI interface. | |||
| # Add "dataset=cifar10" on the website UI interface. | |||
| # Add other parameters on the website UI interface. | |||
| # (3) Upload a zip dataset to S3 bucket. (you could also upload the original dataset, but that can be very slow.) | |||
| # (4) Set the code directory to "/path/vgg16" on the website UI interface. | |||
| # (5) Set the startup file to "train.py" on the website UI interface. | |||
| # (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface. | |||
| # (7) Create your job. | |||
| # | |||
| # Train Imagenet 8p on ModelArts | |||
| # (1) Add "config_path=/path_to_code/imagenet2012_config.yaml" on the website UI interface. | |||
| # (2) Perform a or b. | |||
| # a. Set "enable_modelarts=True" on imagenet2012_config.yaml file. | |||
| # Set "data_dir='/cache/data/ImageNet/train'" on imagenet2012_config.yaml file. | |||
| # Set "is_distributed=1" on imagenet2012_config.yaml file. | |||
| # Set "dataset='imagenet2012'" on imagenet2012_config.yaml file. | |||
| # Set other parameters on imagenet2012_config.yaml file you need. | |||
| # b. Add "enable_modelarts=True" on the website UI interface. | |||
| # Add "data_dir=/cache/data/ImageNet/train" on the website UI interface. | |||
| # Add "is_distributed=1" on the website UI interface. | |||
| # Add "dataset=imagenet2012" on the website UI interface. | |||
| # Add other parameters on the website UI interface. | |||
| # (3) Upload a zip dataset to S3 bucket. (you could also upload the original dataset, but that can be very slow.) | |||
| # (4) Set the code directory to "/path/vgg16" on the website UI interface. | |||
| # (5) Set the startup file to "train.py" on the website UI interface. | |||
| # (6) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface. | |||
| # (7) Create your job. | |||
| # | |||
| # Eval Cifar10 1p on ModelArts | |||
| # (1) Add "config_path=/path_to_code/cifar10_config.yaml" on the website UI interface. | |||
| # (2) Perform a or b. | |||
| # a. Set "enable_modelarts=True" on cifar10_config.yaml file. | |||
| # Set "data_dir='/cache/data/cifar10'" on cifar10_config.yaml file. | |||
| # Set "dataset='cifar10'" on cifar10_config.yaml file. | |||
| # Set "checkpoint_url='s3://dir_to_your_trained_model/'" on cifar10_config.yaml file. | |||
| # Set "pre_trained='/cache/checkpoint_path/model.ckpt'" on cifar10_config.yaml file. | |||
| # Set other parameters on cifar10_config.yaml file you need. | |||
| # b. Add "enable_modelarts=True" on the website UI interface. | |||
| # Add "data_dir=/cache/data/cifar10" on the website UI interface. | |||
| # Add "dataset=cifar10" on the website UI interface. | |||
| # Add "checkpoint_url=s3://dir_to_your_trained_model/" on the website UI interface. | |||
| # Add "pre_trained=/cache/checkpoint_path/model.ckpt" on the website UI interface. | |||
| # Add other parameters on the website UI interface. | |||
| # (3) Upload or copy your pretrained model to S3 bucket. | |||
| # (4) Upload a zip dataset to S3 bucket. (you could also upload the original dataset, but that can be very slow.) | |||
| # (5) Set the code directory to "/path/vgg16" on the website UI interface. | |||
| # (6) Set the startup file to "eval.py" on the website UI interface. | |||
| # (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface. | |||
| # (8) Create your job. | |||
| # | |||
| # Eval ImageNet 1p on ModelArts | |||
| # (1) Add "config_path=/path_to_code/imagenet2012_config.yaml" on the website UI interface. | |||
| # (2) Perform a or b. | |||
| # a. Set "enable_modelarts=True" on imagenet2012_config.yaml file. | |||
| # Set "data_dir='/cache/data/ImageNet/validation_preprocess'" on imagenet2012_config.yaml file. | |||
| # Set "dataset='imagenet2012'" on imagenet2012_config.yaml file. | |||
| # Set "checkpoint_url='s3://dir_to_your_trained_model/'" on imagenet2012_config.yaml file. | |||
| # Set "pre_trained='/cache/checkpoint_path/model.ckpt'" on imagenet2012_config.yaml file. | |||
| # Set other parameters on imagenet2012_config.yaml file you need. | |||
| # b. Add "enable_modelarts=True" on the website UI interface. | |||
| # Add "data_dir=/cache/data/ImageNet/validation_preprocess" on the website UI interface. | |||
| # Add "dataset=imagenet2012" on the website UI interface. | |||
| # Add "checkpoint_url=s3://dir_to_your_trained_model/" on the website UI interface. | |||
| # Add "pre_trained=/cache/checkpoint_path/model.ckpt" on the website UI interface. | |||
| # Add other parameters on the website UI interface. | |||
| # (3) Upload or copy your pretrained model to S3 bucket. | |||
| # (4) Upload a zip dataset to S3 bucket. (you could also upload the original dataset, but that can be very slow.) | |||
| # (5) Set the code directory to "/path/vgg16" on the website UI interface. | |||
| # (6) Set the startup file to "eval.py" on the website UI interface. | |||
| # (7) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface. | |||
| # (8) Create your job. | |||
| ``` | |||
| ## [Script Description](#contents) | |||
| @@ -140,17 +245,25 @@ python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_ | |||
| ├── model_zoo | |||
| ├── README.md // descriptions about all the models | |||
| ├── vgg16 | |||
| ├── README.md // descriptions about googlenet | |||
| ├── README.md // descriptions about vgg | |||
| ├── README_CN.md // descriptions about vgg in Chinese | |||
| ├── model_utils | |||
| │ ├── __init__.py // init file | |||
| │ ├── config.py // Parse arguments | |||
| │ ├── device_adapter.py // Device adapter for ModelArts | |||
| │ ├── local_adapter.py // Local adapter | |||
| │ ├── moxing_adapter.py // Moxing adapter for ModelArts | |||
| ├── scripts | |||
| │ ├── run_distribute_train.sh // shell script for distributed training on Ascend | |||
| │ ├── run_distribute_train_gpu.sh // shell script for distributed training on GPU | |||
| │ ├── run_eval.sh // shell script for eval on Ascend | |||
| │ ├── run_infer_310.sh // shell script for infer on Ascend 310 | |||
| ├── src | |||
| │ ├── utils | |||
| │ │ ├── logging.py // logging format setting | |||
| │ │ ├── sampler.py // create sampler for dataset | |||
| │ │ ├── util.py // util function | |||
| │ │ ├── var_init.py // network parameter init method | |||
| │ ├── config.py // parameter configuration | |||
| │ ├── crossentropy.py // loss calculation | |||
| │ ├── dataset.py // creating dataset | |||
| │ ├── linear_warmup.py // linear learning rate | |||
| @@ -159,6 +272,11 @@ python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_ | |||
| │ ├──vgg.py // vgg architecture | |||
| ├── train.py // training script | |||
| ├── eval.py // evaluation script | |||
| ├── postprocess.py // postprocess script | |||
| ├── preprocess.py // preprocess script | |||
| ├── mindspore_hub_conf.py // mindspore_hub_conf script | |||
| ├── cifar10_config.yaml // Configurations for cifar10 | |||
| ├── imagenet2012_config.yaml // Configurations for imagenet2012 | |||
| ``` | |||
| ### [Script Parameters](#contents) | |||
| @@ -166,17 +284,18 @@ python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_ | |||
| #### Training | |||
| ```bash | |||
| usage: train.py [--device_target TARGET][--data_path DATA_PATH] | |||
| usage: train.py [--config_path YAML_CONFIG_PATH] | |||
| [--device_target TARGET][--data_dir DATA_PATH] | |||
| [--dataset DATASET_TYPE][--is_distributed VALUE] | |||
| [--device_id DEVICE_ID][--pre_trained PRE_TRAINED] | |||
| [--pre_trained PRE_TRAINED] | |||
| [--ckpt_path CHECKPOINT_PATH][--ckpt_interval INTERVAL_STEP] | |||
| parameters/options: | |||
| --config_path the storage path of YAML_CONFIG_FILE | |||
| --device_target the training backend type, Ascend or GPU, default is Ascend. | |||
| --dataset the dataset type, cifar10 or imagenet2012. | |||
| --is_distributed the training mode: whether to run distributed training; value can be 0 or 1. | |||
| --data_path the storage path of dataset | |||
| --device_id the device which used to train model. | |||
| --data_dir the storage path of dataset | |||
| --pre_trained the pretrained checkpoint file path. | |||
| --ckpt_path the path to save checkpoint. | |||
| --ckpt_interval the epoch interval for saving checkpoint. | |||
| @@ -186,76 +305,76 @@ parameters/options: | |||
| #### Evaluation | |||
| ```bash | |||
| usage: eval.py [--device_target TARGET][--data_path DATA_PATH] | |||
| usage: eval.py [--config_path YAML_CONFIG_PATH] | |||
| [--device_target TARGET][--data_dir DATA_PATH] | |||
| [--dataset DATASET_TYPE][--pre_trained PRE_TRAINED] | |||
| [--device_id DEVICE_ID] | |||
| parameters/options: | |||
| --config_path the storage path of YAML_CONFIG_FILE | |||
| --device_target the evaluation backend type, Ascend or GPU, default is Ascend. | |||
| --dataset the dataset type, cifar10 or imagenet2012. | |||
| --data_path the storage path of dataset. | |||
| --device_id the device which used to evaluate model. | |||
| --data_dir the storage path of dataset. | |||
| --pre_trained the checkpoint file path used to evaluate model. | |||
| ``` | |||
| ### [Parameter configuration](#contents) | |||
| Parameters for both training and evaluation can be set in config.py. | |||
| Parameters for both training and evaluation can be set in cifar10_config.yaml/imagenet2012_config.yaml. | |||
| - config for vgg16, CIFAR-10 dataset | |||
| ```bash | |||
| "num_classes": 10, # dataset class num | |||
| "lr": 0.01, # learning rate | |||
| "lr_init": 0.01, # initial learning rate | |||
| "lr_max": 0.1, # max learning rate | |||
| "lr_epochs": '30,60,90,120', # lr changing based epochs | |||
| "lr_scheduler": "step", # learning rate mode | |||
| "warmup_epochs": 5, # number of warmup epoch | |||
| "batch_size": 64, # batch size of input tensor | |||
| "max_epoch": 70, # only valid for taining, which is always 1 for inference | |||
| "momentum": 0.9, # momentum | |||
| "weight_decay": 5e-4, # weight decay | |||
| "loss_scale": 1.0, # loss scale | |||
| "label_smooth": 0, # label smooth | |||
| "label_smooth_factor": 0, # label smooth factor | |||
| "buffer_size": 10, # shuffle buffer size | |||
| "image_size": '224,224', # image size | |||
| "pad_mode": 'same', # pad mode for conv2d | |||
| "padding": 0, # padding value for conv2d | |||
| "has_bias": False, # whether has bias in conv2d | |||
| "batch_norm": True, # whether has batch_norm in conv2d | |||
| "keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint | |||
| "initialize_mode": "XavierUniform", # conv2d init mode | |||
| "has_dropout": True # whether using Dropout layer | |||
| num_classes: 10 # dataset class num | |||
| lr: 0.01 # learning rate | |||
| lr_init: 0.01 # initial learning rate | |||
| lr_max: 0.1 # max learning rate | |||
| lr_epochs: '30,60,90,120' # lr changing based epochs | |||
| lr_scheduler: "step" # learning rate mode | |||
| warmup_epochs: 5 # number of warmup epoch | |||
| batch_size: 64 # batch size of input tensor | |||
| max_epoch: 70 # only valid for training, which is always 1 for inference | |||
| momentum: 0.9 # momentum | |||
| weight_decay: 0.0005 # weight decay | |||
| loss_scale: 1.0 # loss scale | |||
| label_smooth: 0 # label smooth | |||
| label_smooth_factor: 0 # label smooth factor | |||
| buffer_size: 10 # shuffle buffer size | |||
| image_size: '224,224' # image size | |||
| pad_mode: 'same' # pad mode for conv2d | |||
| padding: 0 # padding value for conv2d | |||
| has_bias: False # whether has bias in conv2d | |||
| batch_norm: True # whether has batch_norm in conv2d | |||
| keep_checkpoint_max: 10 # only keep the last keep_checkpoint_max checkpoint | |||
| initialize_mode: "XavierUniform" # conv2d init mode | |||
| has_dropout: True # whether using Dropout layer | |||
| ``` | |||
| - config for vgg16, ImageNet2012 dataset | |||
| ```bash | |||
| "num_classes": 1000, # dataset class num | |||
| "lr": 0.01, # learning rate | |||
| "lr_init": 0.01, # initial learning rate | |||
| "lr_max": 0.1, # max learning rate | |||
| "lr_epochs": '30,60,90,120', # lr changing based epochs | |||
| "lr_scheduler": "cosine_annealing", # learning rate mode | |||
| "warmup_epochs": 0, # number of warmup epoch | |||
| "batch_size": 32, # batch size of input tensor | |||
| "max_epoch": 150, # only valid for taining, which is always 1 for inference | |||
| "momentum": 0.9, # momentum | |||
| "weight_decay": 1e-4, # weight decay | |||
| "loss_scale": 1024, # loss scale | |||
| "label_smooth": 1, # label smooth | |||
| "label_smooth_factor": 0.1, # label smooth factor | |||
| "buffer_size": 10, # shuffle buffer size | |||
| "image_size": '224,224', # image size | |||
| "pad_mode": 'pad', # pad mode for conv2d | |||
| "padding": 1, # padding value for conv2d | |||
| "has_bias": True, # whether has bias in conv2d | |||
| "batch_norm": False, # whether has batch_norm in conv2d | |||
| "keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint | |||
| "initialize_mode": "KaimingNormal", # conv2d init mode | |||
| "has_dropout": True # whether using Dropout layer | |||
| num_classes: 1000 # dataset class num | |||
| lr: 0.01 # learning rate | |||
| lr_init: 0.01 # initial learning rate | |||
| lr_max: 0.1 # max learning rate | |||
| lr_epochs: '30,60,90,120' # lr changing based epochs | |||
| lr_scheduler: "cosine_annealing" # learning rate mode | |||
| warmup_epochs: 0 # number of warmup epoch | |||
| batch_size: 32 # batch size of input tensor | |||
| max_epoch: 150 # only valid for training, which is always 1 for inference | |||
| momentum: 0.9 # momentum | |||
| weight_decay: 0.0001 # weight decay | |||
| loss_scale: 1024 # loss scale | |||
| label_smooth: 1 # label smooth | |||
| label_smooth_factor: 0.1 # label smooth factor | |||
| buffer_size: 10 # shuffle buffer size | |||
| image_size: '224,224' # image size | |||
| pad_mode: 'pad' # pad mode for conv2d | |||
| padding: 1 # padding value for conv2d | |||
| has_bias: True # whether has bias in conv2d | |||
| batch_norm: False # whether has batch_norm in conv2d | |||
| keep_checkpoint_max: 10 # only keep the last keep_checkpoint_max checkpoint | |||
| initialize_mode: "KaimingNormal" # conv2d init mode | |||
| has_dropout: True # whether using Dropout layer | |||
| ``` | |||
| ### [Training Process](#contents) | |||
| @@ -267,7 +386,7 @@ Parameters for both training and evaluation can be set in config.py. | |||
| - Training using single device(1p), using CIFAR-10 dataset in default | |||
| ```bash | |||
| python train.py --data_path=your_data_path --device_id=6 > out.train.log 2>&1 & | |||
| python train.py --config_path=/dir_to_code/cifar10_config.yaml --data_dir=your_data_path > out.train.log 2>&1 & | |||
| ``` | |||
| The python command above will run in the background, you can view the results through the file `out.train.log`. | |||
| @@ -312,7 +431,7 @@ train_parallel1/log:epcoh: 2 step: 97, loss is 1.7133579 | |||
| - Training using single device(1p) | |||
| ```bash | |||
| python train.py --device_target="GPU" --dataset="imagenet2012" --is_distributed=0 --data_path=$DATA_PATH > output.train.log 2>&1 & | |||
| python train.py --config_path=/dir_to_code/imagenet2012_config.yaml --device_target="GPU" --dataset="imagenet2012" --is_distributed=0 --data_dir=$DATA_PATH > output.train.log 2>&1 & | |||
| ``` | |||
| - Distributed Training | |||
| @@ -330,10 +449,10 @@ bash scripts/run_distribute_train_gpu.sh /path/ImageNet2012/train" | |||
| ```bash | |||
| # when using cifar10 dataset | |||
| python eval.py --data_path=your_data_path --dataset="cifar10" --device_target="Ascend" --pre_trained=./*-70-781.ckpt > output.eval.log 2>&1 & | |||
| python eval.py --config_path=/dir_to_code/cifar10_config.yaml --data_dir=your_data_path --dataset="cifar10" --device_target="Ascend" --pre_trained=./*-70-781.ckpt > output.eval.log 2>&1 & | |||
| # when using imagenet2012 dataset | |||
| python eval.py --data_path=your_data_path --dataset="imagenet2012" --device_target="GPU" --pre_trained=./*-150-5004.ckpt > output.eval.log 2>&1 & | |||
| python eval.py --config_path=/dir_to_code/imagenet2012_config.yaml --data_dir=your_data_path --dataset="imagenet2012" --device_target="GPU" --pre_trained=./*-150-5004.ckpt > output.eval.log 2>&1 & | |||
| ``` | |||
| - The above python command will run in the background, you can view the results through the file `output.eval.log`. You will get the accuracy as following: | |||
| @@ -353,7 +472,7 @@ after allreduce eval: top5_correct=45582, tot=50000, acc=91.16% | |||
| ### [Export MindIR](#contents) | |||
| ```shell | |||
| python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT] | |||
| python export.py --config_path [YAML_CONFIG_PATH] --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT] | |||
| ``` | |||
| The ckpt_file parameter is required, | |||
| @@ -109,13 +109,13 @@ VGG 16网络主要由几个基本模块(包括卷积层和池化层)和三 | |||
| ```python | |||
| # 训练示例 | |||
| python train.py --data_path=[DATA_PATH] --device_id=[DEVICE_ID] --dataset=[DATASET_TYPE] > output.train.log 2>&1 & | |||
| python train.py --config_path=[YAML_CONFIG_PATH] --data_dir=[DATA_PATH] --dataset=[DATASET_TYPE] > output.train.log 2>&1 & | |||
| # 分布式训练示例 | |||
| sh run_distribute_train.sh [RANL_TABLE_JSON] [DATA_PATH] --dataset=[DATASET_TYPE] | |||
| sh scripts/run_distribute_train.sh [RANK_TABLE_JSON] [DATA_PATH] --dataset=[DATASET_TYPE] | |||
| # 评估示例 | |||
| python eval.py --data_path=[DATA_PATH] --pre_trained=[PRE_TRAINED] --dataset=[DATASET_TYPE] > output.eval.log 2>&1 & | |||
| python eval.py --config_path=[YAML_CONFIG_PATH] --data_dir=[DATA_PATH] --pre_trained=[PRE_TRAINED] --dataset=[DATASET_TYPE] > output.eval.log 2>&1 & | |||
| ``` | |||
| 分布式训练需要提前创建JSON格式的HCCL配置文件。 | |||
| @@ -126,13 +126,118 @@ python eval.py --data_path=[DATA_PATH] --pre_trained=[PRE_TRAINED] --dataset=[D | |||
| ```python | |||
| # 训练示例 | |||
| python train.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_TYPE] --data_path=[DATA_PATH] > output.train.log 2>&1 & | |||
| python train.py --config_path=[YAML_CONFIG_PATH] --device_target="GPU" --dataset=[DATASET_TYPE] --data_dir=[DATA_PATH] > output.train.log 2>&1 & | |||
| # 分布式训练示例 | |||
| sh run_distribute_train_gpu.sh [DATA_PATH] --dataset=[DATASET_TYPE] | |||
| sh scripts/run_distribute_train_gpu.sh [DATA_PATH] --dataset=[DATASET_TYPE] | |||
| # 评估示例 | |||
| python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_TYPE] --data_path=[DATA_PATH] --pre_trained=[PRE_TRAINED] > output.eval.log 2>&1 & | |||
| python eval.py --config_path=[YAML_CONFIG_PATH] --device_target="GPU" --dataset=[DATASET_TYPE] --data_dir=[DATA_PATH] --pre_trained=[PRE_TRAINED] > output.eval.log 2>&1 & | |||
| ``` | |||
| - 在 ModelArts 进行训练 (如果你想在modelarts上运行,可以参考以下文档 [modelarts](https://support.huaweicloud.com/modelarts/)) | |||
| ```bash | |||
| # 在 ModelArts 上使用 单卡训练 cifar10 数据集 | |||
| # (1) 在网页上设置 "config_path=/path_to_code/cifar10_config.yaml" | |||
| # (2) 执行a或者b | |||
| # a. 在 cifar10_config.yaml 文件中设置 "enable_modelarts=True" | |||
| # 在 cifar10_config.yaml 文件中设置 "data_dir='/cache/data/cifar10'" | |||
| # 在 cifar10_config.yaml 文件中设置 "is_distributed=0" | |||
| # 在 cifar10_config.yaml 文件中设置 "dataset='cifar10'" | |||
| # 在 cifar10_config.yaml 文件中设置 其他参数 | |||
| # b. 在网页上设置 "enable_modelarts=True" | |||
| # 在网页上设置 "data_dir=/cache/data/cifar10" | |||
| # 在网页上设置 "is_distributed=0" | |||
| # 在网页上设置 "dataset=cifar10" | |||
| # 在网页上设置 其他参数 | |||
| # (3) 上传你的压缩数据集到 S3 桶上 (你也可以上传原始的数据集,但那可能会很慢。) | |||
| # (4) 在网页上设置你的代码路径为 "/path/vgg16" | |||
| # (5) 在网页上设置启动文件为 "train.py" | |||
| # (6) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等 | |||
| # (7) 创建训练作业 | |||
| # | |||
| # 在 ModelArts 上使用8卡训练 cifar10 数据集 | |||
| # (1) 在网页上设置 "config_path=/path_to_code/cifar10_config.yaml" | |||
| # (2) 执行a或者b | |||
| # a. 在 cifar10_config.yaml 文件中设置 "enable_modelarts=True" | |||
| # 在 cifar10_config.yaml 文件中设置 "data_dir='/cache/data/cifar10'" | |||
| # 在 cifar10_config.yaml 文件中设置 "is_distributed=1" | |||
| # 在 cifar10_config.yaml 文件中设置 "dataset='cifar10'" | |||
| # 在 cifar10_config.yaml 文件中设置 其他参数 | |||
| # b. 在网页上设置 "enable_modelarts=True" | |||
| # 在网页上设置 "data_dir=/cache/data/cifar10" | |||
| # 在网页上设置 "is_distributed=1" | |||
| # 在网页上设置 "dataset=cifar10" | |||
| # 在网页上设置 其他参数 | |||
| # (3) 上传你的压缩数据集到 S3 桶上 (你也可以上传原始的数据集,但那可能会很慢。) | |||
| # (4) 在网页上设置你的代码路径为 "/path/vgg16" | |||
| # (5) 在网页上设置启动文件为 "train.py" | |||
| # (6) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等 | |||
| # (7) 创建训练作业 | |||
| # | |||
| # 在 ModelArts 上使用8卡训练 ImageNet 数据集 | |||
| # (1) 在网页上设置 "config_path=/path_to_code/imagenet2012_config.yaml" | |||
| # (2) 执行a或者b | |||
| # a. 在 imagenet2012_config.yaml 文件中设置 "enable_modelarts=True" | |||
| # 在 imagenet2012_config.yaml 文件中设置 "data_dir='/cache/data/ImageNet/train'" | |||
| # 在 imagenet2012_config.yaml 文件中设置 "is_distributed=1" | |||
| # 在 imagenet2012_config.yaml 文件中设置 "dataset='imagenet2012'" | |||
| # 在 imagenet2012_config.yaml 文件中设置 其他参数 | |||
| # b. 在网页上设置 "enable_modelarts=True" | |||
| # 在网页上设置 "data_dir=/cache/data/ImageNet/train" | |||
| # 在网页上设置 "is_distributed=1" | |||
| # 在网页上设置 "dataset=imagenet2012" | |||
| # 在网页上设置 其他参数 | |||
| # (3) 上传你的压缩数据集到 S3 桶上 (你也可以上传原始的数据集,但那可能会很慢。) | |||
| # (4) 在网页上设置你的代码路径为 "/path/vgg16" | |||
| # (5) 在网页上设置启动文件为 "train.py" | |||
| # (6) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等 | |||
| # (7) 创建训练作业 | |||
| # | |||
| # 在 ModelArts 上使用 单卡验证 Cifar10 数据集 | |||
| # (1) 在网页上设置 "config_path=/path_to_code/cifar10_config.yaml" | |||
| # (2) 执行a或者b | |||
| # a. 在 cifar10_config.yaml 文件中设置 "enable_modelarts=True" | |||
| # 在 cifar10_config.yaml 文件中设置 "data_dir='/cache/data/cifar10'" | |||
| # 在 cifar10_config.yaml 文件中设置 "dataset='cifar10'" | |||
| # 在 cifar10_config.yaml 文件中设置 "checkpoint_url='s3://dir_to_your_trained_model/'" | |||
| # 在 cifar10_config.yaml 文件中设置 "pre_trained='/cache/checkpoint_path/model.ckpt'" | |||
| # 在 cifar10_config.yaml 文件中设置 其他参数 | |||
| # b. 在网页上设置 "enable_modelarts=True" | |||
| # 在网页上设置 "data_dir=/cache/data/cifar10" | |||
| # 在网页上设置 "dataset=cifar10" | |||
| # 在网页上设置 "checkpoint_url=s3://dir_to_your_trained_model/" | |||
| # 在网页上设置 "pre_trained=/cache/checkpoint_path/model.ckpt" | |||
| # 在网页上设置 其他参数 | |||
| # (3) 上传你的预训练模型到 S3 桶上 | |||
| # (4) 上传你的压缩数据集到 S3 桶上 (你也可以上传原始的数据集,但那可能会很慢。) | |||
| # (5) 在网页上设置你的代码路径为 "/path/vgg16" | |||
| # (6) 在网页上设置启动文件为 "eval.py" | |||
| # (7) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等 | |||
| # (8) 创建训练作业 | |||
| # | |||
| # 在 ModelArts 上使用 单卡验证 ImageNet 数据集 | |||
| # (1) 在网页上设置 "config_path=/path_to_code/imagenet2012_config.yaml" | |||
| # (2) 执行a或者b | |||
| # a. 在 imagenet2012_config.yaml 文件中设置 "enable_modelarts=True" | |||
| # 在 imagenet2012_config.yaml 文件中设置 "data_dir='/cache/data/ImageNet/validation_preprocess'" | |||
| # 在 imagenet2012_config.yaml 文件中设置 "dataset='imagenet2012'" | |||
| # 在 imagenet2012_config.yaml 文件中设置 "checkpoint_url='s3://dir_to_your_trained_model/'" | |||
| # 在 imagenet2012_config.yaml 文件中设置 "pre_trained='/cache/checkpoint_path/model.ckpt'" | |||
| # 在 imagenet2012_config.yaml 文件中设置 其他参数 | |||
| # b. 在网页上设置 "enable_modelarts=True" | |||
| # 在网页上设置 "data_dir=/cache/data/ImageNet/validation_preprocess" | |||
| # 在网页上设置 "dataset=imagenet2012" | |||
| # 在网页上设置 "checkpoint_url=s3://dir_to_your_trained_model/" | |||
| # 在网页上设置 "pre_trained=/cache/checkpoint_path/model.ckpt" | |||
| # 在网页上设置 其他参数 | |||
| # (3) 上传你的预训练模型到 S3 桶上 | |||
| # (4) 上传你的压缩数据集到 S3 桶上 (你也可以上传原始的数据集,但那可能会很慢。) | |||
| # (5) 在网页上设置你的代码路径为 "/path/vgg16" | |||
| # (6) 在网页上设置启动文件为 "eval.py" | |||
| # (7) 在网页上设置"训练数据集"、"训练输出文件路径"、"作业日志路径"等 | |||
| # (8) 创建训练作业 | |||
| ``` | |||
| ## 脚本说明 | |||
| @@ -143,17 +248,25 @@ python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_ | |||
| ├── model_zoo | |||
| ├── README.md // 所有模型相关说明 | |||
| ├── vgg16 | |||
| ├── README.md // GoogLeNet相关说明 | |||
| ├── README.md // VGG 相关说明 | |||
| ├── README_CN.md // VGG 相关中文说明 | |||
| ├── model_utils | |||
| ├── __init__.py // 初始化文件 | |||
| ├── config.py // 参数配置 | |||
| ├── device_adapter.py // ModelArts的设备适配器 | |||
| ├── local_adapter.py // 本地适配器 | |||
| └── moxing_adapter.py // ModelArts的模型适配器 | |||
| ├── scripts | |||
| │ ├── run_distribute_train.sh // Ascend分布式训练shell脚本 | |||
| │ ├── run_distribute_train_gpu.sh // GPU分布式训练shell脚本 | |||
| │ ├── run_distribute_train.sh // Ascend 分布式训练shell脚本 | |||
| │ ├── run_distribute_train_gpu.sh // GPU 分布式训练shell脚本 | |||
| │ ├── run_eval.sh // Ascend 验证shell脚本 | |||
| │ ├── run_infer_310.sh // Ascend310 推理shell脚本 | |||
| ├── src | |||
| │ ├── utils | |||
| │ │ ├── logging.py // 日志格式设置 | |||
| │ │ ├── sampler.py // 为数据集创建采样器 | |||
| │ │ ├── util.py // 工具函数 | |||
| │ │ ├── var_init.py // 网络参数init方法 | |||
| │ ├── config.py // 参数配置 | |||
| │ ├── crossentropy.py // 损失计算 | |||
| │ ├── dataset.py // 创建数据集 | |||
| │ ├── linear_warmup.py // 线性学习率 | |||
| @@ -162,6 +275,11 @@ python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_ | |||
| │ ├──vgg.py // VGG架构 | |||
| ├── train.py // 训练脚本 | |||
| ├── eval.py // 评估脚本 | |||
| ├── postprocess.py // 后处理脚本 | |||
| ├── preprocess.py // 预处理脚本 | |||
| ├── mindspore_hub_conf.py // mindspore hub 脚本 | |||
| ├── cifar10_config.yaml // cifar10 配置文件 | |||
| ├── imagenet2012_config.yaml // imagenet2012 配置文件 | |||
| ``` | |||
| ### 脚本参数 | |||
| @@ -169,17 +287,18 @@ python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_ | |||
| #### 训练 | |||
| ```bash | |||
| 用法:train.py [--device_target TARGET][--data_path DATA_PATH] | |||
| [--dataset DATASET_TYPE][--is_distributed VALUE] | |||
| [--device_id DEVICE_ID][--pre_trained PRE_TRAINED] | |||
| [--ckpt_path CHECKPOINT_PATH][--ckpt_interval INTERVAL_STEP] | |||
| 用法:train.py [--config_path YAML_CONFIG_PATH] | |||
| [--device_target TARGET][--data_dir DATA_PATH] | |||
| [--dataset DATASET_TYPE][--is_distributed VALUE] | |||
| [--pre_trained PRE_TRAINED] | |||
| [--ckpt_path CHECKPOINT_PATH][--ckpt_interval INTERVAL_STEP] | |||
| 选项: | |||
| --config_path yaml配置文件路径 | |||
| --device_target 训练后端类型,Ascend或GPU,默认为Ascend。 | |||
| --dataset 数据集类型,cifar10或imagenet2012。 | |||
| --is_distributed 训练方式,是否为分布式训练,值可以是0或1。 | |||
| --data_path 数据集存储路径 | |||
| --device_id 用于训练模型的设备。 | |||
| --data_dir 数据集存储路径 | |||
| --pre_trained 预训练检查点文件路径。 | |||
| --ckpt_path 存放检查点的路径。 | |||
| --ckpt_interval 保存检查点的轮次间隔。 | |||
| @@ -189,76 +308,76 @@ python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_ | |||
| #### 评估 | |||
| ```bash | |||
| 用法:eval.py [--device_target TARGET][--data_path DATA_PATH] | |||
| [--dataset DATASET_TYPE][--pre_trained PRE_TRAINED] | |||
| [--device_id DEVICE_ID] | |||
| 用法:eval.py [--config_path YAML_CONFIG_PATH] | |||
| [--device_target TARGET][--data_dir DATA_PATH] | |||
| [--dataset DATASET_TYPE][--pre_trained PRE_TRAINED] | |||
| 选项: | |||
| --config_path yaml配置文件路径 | |||
| --device_target 评估后端类型,Ascend或GPU,默认为Ascend。 | |||
| --dataset 数据集类型,cifar10或imagenet2012。 | |||
| --data_path 数据集存储路径。 | |||
| --device_id 用于评估模型的设备。 | |||
| --data_dir 数据集存储路径。 | |||
| --pre_trained 用于评估模型的检查点文件路径。 | |||
| ``` | |||
| ### 参数配置 | |||
| 在config.py中可以同时配置训练参数和评估参数。 | |||
| 在 cifar10_config.yaml/imagenet2012_config.yaml 中可以同时配置训练参数和评估参数。 | |||
| - 配置VGG16,CIFAR-10数据集 | |||
| ```bash | |||
| "num_classes": 10, # 数据集类数 | |||
| "lr": 0.01, # 学习率 | |||
| "lr_init": 0.01, # 初始学习率 | |||
| "lr_max": 0.1, # 最大学习率 | |||
| "lr_epochs": '30,60,90,120', # 基于变化lr的轮次 | |||
| "lr_scheduler": "step", # 学习率模式 | |||
| "warmup_epochs": 5, # 热身轮次数 | |||
| "batch_size": 64, # 输入张量批次大小 | |||
| "max_epoch": 70, # 只对训练有效,推理固定值为1 | |||
| "momentum": 0.9, # 动量 | |||
| "weight_decay": 5e-4, # 权重衰减 | |||
| "loss_scale": 1.0, # 损失放大 | |||
| "label_smooth": 0, # 标签平滑 | |||
| "label_smooth_factor": 0, # 标签平滑因子 | |||
| "buffer_size": 10, # 混洗缓冲区大小 | |||
| "image_size": '224,224', # 图像大小 | |||
| "pad_mode": 'same', # conv2d的填充方式 | |||
| "padding": 0, # conv2d的填充值 | |||
| "has_bias": False, # conv2d是否有偏差 | |||
| "batch_norm": True, # 在conv2d中是否有batch_norm | |||
| "keep_checkpoint_max": 10, # 只保留最后一个keep_checkpoint_max检查点 | |||
| "initialize_mode": "XavierUniform", # conv2d init模式 | |||
| "has_dropout": True # 是否使用Dropout层 | |||
| num_classes: 10 # 数据集类数 | |||
| lr: 0.01 # 学习率 | |||
| lr_init: 0.01 # 初始学习率 | |||
| lr_max: 0.1 # 最大学习率 | |||
| lr_epochs: '30,60,90,120' # 基于变化lr的轮次 | |||
| lr_scheduler: "step" # 学习率模式 | |||
| warmup_epochs: 5 # 热身轮次数 | |||
| batch_size: 64 # 输入张量批次大小 | |||
| max_epoch: 70 # 只对训练有效,推理固定值为1 | |||
| momentum: 0.9 # 动量 | |||
| weight_decay: 5e-4 # 权重衰减 | |||
| loss_scale: 1.0 # 损失放大 | |||
| label_smooth: 0 # 标签平滑 | |||
| label_smooth_factor: 0 # 标签平滑因子 | |||
| buffer_size: 10 # 混洗缓冲区大小 | |||
| image_size: '224,224' # 图像大小 | |||
| pad_mode: 'same' # conv2d的填充方式 | |||
| padding: 0 # conv2d的填充值 | |||
| has_bias: False # conv2d是否有偏差 | |||
| batch_norm: True # 在conv2d中是否有batch_norm | |||
| keep_checkpoint_max: 10 # 只保留最后一个keep_checkpoint_max检查点 | |||
| initialize_mode: "XavierUniform" # conv2d init模式 | |||
| has_dropout: True # 是否使用Dropout层 | |||
| ``` | |||
| - VGG16配置,ImageNet2012数据集 | |||
| ```bash | |||
| "num_classes": 1000, # 数据集类数 | |||
| "lr": 0.01, # 学习率 | |||
| "lr_init": 0.01, # 初始学习率 | |||
| "lr_max": 0.1, # 最大学习率 | |||
| "lr_epochs": '30,60,90,120', # 基于变化lr的轮次 | |||
| "lr_scheduler": "cosine_annealing", # 学习率模式 | |||
| "warmup_epochs": 0, # 热身轮次数 | |||
| "batch_size": 32, # 输入张量的批次大小 | |||
| "max_epoch": 150, # 只对训练有效,推理固定值为1 | |||
| "momentum": 0.9, # 动量 | |||
| "weight_decay": 1e-4, # 权重衰减 | |||
| "loss_scale": 1024, # 损失放大 | |||
| "label_smooth": 1, # 标签平滑 | |||
| "label_smooth_factor": 0.1, # 标签平滑因子 | |||
| "buffer_size": 10, # 混洗缓冲区大小 | |||
| "image_size": '224,224', # 图像大小 | |||
| "pad_mode": 'pad', # conv2d的填充方式 | |||
| "padding": 1, # conv2d的填充值 | |||
| "has_bias": True, # conv2d是否有偏差 | |||
| "batch_norm": False, # 在conv2d中是否有batch_norm | |||
| "keep_checkpoint_max": 10, # 只保留最后一个keep_checkpoint_max检查点 | |||
| "initialize_mode": "KaimingNormal", # conv2d init模式 | |||
| "has_dropout": True # 是否使用Dropout层 | |||
| num_classes: 1000 # 数据集类数 | |||
| lr: 0.01 # 学习率 | |||
| lr_init: 0.01 # 初始学习率 | |||
| lr_max: 0.1 # 最大学习率 | |||
| lr_epochs: '30,60,90,120' # 基于变化lr的轮次 | |||
| lr_scheduler: "cosine_annealing" # 学习率模式 | |||
| warmup_epochs: 0 # 热身轮次数 | |||
| batch_size: 32 # 输入张量的批次大小 | |||
| max_epoch: 150 # 只对训练有效,推理固定值为1 | |||
| momentum: 0.9 # 动量 | |||
| weight_decay: 1e-4 # 权重衰减 | |||
| loss_scale: 1024 # 损失放大 | |||
| label_smooth: 1 # 标签平滑 | |||
| label_smooth_factor: 0.1 # 标签平滑因子 | |||
| buffer_size: 10 # 混洗缓冲区大小 | |||
| image_size: '224,224' # 图像大小 | |||
| pad_mode: 'pad' # conv2d的填充方式 | |||
| padding: 1 # conv2d的填充值 | |||
| has_bias: True # conv2d是否有偏差 | |||
| batch_norm: False # 在conv2d中是否有batch_norm | |||
| keep_checkpoint_max: 10 # 只保留最后一个keep_checkpoint_max检查点 | |||
| initialize_mode: "KaimingNormal" # conv2d init模式 | |||
| has_dropout: True # 是否使用Dropout层 | |||
| ``` | |||
| ### 训练过程 | |||
| @@ -270,7 +389,7 @@ python eval.py --device_target="GPU" --device_id=[DEVICE_ID] --dataset=[DATASET_ | |||
| - 使用单设备(1p)训练,默认使用CIFAR-10数据集 | |||
| ```bash | |||
| python train.py --data_path=your_data_path --device_id=6 > out.train.log 2>&1 & | |||
| python train.py --config_path=/dir_to_code/cifar10_config.yaml --data_dir=your_data_path > out.train.log 2>&1 & | |||
| ``` | |||
| 上述python命令在后台运行,可通过`out.train.log`文件查看结果。 | |||
| @@ -289,7 +408,7 @@ epcoh: 2 step: 781, loss is 1.827582 | |||
| - 分布式训练 | |||
| ```bash | |||
| sh run_distribute_train.sh rank_table.json your_data_path | |||
| sh scripts/run_distribute_train.sh rank_table.json your_data_path | |||
| ``` | |||
| 上述shell脚本会在后台进行分布式训练,可通过`train_parallel[X]/log`文件查看结果。 | |||
| @@ -316,7 +435,7 @@ train_parallel1/log:epcoh: 2 step: 97, loss is 1.7133579 | |||
| - 单设备训练(1p) | |||
| ```bash | |||
| python train.py --device_target="GPU" --dataset="imagenet2012" --is_distributed=0 --data_path=$DATA_PATH > output.train.log 2>&1 & | |||
| python train.py --config_path=/dir_to_code/imagenet2012_config.yaml --device_target="GPU" --dataset="imagenet2012" --is_distributed=0 --data_dir=$DATA_PATH > output.train.log 2>&1 & | |||
| ``` | |||
| - 分布式训练 | |||
| @@ -334,10 +453,10 @@ bash scripts/run_distribute_train_gpu.sh /path/ImageNet2012/train" | |||
| ```bash | |||
| # 使用CIFAR-10数据集 | |||
| python eval.py --data_path=your_data_path --dataset="cifar10" --device_target="Ascend" --pre_trained=./*-70-781.ckpt > output.eval.log 2>&1 & | |||
| python eval.py --config_path=/dir_to_code/cifar10_config.yaml --data_dir=your_data_path --dataset="cifar10" --device_target="Ascend" --pre_trained=./*-70-781.ckpt > output.eval.log 2>&1 & | |||
| # 使用ImageNet2012数据集 | |||
| python eval.py --data_path=your_data_path --dataset="imagenet2012" --device_target="GPU" --pre_trained=./*-150-5004.ckpt > output.eval.log 2>&1 & | |||
| python eval.py --config_path=/dir_to_code/imagenet2012_config.yaml --data_dir=your_data_path --dataset="imagenet2012" --device_target="GPU" --pre_trained=./*-150-5004.ckpt > output.eval.log 2>&1 & | |||
| ``` | |||
| - 上述python命令在后台运行,可通过`output.eval.log`文件查看结果。准确率如下: | |||
| @@ -357,7 +476,7 @@ after allreduce eval: top5_correct=45582, tot=50000, acc=91.16% | |||
| ### [导出MindIR](#contents) | |||
| ```shell | |||
| python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT] | |||
| python export.py --config_path [YAML_CONFIG_PATH] --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT] | |||
| ``` | |||
| 参数ckpt_file为必填项, | |||
| @@ -0,0 +1,105 @@ | |||
| # Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) | |||
| enable_modelarts: False | |||
| # Url for modelarts | |||
| data_url: "" | |||
| train_url: "" | |||
| checkpoint_url: "" | |||
| # Path for local | |||
| data_path: "/cache/data" | |||
| output_path: "/cache/train" | |||
| load_path: "/cache/checkpoint_path" | |||
| device_target: "Ascend" | |||
| need_modelarts_dataset_unzip: True | |||
| modelarts_dataset_unzip_name: "cifar10" | |||
| # ============================================================================== | |||
| # options | |||
| num_classes: 10 | |||
| lr: 0.01 | |||
| lr_init: 0.01 | |||
| lr_max: 0.1 | |||
| lr_epochs: '30,60,90,120' | |||
| lr_scheduler: "step" | |||
| warmup_epochs: 5 | |||
| batch_size: 64 | |||
| max_epoch: 70 | |||
| momentum: 0.9 | |||
| weight_decay: 0.0005 # 5e-4 | |||
| loss_scale: 1.0 | |||
| label_smooth: 0 | |||
| label_smooth_factor: 0 | |||
| buffer_size: 10 | |||
| image_size: '224,224' | |||
| pad_mode: 'same' | |||
| padding: 0 | |||
| has_bias: False | |||
| batch_norm: True | |||
| keep_checkpoint_max: 10 | |||
| initialize_mode: "XavierUniform" | |||
| has_dropout: False | |||
| # train options | |||
| dataset: "cifar10" | |||
| data_dir: "" | |||
| pre_trained: "" | |||
| lr_gamma: 0.1 | |||
| eta_min: 0.0 | |||
| T_max: 90 | |||
| log_interval: 100 | |||
| ckpt_path: "outputs/" | |||
| ckpt_interval: 5 | |||
| is_save_on_master: 1 | |||
| is_distributed: 0 | |||
| # eval options | |||
| per_batch_size: 32 | |||
| graph_ckpt: 1 | |||
| log_path: "outputs/" | |||
| # postprocess options | |||
| result_dir: "" | |||
| label_dir: "" | |||
| dataset_name: "cifar10" | |||
| # preprocess options | |||
| result_path: "./preprocess_Result/" | |||
| # export options | |||
| ckpt_file: "" | |||
| file_name: "vgg16" | |||
| file_format: "AIR" | |||
| --- | |||
| # Help description for each configuration | |||
| device_target: "device where the code will be implemented." | |||
| dataset: "choices in ['cifar10', 'imagenet2012']" | |||
| data_dir: "data dir" | |||
| pre_trained: "model_path, local pretrained model to load" | |||
| lr_gamma: "decrease lr by a factor of exponential lr_scheduler" | |||
| eta_min: "eta_min in cosine_annealing scheduler" | |||
| T_max: "T-max in cosine_annealing scheduler" | |||
| log_interval: "logging interval" | |||
| ckpt_path: "checkpoint save location" | |||
| ckpt_interval: "ckpt_interval" | |||
| is_save_on_master: "save ckpt on master or all rank" | |||
| is_distributed: "if multi device" | |||
| # eval options | |||
| per_batch_size: "batch size for per npu" | |||
| graph_ckpt: "graph ckpt or feed ckpt" | |||
| log_path: "path to save log" | |||
| # postprocess options | |||
| result_dir: "result files path." | |||
| label_dir: "image file path." | |||
| dataset_name: "choices in ['cifar10', 'imagenet2012']" | |||
| # preprocess options | |||
| result_path: "result path" | |||
| # export options | |||
| ckpt_file: "vgg16 ckpt file." | |||
| file_name: "vgg16 output file name." | |||
| file_format: "file format, choices in ['AIR', 'ONNX', 'MINDIR']" | |||
| @@ -15,13 +15,13 @@ | |||
| """Eval""" | |||
| import os | |||
| import time | |||
| import argparse | |||
| import datetime | |||
| import glob | |||
| import numpy as np | |||
| import mindspore.nn as nn | |||
| from mindspore import Tensor, context | |||
| from mindspore.communication.management import get_rank, get_group_size | |||
| from mindspore.nn.optim.momentum import Momentum | |||
| from mindspore.train.model import Model | |||
| from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||
| @@ -34,6 +34,9 @@ from src.vgg import vgg16 | |||
| from src.dataset import vgg_create_dataset | |||
| from src.dataset import classification_dataset | |||
| from model_utils.moxing_adapter import config | |||
| from model_utils.moxing_adapter import moxing_wrapper | |||
| from model_utils.device_adapter import get_device_id, get_rank_id, get_device_num | |||
| class ParameterReduce(nn.Cell): | |||
| """ParameterReduce""" | |||
| @@ -49,51 +52,6 @@ class ParameterReduce(nn.Cell): | |||
| return ret | |||
| def parse_args(cloud_args=None): | |||
| """parse_args""" | |||
| parser = argparse.ArgumentParser('mindspore classification test') | |||
| parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'], | |||
| help='device where the code will be implemented. (Default: Ascend)') | |||
| # dataset related | |||
| parser.add_argument('--dataset', type=str, choices=["cifar10", "imagenet2012"], default="cifar10") | |||
| parser.add_argument('--data_path', type=str, default='', help='eval data dir') | |||
| parser.add_argument('--per_batch_size', default=32, type=int, help='batch size for per npu') | |||
| # network related | |||
| parser.add_argument('--graph_ckpt', type=int, default=1, help='graph ckpt or feed ckpt') | |||
| parser.add_argument('--pre_trained', default='', type=str, help='fully path of pretrained model to load. ' | |||
| 'If it is a direction, it will test all ckpt') | |||
| # logging related | |||
| parser.add_argument('--log_path', type=str, default='outputs/', help='path to save log') | |||
| parser.add_argument('--rank', type=int, default=0, help='local rank of distributed') | |||
| parser.add_argument('--group_size', type=int, default=1, help='world size of distributed') | |||
| args_opt = parser.parse_args() | |||
| args_opt = merge_args(args_opt, cloud_args) | |||
| if args_opt.dataset == "cifar10": | |||
| from src.config import cifar_cfg as cfg | |||
| else: | |||
| from src.config import imagenet_cfg as cfg | |||
| args_opt.image_size = cfg.image_size | |||
| args_opt.num_classes = cfg.num_classes | |||
| args_opt.per_batch_size = cfg.batch_size | |||
| args_opt.momentum = cfg.momentum | |||
| args_opt.weight_decay = cfg.weight_decay | |||
| args_opt.buffer_size = cfg.buffer_size | |||
| args_opt.pad_mode = cfg.pad_mode | |||
| args_opt.padding = cfg.padding | |||
| args_opt.has_bias = cfg.has_bias | |||
| args_opt.batch_norm = cfg.batch_norm | |||
| args_opt.initialize_mode = cfg.initialize_mode | |||
| args_opt.has_dropout = cfg.has_dropout | |||
| args_opt.image_size = list(map(int, args_opt.image_size.split(','))) | |||
| return args_opt | |||
| def get_top5_acc(top5_arg, gt_class): | |||
| sub_count = 0 | |||
| for top5, gt in zip(top5_arg, gt_class): | |||
| @@ -102,66 +60,122 @@ def get_top5_acc(top5_arg, gt_class): | |||
| return sub_count | |||
| def merge_args(args, cloud_args): | |||
| """merge_args""" | |||
| args_dict = vars(args) | |||
| if isinstance(cloud_args, dict): | |||
| for key in cloud_args.keys(): | |||
| val = cloud_args[key] | |||
| if key in args_dict and val: | |||
| arg_type = type(args_dict[key]) | |||
| if arg_type is not type(None): | |||
| val = arg_type(val) | |||
| args_dict[key] = val | |||
| return args | |||
| def test(cloud_args=None): | |||
| """test""" | |||
| args = parse_args(cloud_args) | |||
| _enable_graph_kernel = args.device_target == "GPU" | |||
| def modelarts_pre_process(): | |||
| '''modelarts pre process function.''' | |||
| def unzip(zip_file, save_dir): | |||
| import zipfile | |||
| s_time = time.time() | |||
| if not os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)): | |||
| zip_isexist = zipfile.is_zipfile(zip_file) | |||
| if zip_isexist: | |||
| fz = zipfile.ZipFile(zip_file, 'r') | |||
| data_num = len(fz.namelist()) | |||
| print("Extract Start...") | |||
| print("unzip file num: {}".format(data_num)) | |||
| data_print = int(data_num / 100) if data_num > 100 else 1 | |||
| i = 0 | |||
| for file in fz.namelist(): | |||
| if i % data_print == 0: | |||
| print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True) | |||
| i += 1 | |||
| fz.extract(file, save_dir) | |||
| print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60), | |||
| int(int(time.time() - s_time) % 60))) | |||
| print("Extract Done.") | |||
| else: | |||
| print("This is not zip.") | |||
| else: | |||
| print("Zip has been extracted.") | |||
| if config.need_modelarts_dataset_unzip: | |||
| zip_file_1 = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".zip") | |||
| save_dir_1 = os.path.join(config.data_path) | |||
| sync_lock = "/tmp/unzip_sync.lock" | |||
| # Each server contains 8 devices at most. | |||
| if config.device_target == "GPU": | |||
| device_id = get_rank() | |||
| device_num = get_group_size() | |||
| elif config.device_target == "Ascend": | |||
| device_id = get_device_id() | |||
| device_num = get_device_num() | |||
| else: | |||
| raise ValueError("Not support device_target.") | |||
| # Each server contains 8 devices at most. | |||
| if device_id % min(device_num, 8) == 0 and not os.path.exists(sync_lock): | |||
| print("Zip file path: ", zip_file_1) | |||
| print("Unzip file save dir: ", save_dir_1) | |||
| unzip(zip_file_1, save_dir_1) | |||
| print("===Finish extract data synchronization===") | |||
| try: | |||
| os.mknod(sync_lock) | |||
| except IOError: | |||
| pass | |||
| while True: | |||
| if os.path.exists(sync_lock): | |||
| break | |||
| time.sleep(1) | |||
| print("Device: {}, Finish sync unzip data from {} to {}.".format(device_id, zip_file_1, save_dir_1)) | |||
| config.log_path = os.path.join(config.output_path, config.log_path) | |||
| @moxing_wrapper(pre_process=modelarts_pre_process) | |||
| def run_eval(): | |||
| """run eval""" | |||
| config.per_batch_size = config.batch_size | |||
| config.image_size = list(map(int, config.image_size.split(','))) | |||
| config.rank = get_rank_id() | |||
| config.group_size = get_device_num() | |||
| _enable_graph_kernel = config.device_target == "GPU" | |||
| context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=_enable_graph_kernel, | |||
| enable_auto_mixed_precision=True, device_target=args.device_target, save_graphs=False) | |||
| if os.getenv('DEVICE_ID', "not_set").isdigit() and args.device_target == "Ascend": | |||
| enable_auto_mixed_precision=True, device_target=config.device_target, save_graphs=False) | |||
| if os.getenv('DEVICE_ID', "not_set").isdigit() and config.device_target == "Ascend": | |||
| context.set_context(device_id=int(os.getenv('DEVICE_ID'))) | |||
| args.outputs_dir = os.path.join(args.log_path, | |||
| datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) | |||
| config.outputs_dir = os.path.join(config.log_path, | |||
| datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) | |||
| args.logger = get_logger(args.outputs_dir, args.rank) | |||
| args.logger.save_args(args) | |||
| config.logger = get_logger(config.outputs_dir, config.rank) | |||
| config.logger.save_args(config) | |||
| if args.dataset == "cifar10": | |||
| net = vgg16(num_classes=args.num_classes, args=args) | |||
| opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, args.momentum, | |||
| weight_decay=args.weight_decay) | |||
| if config.dataset == "cifar10": | |||
| net = vgg16(num_classes=config.num_classes, args=config) | |||
| opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, config.momentum, | |||
| weight_decay=config.weight_decay) | |||
| loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') | |||
| model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}) | |||
| param_dict = load_checkpoint(args.pre_trained) | |||
| param_dict = load_checkpoint(config.pre_trained) | |||
| load_param_into_net(net, param_dict) | |||
| net.set_train(False) | |||
| dataset = vgg_create_dataset(args.data_path, args.image_size, args.per_batch_size, training=False) | |||
| dataset = vgg_create_dataset(config.data_dir, config.image_size, config.per_batch_size, training=False) | |||
| res = model.eval(dataset) | |||
| print("result: ", res) | |||
| else: | |||
| # network | |||
| args.logger.important_info('start create network') | |||
| if os.path.isdir(args.pre_trained): | |||
| models = list(glob.glob(os.path.join(args.pre_trained, '*.ckpt'))) | |||
| config.logger.important_info('start create network') | |||
| if os.path.isdir(config.pre_trained): | |||
| models = list(glob.glob(os.path.join(config.pre_trained, '*.ckpt'))) | |||
| print(models) | |||
| if args.graph_ckpt: | |||
| if config.graph_ckpt: | |||
| f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0].split('-')[-1].split('_')[0]) | |||
| else: | |||
| f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0].split('_')[-1]) | |||
| args.models = sorted(models, key=f) | |||
| config.models = sorted(models, key=f) | |||
| else: | |||
| args.models = [args.pre_trained,] | |||
| config.models = [config.pre_trained,] | |||
| for model in args.models: | |||
| dataset = classification_dataset(args.data_path, args.image_size, args.per_batch_size, mode='eval') | |||
| for model in config.models: | |||
| dataset = classification_dataset(config.data_dir, config.image_size, config.per_batch_size, mode='eval') | |||
| eval_dataloader = dataset.create_tuple_iterator(output_numpy=True, num_epochs=1) | |||
| network = vgg16(args.num_classes, args, phase="test") | |||
| network = vgg16(config.num_classes, config, phase="test") | |||
| # pre_trained | |||
| load_param_into_net(network, load_checkpoint(model)) | |||
| @@ -184,30 +198,30 @@ def test(cloud_args=None): | |||
| t1_correct = np.equal(top1_output, gt_classes).sum() | |||
| top1_correct += t1_correct | |||
| top5_correct += get_top5_acc(top5_output, gt_classes) | |||
| img_tot += args.per_batch_size | |||
| img_tot += config.per_batch_size | |||
| if args.rank == 0 and it == 0: | |||
| if config.rank == 0 and it == 0: | |||
| t_end = time.time() | |||
| it = 1 | |||
| if args.rank == 0: | |||
| if config.rank == 0: | |||
| time_used = time.time() - t_end | |||
| fps = (img_tot - args.per_batch_size) * args.group_size / time_used | |||
| args.logger.info('Inference Performance: {:.2f} img/sec'.format(fps)) | |||
| fps = (img_tot - config.per_batch_size) * config.group_size / time_used | |||
| config.logger.info('Inference Performance: {:.2f} img/sec'.format(fps)) | |||
| results = [[top1_correct], [top5_correct], [img_tot]] | |||
| args.logger.info('before results={}'.format(results)) | |||
| config.logger.info('before results=%s', results) | |||
| results = np.array(results) | |||
| args.logger.info('after results={}'.format(results)) | |||
| config.logger.info('after results=%s', results) | |||
| top1_correct = results[0, 0] | |||
| top5_correct = results[1, 0] | |||
| img_tot = results[2, 0] | |||
| acc1 = 100.0 * top1_correct / img_tot | |||
| acc5 = 100.0 * top5_correct / img_tot | |||
| args.logger.info('after allreduce eval: top1_correct={}, tot={},' | |||
| 'acc={:.2f}%(TOP1)'.format(top1_correct, img_tot, acc1)) | |||
| args.logger.info('after allreduce eval: top5_correct={}, tot={},' | |||
| 'acc={:.2f}%(TOP5)'.format(top5_correct, img_tot, acc5)) | |||
| config.logger.info('after allreduce eval: top1_correct={}, tot={},' | |||
| 'acc={:.2f}%(TOP1)'.format(top1_correct, img_tot, acc1)) | |||
| config.logger.info('after allreduce eval: top5_correct={}, tot={},' | |||
| 'acc={:.2f}%(TOP5)'.format(top5_correct, img_tot, acc5)) | |||
| if __name__ == "__main__": | |||
| test() | |||
| run_eval() | |||
| @@ -13,7 +13,6 @@ | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """export checkpoint file into models""" | |||
| import argparse | |||
| import numpy as np | |||
| from mindspore import Tensor, context | |||
| @@ -22,42 +21,29 @@ from mindspore.train.serialization import load_checkpoint, export | |||
| from src.vgg import vgg16 | |||
| parser = argparse.ArgumentParser(description='VGG16 export') | |||
| parser.add_argument("--device_id", type=int, default=0, help="Device id") | |||
| parser.add_argument('--dataset', type=str, choices=["cifar10", "imagenet2012"], default="cifar10", help='ckpt file') | |||
| parser.add_argument('--ckpt_file', type=str, required=True, help='vgg16 ckpt file.') | |||
| parser.add_argument('--file_name', type=str, default='vgg16', help='vgg16 output file name.') | |||
| parser.add_argument('--file_format', type=str, choices=["AIR", "ONNX", "MINDIR"], default='AIR', help='file format') | |||
| parser.add_argument("--device_target", type=str, choices=["Ascend", "GPU", "CPU"], default="Ascend", | |||
| help="device target") | |||
| args = parser.parse_args() | |||
| if args.dataset == "cifar10": | |||
| from src.config import cifar_cfg as cfg | |||
| else: | |||
| from src.config import imagenet_cfg as cfg | |||
| args.num_classes = cfg.num_classes | |||
| args.pad_mode = cfg.pad_mode | |||
| args.padding = cfg.padding | |||
| args.has_bias = cfg.has_bias | |||
| args.initialize_mode = cfg.initialize_mode | |||
| args.batch_norm = cfg.batch_norm | |||
| args.has_dropout = cfg.has_dropout | |||
| args.image_size = list(map(int, cfg.image_size.split(','))) | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) | |||
| if args.device_target == "Ascend": | |||
| context.set_context(device_id=args.device_id) | |||
| from model_utils.moxing_adapter import config | |||
| from model_utils.device_adapter import get_device_id | |||
| if __name__ == '__main__': | |||
| if args.dataset == "cifar10": | |||
| net = vgg16(num_classes=args.num_classes, args=args) | |||
| def run_export(): | |||
| config.image_size = list(map(int, config.image_size.split(','))) | |||
| context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) | |||
| if config.device_target == "Ascend": | |||
| config.device_id = get_device_id() | |||
| context.set_context(device_id=config.device_id) | |||
| if config.dataset == "cifar10": | |||
| net = vgg16(num_classes=config.num_classes, args=config) | |||
| else: | |||
| net = vgg16(args.num_classes, args, phase="test") | |||
| net = vgg16(config.num_classes, config, phase="test") | |||
| load_checkpoint(args.ckpt_file, net=net) | |||
| load_checkpoint(config.ckpt_file, net=net) | |||
| net.set_train(False) | |||
| input_data = Tensor(np.zeros([cfg.batch_size, 3, args.image_size[0], args.image_size[1]]), mstype.float32) | |||
| export(net, input_data, file_name=args.file_name, file_format=args.file_format) | |||
| input_data = Tensor(np.zeros([config.batch_size, 3, config.image_size[0], config.image_size[1]]), mstype.float32) | |||
| export(net, input_data, file_name=config.file_name, file_format=config.file_format) | |||
| if __name__ == '__main__': | |||
| run_export() | |||
| @@ -0,0 +1,104 @@ | |||
| # Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) | |||
| enable_modelarts: False | |||
| # Url for modelarts | |||
| data_url: "" | |||
| train_url: "" | |||
| checkpoint_url: "" | |||
| # Path for local | |||
| data_path: "/cache/data" | |||
| output_path: "/cache/train" | |||
| load_path: "/cache/checkpoint_path" | |||
| device_target: "Ascend" | |||
| need_modelarts_dataset_unzip: True | |||
| modelarts_dataset_unzip_name: "ImageNet" | |||
| # ============================================================================== | |||
| # options | |||
| num_classes: 1000 | |||
| lr: 0.04 | |||
| lr_init: 0.01 | |||
| lr_max: 0.1 | |||
| lr_epochs: '30,60,90,120' | |||
| lr_scheduler: 'cosine_annealing' | |||
| warmup_epochs: 0 | |||
| batch_size: 64 | |||
| max_epoch: 90 | |||
| momentum: 0.9 | |||
| weight_decay: 0.0001 # 1e-4 | |||
| loss_scale: 1024 | |||
| label_smooth: 1 | |||
| label_smooth_factor: 0.1 | |||
| buffer_size: 10 | |||
| image_size: '224,224' | |||
| pad_mode: 'pad' | |||
| padding: 1 | |||
| has_bias: False | |||
| batch_norm: False | |||
| keep_checkpoint_max: 10 | |||
| initialize_mode: "KaimingNormal" | |||
| has_dropout: True | |||
| # train option | |||
| dataset: "imagenet2012" | |||
| data_dir: "" | |||
| pre_trained: "" | |||
| lr_gamma: 0.1 | |||
| eta_min: 0.0 | |||
| T_max: 90 | |||
| log_interval: 100 | |||
| ckpt_path: "outputs/" | |||
| ckpt_interval: 5 | |||
| is_save_on_master: 1 | |||
| is_distributed: 0 | |||
| # eval options | |||
| per_batch_size: 32 | |||
| graph_ckpt: 1 | |||
| log_path: "outputs/" | |||
| # postprocess options | |||
| result_dir: "" | |||
| label_dir: "" | |||
| dataset_name: "imagenet2012" | |||
| # preprocess options | |||
| result_path: "./preprocess_Result/" | |||
| # export options | |||
| ckpt_file: "" | |||
| file_name: "vgg16" | |||
| file_format: "AIR" | |||
| --- | |||
| # Help description for each configuration | |||
| device_target: "device where the code will be implemented." | |||
| dataset: "choices in ['cifar10', 'imagenet2012']" | |||
| data_dir: "data dir" | |||
| pre_trained: "model_path, local pretrained model to load" | |||
| lr_gamma: "decrease lr by a factor of exponential lr_scheduler" | |||
| eta_min: "eta_min in cosine_annealing scheduler" | |||
| T_max: "T-max in cosine_annealing scheduler" | |||
| log_interval: "logging interval" | |||
| ckpt_path: "checkpoint save location" | |||
| ckpt_interval: "ckpt_interval" | |||
| is_save_on_master: "save ckpt on master or all rank" | |||
| is_distributed: "if multi device" | |||
| # eval options | |||
| per_batch_size: "batch size for per npu" | |||
| graph_ckpt: "graph ckpt or feed ckpt" | |||
| log_path: "path to save log" | |||
| # postprocess options | |||
| result_dir: "result files path." | |||
| label_dir: "image file path." | |||
| dataset_name: "choices in ['cifar10', 'imagenet2012']" | |||
| # preprocess options | |||
| result_path: "result path" | |||
| # export options | |||
| ckpt_file: "vgg16 ckpt file." | |||
| file_name: "vgg16 output file name." | |||
| file_format: "file format, choices in ['AIR', 'ONNX', 'MINDIR']" | |||
| @@ -14,7 +14,7 @@ | |||
| # ============================================================================ | |||
| """hub config.""" | |||
| from src.vgg import vgg16 as VGG16 | |||
| from model_utils.moxing_adapter import config | |||
| def vgg16(*args, **kwargs): | |||
| return VGG16(*args, **kwargs) | |||
| @@ -22,5 +22,5 @@ def vgg16(*args, **kwargs): | |||
| def create_network(name, *args, **kwargs): | |||
| if name == "vgg16": | |||
| return vgg16(*args, **kwargs) | |||
| return vgg16(args=config, *args, **kwargs) | |||
| raise NotImplementedError(f"{name} is not implemented in the repo") | |||
| @@ -0,0 +1,136 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """Parse arguments""" | |||
| import os | |||
| import ast | |||
| import argparse | |||
| from pprint import pformat | |||
| import yaml | |||
| class Config: | |||
| """ | |||
| Configuration namespace. Convert dictionary to members. | |||
| """ | |||
| def __init__(self, cfg_dict): | |||
| for k, v in cfg_dict.items(): | |||
| if isinstance(v, (list, tuple)): | |||
| setattr(self, k, [Config(x) if isinstance(x, dict) else x for x in v]) | |||
| else: | |||
| setattr(self, k, Config(v) if isinstance(v, dict) else v) | |||
| def __str__(self): | |||
| return pformat(self.__dict__) | |||
| def __repr__(self): | |||
| return self.__str__() | |||
| def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"): | |||
| """ | |||
| Parse command line arguments to the configuration according to the default yaml. | |||
| Args: | |||
| parser: Parent parser. | |||
| cfg: Base configuration. | |||
| helper: Helper description. | |||
| cfg_path: Path to the default yaml config. | |||
| """ | |||
| parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]", | |||
| parents=[parser]) | |||
| helper = {} if helper is None else helper | |||
| choices = {} if choices is None else choices | |||
| for item in cfg: | |||
| if not isinstance(cfg[item], list) and not isinstance(cfg[item], dict): | |||
| help_description = helper[item] if item in helper else "Please reference to {}".format(cfg_path) | |||
| choice = choices[item] if item in choices else None | |||
| if isinstance(cfg[item], bool): | |||
| parser.add_argument("--" + item, type=ast.literal_eval, default=cfg[item], choices=choice, | |||
| help=help_description) | |||
| else: | |||
| parser.add_argument("--" + item, type=type(cfg[item]), default=cfg[item], choices=choice, | |||
| help=help_description) | |||
| args = parser.parse_args() | |||
| return args | |||
| def parse_yaml(yaml_path): | |||
| """ | |||
| Parse the yaml config file. | |||
| Args: | |||
| yaml_path: Path to the yaml config. | |||
| """ | |||
| with open(yaml_path, 'r') as fin: | |||
| try: | |||
| cfgs = yaml.load_all(fin.read(), Loader=yaml.FullLoader) | |||
| cfgs = [x for x in cfgs] | |||
| if len(cfgs) == 1: | |||
| cfg_helper = {} | |||
| cfg = cfgs[0] | |||
| cfg_choices = {} | |||
| elif len(cfgs) == 2: | |||
| cfg, cfg_helper = cfgs | |||
| cfg_choices = {} | |||
| elif len(cfgs) == 3: | |||
| cfg, cfg_helper, cfg_choices = cfgs | |||
| else: | |||
| raise ValueError("At most 3 docs (config, description for help, choices) are supported in config yaml") | |||
| print(cfg_helper) | |||
| except: | |||
| raise ValueError("Failed to parse yaml") | |||
| return cfg, cfg_helper, cfg_choices | |||
def merge(args, cfg):
    """
    Overlay the parsed command line values onto the yaml configuration.

    Args:
        args: Command line arguments.
        cfg: Base configuration; it is updated in place.

    Returns:
        The same ``cfg`` object, with every attribute of ``args`` written into it.
    """
    for key, value in vars(args).items():
        cfg[key] = value
    return cfg
def get_config():
    """
    Build the Config from the yaml file and the command line.

    ``--config_path`` is read first to locate the yaml file. Its scalar entries
    then become command line options, and the parsed values are merged back
    over the yaml content.
    """
    default_yaml = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../cifar10_config.yaml")
    parser = argparse.ArgumentParser(description="default name", add_help=False)
    parser.add_argument("--config_path", type=str, default=default_yaml,
                        help="Config file path")
    # Only --config_path is declared so far; the other options are ignored here.
    known_args, _ = parser.parse_known_args()
    yaml_path = known_args.config_path
    yaml_cfg, yaml_help, yaml_choices = parse_yaml(yaml_path)
    cli_args = parse_cli_to_yaml(parser=parser, cfg=yaml_cfg, helper=yaml_help,
                                 choices=yaml_choices, cfg_path=yaml_path)
    return Config(merge(cli_args, yaml_cfg))
| # -------------------------------------------------------------------------------------------------------------------- | |||
def get_config_static(config_path="../cifar10_config.yaml"):
    """
    Build the Config from the yaml file only; the command line is not read.

    A ``config_path`` that does not start with "/" is resolved against the
    directory containing this module.
    """
    if config_path.startswith("/"):
        resolved_path = config_path
    else:
        module_dir = os.path.dirname(os.path.abspath(__file__))
        resolved_path = os.path.join(module_dir, config_path)
    yaml_cfg = parse_yaml(resolved_path)[0]
    return Config(yaml_cfg)
| @@ -0,0 +1,27 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """Device adapter for ModelArts""" | |||
| from .moxing_adapter import config | |||
| if config.enable_modelarts: | |||
| from .moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id | |||
| else: | |||
| from .local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id | |||
| __all__ = [ | |||
| "get_device_id", "get_device_num", "get_rank_id", "get_job_id" | |||
| ] | |||
| @@ -0,0 +1,36 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """Local adapter""" | |||
| import os | |||
def get_device_id():
    """Return the DEVICE_ID environment variable as an int (0 when it is unset)."""
    return int(os.environ.get('DEVICE_ID', '0'))
def get_device_num():
    """Return the RANK_SIZE environment variable as an int (1 when it is unset)."""
    return int(os.environ.get('RANK_SIZE', '1'))
def get_rank_id():
    """Return the RANK_ID environment variable as an int (0 when it is unset)."""
    return int(os.environ.get('RANK_ID', '0'))
def get_job_id():
    """Return the fixed job name used when running outside ModelArts."""
    local_job_name = "Local Job"
    return local_job_name
| @@ -0,0 +1,118 @@ | |||
| # Copyright 2021 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """Moxing adapter for ModelArts""" | |||
| import os | |||
| import functools | |||
| from mindspore import context | |||
| from .config import get_config | |||
| config = get_config() | |||
| _global_sync_count = 0 | |||
def get_device_id():
    """Read DEVICE_ID from the environment and return it as an int; unset means 0."""
    return int(os.getenv('DEVICE_ID', '0'))
def get_device_num():
    """Read RANK_SIZE from the environment and return it as an int; unset means 1."""
    return int(os.getenv('RANK_SIZE', '1'))
def get_rank_id():
    """Read RANK_ID from the environment and return it as an int; unset means 0."""
    return int(os.getenv('RANK_ID', '0'))
def get_job_id():
    """
    Return the JOB_ID environment variable, or "default" when it is unset or empty.

    The previous comparison against "" let an unset variable through, so the
    function returned None instead of the fallback.
    """
    return os.getenv('JOB_ID') or "default"
def sync_data(from_path, to_path):
    """
    Copy ``from_path`` to ``to_path`` with moxing and wait until the copy is flagged done.

    Used in both directions: remote obs url to local directory (download) and
    local directory to remote obs url (upload). Each call uses its own lock
    file, numbered by a module-level call counter.
    """
    import moxing as mox
    import time
    global _global_sync_count
    sync_lock = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains 8 devices at most, so one device per server copies.
    is_copy_device = get_device_id() % min(get_device_num(), 8) == 0
    if is_copy_device and not os.path.exists(sync_lock):
        print("from path: ", from_path)
        print("to path: ", to_path)
        mox.file.copy_parallel(from_path, to_path)
        print("===finish data synchronization===")
        try:
            # The lock file is the "copy finished" flag polled below.
            os.mknod(sync_lock)
        except IOError:
            pass
        print("===save flag===")

    # Poll once per second until the lock file exists.
    while not os.path.exists(sync_lock):
        time.sleep(1)

    print("Finish sync data from {} to {}.".format(from_path, to_path))
def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.

    Args:
        pre_process: Optional callable run after the downloads and before the
            wrapped function, only when ``config.enable_modelarts`` is set.
        post_process: Optional callable run after the wrapped function and
            before the upload, only when ``config.enable_modelarts`` is set.

    Returns:
        A decorator. The decorated function returns whatever the wrapped
        function returns (previously the result was discarded).
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))

                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                # exist_ok avoids a failure when several devices create the
                # directory at the same time.
                os.makedirs(config.output_path, exist_ok=True)

                if pre_process:
                    pre_process()

            # Run the main function
            result = run_func(*args, **kwargs)

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()

                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
            return result
        return wrapped_func
    return wrapper
| @@ -15,39 +15,31 @@ | |||
| """postprocess for 310 inference""" | |||
| import os | |||
| import json | |||
| import argparse | |||
| import numpy as np | |||
| from mindspore.nn import Top1CategoricalAccuracy, Top5CategoricalAccuracy | |||
| parser = argparse.ArgumentParser(description="postprocess") | |||
| parser.add_argument("--result_dir", type=str, required=True, help="result files path.") | |||
| parser.add_argument("--label_dir", type=str, required=True, help="image file path.") | |||
| parser.add_argument('--dataset_name', type=str, choices=["cifar10", "imagenet2012"], default="cifar10") | |||
| args = parser.parse_args() | |||
| from model_utils.moxing_adapter import config | |||
| if __name__ == '__main__': | |||
| top1_acc = Top1CategoricalAccuracy() | |||
| rst_path = args.result_dir | |||
| if args.dataset_name == "cifar10": | |||
| from src.config import cifar_cfg as cfg | |||
| labels = np.load(args.label_dir, allow_pickle=True) | |||
| rst_path = config.result_dir | |||
| if config.dataset_name == "cifar10": | |||
| labels = np.load(config.label_dir, allow_pickle=True) | |||
| for idx, label in enumerate(labels): | |||
| f_name = os.path.join(rst_path, "VGG16_data_bs" + str(cfg.batch_size) + "_" + str(idx) + "_0.bin") | |||
| f_name = os.path.join(rst_path, "VGG16_data_bs" + str(config.batch_size) + "_" + str(idx) + "_0.bin") | |||
| pred = np.fromfile(f_name, np.float32) | |||
| pred = pred.reshape(cfg.batch_size, int(pred.shape[0] / cfg.batch_size)) | |||
| pred = pred.reshape(config.batch_size, int(pred.shape[0] / config.batch_size)) | |||
| top1_acc.update(pred, labels[idx]) | |||
| print("acc: ", top1_acc.eval()) | |||
| else: | |||
| from src.config import imagenet_cfg as cfg | |||
| top5_acc = Top5CategoricalAccuracy() | |||
| file_list = os.listdir(rst_path) | |||
| with open(args.label_dir, "r") as label: | |||
| with open(config.label_dir, "r") as label: | |||
| labels = json.load(label) | |||
| for f in file_list: | |||
| label = f.split("_0.bin")[0] + ".JPEG" | |||
| pred = np.fromfile(os.path.join(rst_path, f), np.float32) | |||
| pred = pred.reshape(cfg.batch_size, int(pred.shape[0] / cfg.batch_size)) | |||
| pred = pred.reshape(config.batch_size, int(pred.shape[0] / config.batch_size)) | |||
| top1_acc.update(pred, [labels[label],]) | |||
| top5_acc.update(pred, [labels[label],]) | |||
| print("Top1 acc: ", top1_acc.eval()) | |||
| @@ -14,11 +14,13 @@ | |||
| # ============================================================================ | |||
| """preprocess""" | |||
| import os | |||
| import argparse | |||
| import json | |||
| import numpy as np | |||
| from src.dataset import vgg_create_dataset | |||
| from model_utils.moxing_adapter import config | |||
| def create_label(result_path, dir_path): | |||
| print("[WARNING] Create imagenet label. Currently only use for Imagenet2012!") | |||
| dirs = os.listdir(dir_path) | |||
| @@ -41,33 +43,22 @@ def create_label(result_path, dir_path): | |||
| print("[INFO] Completed! Total {} data.".format(total)) | |||
| parser = argparse.ArgumentParser('preprocess') | |||
| parser.add_argument('--dataset', type=str, choices=["cifar10", "imagenet2012"], default="cifar10") | |||
| parser.add_argument('--data_path', type=str, default='', help='eval data dir') | |||
| parser.add_argument('--result_path', type=str, default='./preprocess_Result/', help='result path') | |||
| args = parser.parse_args() | |||
| if args.dataset == "cifar10": | |||
| from src.config import cifar_cfg as cfg | |||
| else: | |||
| from src.config import imagenet_cfg as cfg | |||
| args.per_batch_size = cfg.batch_size | |||
| args.image_size = list(map(int, cfg.image_size.split(','))) | |||
| config.per_batch_size = config.batch_size | |||
| config.image_size = list(map(int, config.image_size.split(','))) | |||
| if __name__ == "__main__": | |||
| if args.dataset == "cifar10": | |||
| dataset = vgg_create_dataset(args.data_path, args.image_size, args.per_batch_size, training=False) | |||
| img_path = os.path.join(args.result_path, "00_data") | |||
| if config.dataset == "cifar10": | |||
| dataset = vgg_create_dataset(config.data_dir, config.image_size, config.per_batch_size, training=False) | |||
| img_path = os.path.join(config.result_path, "00_data") | |||
| os.makedirs(img_path) | |||
| label_list = [] | |||
| for idx, data in enumerate(dataset.create_dict_iterator(output_numpy=True)): | |||
| file_name = "VGG16_data_bs" + str(args.per_batch_size) + "_" + str(idx) + ".bin" | |||
| file_name = "VGG16_data_bs" + str(config.per_batch_size) + "_" + str(idx) + ".bin" | |||
| file_path = os.path.join(img_path, file_name) | |||
| data["image"].tofile(file_path) | |||
| label_list.append(data["label"]) | |||
| np.save(os.path.join(args.result_path, "cifar10_label_ids.npy"), label_list) | |||
| np.save(os.path.join(config.result_path, "cifar10_label_ids.npy"), label_list) | |||
| print("=" * 20, "export bin files finished", "=" * 20) | |||
| else: | |||
| create_label(args.result_path, args.data_path) | |||
| create_label(config.result_path, config.data_dir) | |||
| @@ -32,6 +32,13 @@ then | |||
| exit 1 | |||
| fi | |||
# Print $1 unchanged when it is absolute, otherwise resolve it against $PWD.
# Written with POSIX constructs only, because the README runs this script with
# `sh`, where the former ${1:0:1} substring expansion is not available.
get_real_path(){
  case "$1" in
    /*) echo "$1" ;;
    # Quoted so that paths containing spaces stay a single argument.
    *) realpath -m "$PWD/$1" ;;
  esac
}
| dataset_type='cifar10' | |||
| if [ $# == 3 ] | |||
| @@ -43,6 +50,8 @@ then | |||
| fi | |||
| dataset_type=$3 | |||
| fi | |||
| config_path=$(get_real_path "./${dataset_type}_config.yaml") | |||
| echo "config path is : ${config_path}" | |||
| export DEVICE_NUM=8 | |||
| export RANK_SIZE=8 | |||
| @@ -68,10 +77,12 @@ do | |||
| rm -rf ./train_parallel$DEVICE_ID | |||
| mkdir ./train_parallel$DEVICE_ID | |||
| cp $src_dir/*.py ./train_parallel$DEVICE_ID | |||
| cp $src_dir/*.yaml ./train_parallel$DEVICE_ID | |||
| cp -r $src_dir/src ./train_parallel$DEVICE_ID | |||
| cp -r $src_dir/model_utils ./train_parallel$DEVICE_ID | |||
| cd ./train_parallel$DEVICE_ID || exit | |||
| echo "start training for rank $RANK_ID, device $DEVICE_ID, $dataset_type" | |||
| env > env.log | |||
| taskset -c $cmdopt python train.py --data_path=$2 --device_target="Ascend" --device_id=$DEVICE_ID --is_distributed=1 --dataset=$dataset_type &> log & | |||
| taskset -c $cmdopt python train.py --config_path=$config_path --data_dir=$2 --device_target="Ascend" --is_distributed=1 --dataset=$dataset_type &> log & | |||
| cd .. | |||
| done | |||
| @@ -22,9 +22,19 @@ echo "========================================================================== | |||
| DATA_PATH=$1 | |||
# Print $1 unchanged when it is absolute, otherwise resolve it against $PWD.
# Written with POSIX constructs only, because the README runs this script with
# `sh`, where the former ${1:0:1} substring expansion is not available.
get_real_path(){
  case "$1" in
    /*) echo "$1" ;;
    # Quoted so that paths containing spaces stay a single argument.
    *) realpath -m "$PWD/$1" ;;
  esac
}
| config_path=$(get_real_path "./imagenet2012_config.yaml") | |||
| mpirun -n 8 --output-filename log_output --merge-stderr-to-stdout \ | |||
| python train.py \ | |||
| --config_path=$config_path \ | |||
| --device_target="GPU" \ | |||
| --dataset="imagenet2012" \ | |||
| --is_distributed=1 \ | |||
| --data_path=$DATA_PATH > output.train.log 2>&1 & | |||
| --data_dir=$DATA_PATH > output.train.log 2>&1 & | |||
| @@ -25,8 +25,20 @@ DATASET_TYPE=$2 | |||
| DEVICE_TYPE=$3 | |||
| CHECKPOINT_PATH=$4 | |||
# Print $1 unchanged when it is absolute, otherwise resolve it against $PWD.
# Uses POSIX constructs only (no ${1:0:1} substring expansion, no `==` in
# `[`), so the function also works when the script is run with a plain `sh`.
get_real_path(){
  case "$1" in
    /*) echo "$1" ;;
    # Quoted so that paths containing spaces stay a single argument.
    *) realpath -m "$PWD/$1" ;;
  esac
}
| config_path=$(get_real_path "./${DATASET_TYPE}_config.yaml") | |||
| echo "config path is : ${config_path}" | |||
| python eval.py \ | |||
| --data_path=$DATA_PATH \ | |||
| --config_path=$config_path \ | |||
| --data_dir=$DATA_PATH \ | |||
| --dataset=$DATASET_TYPE \ | |||
| --device_target=$DEVICE_TYPE \ | |||
| --pre_trained=$CHECKPOINT_PATH > output.eval.log 2>&1 & | |||
| @@ -36,6 +36,8 @@ else | |||
| echo "DATASET_NAME can choose from ['cifar10', 'imagenet2012']" | |||
| exit 1 | |||
| fi | |||
| config_path=$(get_real_path "../${dataset_name}_config.yaml") | |||
| echo "config path is : ${config_path}" | |||
| dataset_path=$(get_real_path $3) | |||
| @@ -77,7 +79,7 @@ function preprocess_data() | |||
| rm -rf ./preprocess_Result | |||
| fi | |||
| mkdir preprocess_Result | |||
| python3.7 ../preprocess.py --dataset=$dataset_name --data_path=$dataset_path --result_path=./preprocess_Result/ | |||
| python3.7 ../preprocess.py --config_path=$config_path --dataset=$dataset_name --data_dir=$dataset_path --result_path=./preprocess_Result/ | |||
| } | |||
| function compile_app() | |||
| @@ -108,9 +110,9 @@ function infer() | |||
| function cal_acc() | |||
| { | |||
| if [ "$dataset_name" == "cifar10" ]; then | |||
| python3.7 ../postprocess.py --result_dir=./result_Files --label_dir=./preprocess_Result/cifar10_label_ids.npy --dataset_name=$dataset_name &> acc.log | |||
| python3.7 ../postprocess.py --config_path=$config_path --result_dir=./result_Files --label_dir=./preprocess_Result/cifar10_label_ids.npy --dataset_name=$dataset_name &> acc.log | |||
| else | |||
| python3.7 ../postprocess.py --result_dir=./result_Files --label_dir=./preprocess_Result/imagenet_label.json --dataset_name=$dataset_name &> acc.log | |||
| python3.7 ../postprocess.py --config_path=$config_path --result_dir=./result_Files --label_dir=./preprocess_Result/imagenet_label.json --dataset_name=$dataset_name &> acc.log | |||
| fi | |||
| } | |||
| @@ -1,72 +0,0 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """ | |||
| network config setting, will be used in train.py and eval.py | |||
| """ | |||
| from easydict import EasyDict as edict | |||
| # config for vgg16, cifar10 | |||
| cifar_cfg = edict({ | |||
| "num_classes": 10, | |||
| "lr": 0.01, | |||
| "lr_init": 0.01, | |||
| "lr_max": 0.1, | |||
| "lr_epochs": '30,60,90,120', | |||
| "lr_scheduler": "step", | |||
| "warmup_epochs": 5, | |||
| "batch_size": 64, | |||
| "max_epoch": 70, | |||
| "momentum": 0.9, | |||
| "weight_decay": 5e-4, | |||
| "loss_scale": 1.0, | |||
| "label_smooth": 0, | |||
| "label_smooth_factor": 0, | |||
| "buffer_size": 10, | |||
| "image_size": '224,224', | |||
| "pad_mode": 'same', | |||
| "padding": 0, | |||
| "has_bias": False, | |||
| "batch_norm": True, | |||
| "keep_checkpoint_max": 10, | |||
| "initialize_mode": "XavierUniform", | |||
| "has_dropout": False | |||
| }) | |||
| # config for vgg16, imagenet2012 | |||
| imagenet_cfg = edict({ | |||
| "num_classes": 1000, | |||
| "lr": 0.04, | |||
| "lr_init": 0.01, | |||
| "lr_max": 0.1, | |||
| "lr_epochs": '30,60,90,120', | |||
| "lr_scheduler": 'cosine_annealing', | |||
| "warmup_epochs": 0, | |||
| "batch_size": 64, | |||
| "max_epoch": 90, | |||
| "momentum": 0.9, | |||
| "weight_decay": 1e-4, | |||
| "loss_scale": 1024, | |||
| "label_smooth": 1, | |||
| "label_smooth_factor": 0.1, | |||
| "buffer_size": 10, | |||
| "image_size": '224,224', | |||
| "pad_mode": 'pad', | |||
| "padding": 1, | |||
| "has_bias": False, | |||
| "batch_norm": False, | |||
| "keep_checkpoint_max": 10, | |||
| "initialize_mode": "KaimingNormal", | |||
| "has_dropout": True | |||
| }) | |||
| @@ -142,9 +142,5 @@ def vgg16(num_classes=1000, args=None, phase="train", **kwargs): | |||
| Examples: | |||
| >>> vgg16(num_classes=1000, args=args, **kwargs) | |||
| """ | |||
| if args is None: | |||
| from .config import cifar_cfg | |||
| args = cifar_cfg | |||
| net = Vgg(cfg['16'], num_classes=num_classes, args=args, batch_norm=args.batch_norm, phase=phase, **kwargs) | |||
| return net | |||
| @@ -15,9 +15,9 @@ | |||
| """ | |||
| #################train vgg16 example on cifar10######################## | |||
| """ | |||
| import argparse | |||
| import datetime | |||
| import os | |||
| import time | |||
| import mindspore.nn as nn | |||
| from mindspore import Tensor | |||
| @@ -41,197 +41,193 @@ from src.utils.logging import get_logger | |||
| from src.utils.util import get_param_groups | |||
| from src.vgg import vgg16 | |||
| from model_utils.moxing_adapter import config | |||
| from model_utils.moxing_adapter import moxing_wrapper | |||
| from model_utils.device_adapter import get_device_id, get_rank_id, get_device_num | |||
| set_seed(1) | |||
def parse_args(cloud_args=None):
    """
    Build the training options.

    Command line values are parsed first, then overlaid with ``cloud_args``
    through ``merge_args``. The dataset-specific config (cifar10 or imagenet)
    is then copied onto the result.
    """
    parser = argparse.ArgumentParser('mindspore classification training')
    add = parser.add_argument
    add('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'],
        help='device where the code will be implemented. (Default: Ascend)')
    add('--device_id', type=int, default=1, help='device id of GPU or Ascend. (Default: None)')

    # dataset related
    add('--dataset', type=str, choices=["cifar10", "imagenet2012"], default="cifar10")
    add('--data_path', type=str, default='', help='train data dir')

    # network related
    add('--pre_trained', default='', type=str, help='model_path, local pretrained model to load')
    add('--lr_gamma', type=float, default=0.1,
        help='decrease lr by a factor of exponential lr_scheduler')
    add('--eta_min', type=float, default=0., help='eta_min in cosine_annealing scheduler')
    add('--T_max', type=int, default=90, help='T-max in cosine_annealing scheduler')

    # logging and checkpoint related
    add('--log_interval', type=int, default=100, help='logging interval')
    add('--ckpt_path', type=str, default='outputs/', help='checkpoint save location')
    add('--ckpt_interval', type=int, default=5, help='ckpt_interval')
    add('--is_save_on_master', type=int, default=1, help='save ckpt on master or all rank')

    # distributed related
    add('--is_distributed', type=int, default=0, help='if multi device')
    add('--rank', type=int, default=0, help='local rank of distributed')
    add('--group_size', type=int, default=1, help='world size of distributed')

    args_opt = merge_args(parser.parse_args(), cloud_args)

    if args_opt.dataset == "cifar10":
        from src.config import cifar_cfg as cfg
    else:
        from src.config import imagenet_cfg as cfg

    # Fields copied under the same name.
    same_name_fields = (
        "label_smooth", "label_smooth_factor", "lr_scheduler", "loss_scale", "max_epoch",
        "warmup_epochs", "lr", "lr_init", "lr_max", "momentum", "weight_decay",
        "num_classes", "buffer_size", "pad_mode", "padding", "has_bias", "batch_norm",
        "initialize_mode", "has_dropout",
    )
    for field in same_name_fields:
        setattr(args_opt, field, getattr(cfg, field))

    # Fields stored under a different name.
    args_opt.per_batch_size = cfg.batch_size
    args_opt.ckpt_save_max = cfg.keep_checkpoint_max

    # Comma separated strings become lists of ints.
    args_opt.lr_epochs = [int(epoch) for epoch in cfg.lr_epochs.split(',')]
    args_opt.image_size = [int(side) for side in cfg.image_size.split(',')]
    return args_opt
def merge_args(args_opt, cloud_args):
    """
    Overlay the values of ``cloud_args`` onto ``args_opt``.

    Only keys that already exist on ``args_opt`` and whose cloud value is
    truthy are applied. The cloud value is converted to the type of the
    current value; when the current value is None the cloud value is stored
    as is (the former conversion raised TypeError in that case).

    Args:
        args_opt: Parsed arguments namespace; it is updated in place.
        cloud_args: Dict of overrides; any other type is ignored.

    Returns:
        The same ``args_opt`` object.
    """
    if not isinstance(cloud_args, dict):
        return args_opt
    args_dict = vars(args_opt)
    for key_arg, val in cloud_args.items():
        if key_arg not in args_dict or not val:
            continue
        current = args_dict[key_arg]
        if current is not None:
            val = type(current)(val)
        args_dict[key_arg] = val
    return args_opt
| if __name__ == '__main__': | |||
| args = parse_args() | |||
| _enable_graph_kernel = args.device_target == "GPU" | |||
def modelarts_pre_process():
    '''
    modelarts pre process function.

    When config.need_modelarts_dataset_unzip is set, one device out of every
    min(device_num, 8) extracts the dataset archive found in config.data_path,
    and every device waits for the lock file that flags completion. In all
    cases config.ckpt_path is then re-rooted under config.output_path.
    '''
    def unzip(zip_file, save_dir):
        '''Extract zip_file into save_dir with progress output, unless already extracted.'''
        import zipfile
        s_time = time.time()
        if os.path.exists(os.path.join(save_dir, config.modelarts_dataset_unzip_name)):
            print("Zip has been extracted.")
            return
        if not zipfile.is_zipfile(zip_file):
            print("This is not zip.")
            return
        # The context manager closes the archive; it was previously left open.
        with zipfile.ZipFile(zip_file, 'r') as fz:
            members = fz.namelist()
            data_num = len(members)
            print("Extract Start...")
            print("unzip file num: {}".format(data_num))
            # Report progress roughly every 1% of the members.
            data_print = int(data_num / 100) if data_num > 100 else 1
            for i, member in enumerate(members):
                if i % data_print == 0:
                    print("unzip percent: {}%".format(int(i * 100 / data_num)), flush=True)
                fz.extract(member, save_dir)
        print("cost time: {}min:{}s.".format(int((time.time() - s_time) / 60),
                                             int(int(time.time() - s_time) % 60)))
        print("Extract Done.")

    if config.need_modelarts_dataset_unzip:
        zip_file_1 = os.path.join(config.data_path, config.modelarts_dataset_unzip_name + ".zip")
        save_dir_1 = os.path.join(config.data_path)

        sync_lock = "/tmp/unzip_sync.lock"

        # Each server contains 8 devices at most.
        if config.device_target == "GPU":
            device_id = get_rank()
            device_num = get_group_size()
        elif config.device_target == "Ascend":
            device_id = get_device_id()
            device_num = get_device_num()
        else:
            raise ValueError("Not support device_target.")

        if device_id % min(device_num, 8) == 0 and not os.path.exists(sync_lock):
            print("Zip file path: ", zip_file_1)
            print("Unzip file save dir: ", save_dir_1)
            unzip(zip_file_1, save_dir_1)
            print("===Finish extract data synchronization===")
            try:
                # The lock file is the "extraction finished" flag polled below.
                os.mknod(sync_lock)
            except IOError:
                pass

        # Poll once per second until the lock file exists.
        while not os.path.exists(sync_lock):
            time.sleep(1)

        print("Device: {}, Finish sync unzip data from {} to {}.".format(device_id, zip_file_1, save_dir_1))

    config.ckpt_path = os.path.join(config.output_path, config.ckpt_path)
| @moxing_wrapper(pre_process=modelarts_pre_process) | |||
| def run_train(): | |||
| '''run train''' | |||
| config.lr_epochs = list(map(int, config.lr_epochs.split(','))) | |||
| config.image_size = list(map(int, config.image_size.split(','))) | |||
| config.per_batch_size = config.batch_size | |||
| _enable_graph_kernel = config.device_target == "GPU" | |||
| context.set_context(mode=context.GRAPH_MODE, | |||
| enable_graph_kernel=_enable_graph_kernel, device_target=args.device_target) | |||
| device_num = int(os.environ.get("DEVICE_NUM", 1)) | |||
| if args.is_distributed: | |||
| if args.device_target == "Ascend": | |||
| enable_graph_kernel=_enable_graph_kernel, device_target=config.device_target) | |||
| config.rank = get_rank_id() | |||
| config.device_id = get_device_id() | |||
| config.group_size = get_device_num() | |||
| if config.is_distributed: | |||
| if config.device_target == "Ascend": | |||
| init() | |||
| context.set_context(device_id=args.device_id) | |||
| elif args.device_target == "GPU": | |||
| context.set_context(device_id=config.device_id) | |||
| elif config.device_target == "GPU": | |||
| init() | |||
| args.rank = get_rank() | |||
| args.group_size = get_group_size() | |||
| device_num = args.group_size | |||
| device_num = config.group_size | |||
| context.reset_auto_parallel_context() | |||
| context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, | |||
| gradients_mean=True, all_reduce_fusion_config=[2, 18]) | |||
| else: | |||
| if args.device_target == "Ascend": | |||
| context.set_context(device_id=args.device_id) | |||
| if config.device_target == "Ascend": | |||
| context.set_context(device_id=config.device_id) | |||
| # select for master rank save ckpt or all rank save, compatible for model parallel | |||
| args.rank_save_ckpt_flag = 0 | |||
| if args.is_save_on_master: | |||
| if args.rank == 0: | |||
| args.rank_save_ckpt_flag = 1 | |||
| config.rank_save_ckpt_flag = 0 | |||
| if config.is_save_on_master: | |||
| if config.rank == 0: | |||
| config.rank_save_ckpt_flag = 1 | |||
| else: | |||
| args.rank_save_ckpt_flag = 1 | |||
| config.rank_save_ckpt_flag = 1 | |||
| # logger | |||
| args.outputs_dir = os.path.join(args.ckpt_path, | |||
| datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) | |||
| args.logger = get_logger(args.outputs_dir, args.rank) | |||
| config.outputs_dir = os.path.join(config.ckpt_path, | |||
| datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) | |||
| config.logger = get_logger(config.outputs_dir, config.rank) | |||
| if args.dataset == "cifar10": | |||
| dataset = vgg_create_dataset(args.data_path, args.image_size, args.per_batch_size, args.rank, args.group_size) | |||
| if config.dataset == "cifar10": | |||
| dataset = vgg_create_dataset(config.data_dir, config.image_size, config.per_batch_size, | |||
| config.rank, config.group_size) | |||
| else: | |||
| dataset = classification_dataset(args.data_path, args.image_size, args.per_batch_size, | |||
| args.rank, args.group_size) | |||
| dataset = classification_dataset(config.data_dir, config.image_size, config.per_batch_size, | |||
| config.rank, config.group_size) | |||
| batch_num = dataset.get_dataset_size() | |||
| args.steps_per_epoch = dataset.get_dataset_size() | |||
| args.logger.save_args(args) | |||
| config.steps_per_epoch = dataset.get_dataset_size() | |||
| config.logger.save_args(config) | |||
| # network | |||
| args.logger.important_info('start create network') | |||
| config.logger.important_info('start create network') | |||
| # get network and init | |||
| network = vgg16(args.num_classes, args) | |||
| network = vgg16(config.num_classes, config) | |||
| # pre_trained | |||
| if args.pre_trained: | |||
| load_param_into_net(network, load_checkpoint(args.pre_trained)) | |||
| if config.pre_trained: | |||
| load_param_into_net(network, load_checkpoint(config.pre_trained)) | |||
| # lr scheduler | |||
| if args.lr_scheduler == 'exponential': | |||
| lr = warmup_step_lr(args.lr, | |||
| args.lr_epochs, | |||
| args.steps_per_epoch, | |||
| args.warmup_epochs, | |||
| args.max_epoch, | |||
| gamma=args.lr_gamma, | |||
| if config.lr_scheduler == 'exponential': | |||
| lr = warmup_step_lr(config.lr, | |||
| config.lr_epochs, | |||
| config.steps_per_epoch, | |||
| config.warmup_epochs, | |||
| config.max_epoch, | |||
| gamma=config.lr_gamma, | |||
| ) | |||
| elif args.lr_scheduler == 'cosine_annealing': | |||
| lr = warmup_cosine_annealing_lr(args.lr, | |||
| args.steps_per_epoch, | |||
| args.warmup_epochs, | |||
| args.max_epoch, | |||
| args.T_max, | |||
| args.eta_min) | |||
| elif args.lr_scheduler == 'step': | |||
| lr = lr_steps(0, lr_init=args.lr_init, lr_max=args.lr_max, warmup_epochs=args.warmup_epochs, | |||
| total_epochs=args.max_epoch, steps_per_epoch=batch_num) | |||
| elif config.lr_scheduler == 'cosine_annealing': | |||
| lr = warmup_cosine_annealing_lr(config.lr, | |||
| config.steps_per_epoch, | |||
| config.warmup_epochs, | |||
| config.max_epoch, | |||
| config.T_max, | |||
| config.eta_min) | |||
| elif config.lr_scheduler == 'step': | |||
| lr = lr_steps(0, lr_init=config.lr_init, lr_max=config.lr_max, warmup_epochs=config.warmup_epochs, | |||
| total_epochs=config.max_epoch, steps_per_epoch=batch_num) | |||
| else: | |||
| raise NotImplementedError(args.lr_scheduler) | |||
| raise NotImplementedError(config.lr_scheduler) | |||
| # optimizer | |||
| opt = Momentum(params=get_param_groups(network), | |||
| learning_rate=Tensor(lr), | |||
| momentum=args.momentum, | |||
| weight_decay=args.weight_decay, | |||
| loss_scale=args.loss_scale) | |||
| momentum=config.momentum, | |||
| weight_decay=config.weight_decay, | |||
| loss_scale=config.loss_scale) | |||
| if args.dataset == "cifar10": | |||
| if config.dataset == "cifar10": | |||
| loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') | |||
| model = Model(network, loss_fn=loss, optimizer=opt, metrics={'acc'}, | |||
| amp_level="O2", keep_batchnorm_fp32=False, loss_scale_manager=None) | |||
| else: | |||
| if not args.label_smooth: | |||
| args.label_smooth_factor = 0.0 | |||
| loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes) | |||
| if not config.label_smooth: | |||
| config.label_smooth_factor = 0.0 | |||
| loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.num_classes) | |||
| loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False) | |||
| loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) | |||
| model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager, amp_level="O2") | |||
| # define callbacks | |||
| time_cb = TimeMonitor(data_size=batch_num) | |||
| loss_cb = LossMonitor(per_print_times=batch_num) | |||
| callbacks = [time_cb, loss_cb] | |||
| if args.rank_save_ckpt_flag: | |||
| ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch, | |||
| keep_checkpoint_max=args.ckpt_save_max) | |||
| save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/') | |||
| if config.rank_save_ckpt_flag: | |||
| ckpt_config = CheckpointConfig(save_checkpoint_steps=config.ckpt_interval * config.steps_per_epoch, | |||
| keep_checkpoint_max=config.keep_checkpoint_max) | |||
| save_ckpt_path = os.path.join(config.outputs_dir, 'ckpt_' + str(config.rank) + '/') | |||
| ckpt_cb = ModelCheckpoint(config=ckpt_config, | |||
| directory=save_ckpt_path, | |||
| prefix='{}'.format(args.rank)) | |||
| prefix='{}'.format(config.rank)) | |||
| callbacks.append(ckpt_cb) | |||
| model.train(args.max_epoch, dataset, callbacks=callbacks) | |||
| model.train(config.max_epoch, dataset, callbacks=callbacks) | |||
| if __name__ == '__main__': | |||
| run_train() | |||
| @@ -18,9 +18,10 @@ import pytest | |||
| from mindspore import Tensor | |||
| from model_zoo.official.cv.vgg16.src.vgg import vgg16 | |||
| from model_zoo.official.cv.vgg16.src.config import cifar_cfg as cfg | |||
| from model_zoo.official.cv.vgg16.model_utils.config import get_config_static | |||
| from ..ut_filter import non_graph_engine | |||
| cfg = get_config_static() | |||
| @non_graph_engine | |||
| def test_vgg16(): | |||