""" 示例选用的数据集是MnistDataset_mindspore.zip 数据集结构是: MnistDataset_mindspore.zip ├── test │ ├── t10k-images-idx3-ubyte │ └── t10k-labels-idx1-ubyte └── train ├── train-images-idx3-ubyte └── train-labels-idx1-ubyte 使用注意事项: 1、在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 2、用户需要调用c2net的python sdk包 """ import os import argparse from config import mnist_cfg as cfg from dataset_distributed import create_dataset_parallel from lenet import LeNet5 import mindspore.nn as nn from mindspore import context from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.train import Model from mindspore.context import ParallelMode from mindspore.communication.management import init, get_rank import time from c2net.context import prepare, upload_output parser = argparse.ArgumentParser(description='MindSpore Lenet Example') parser.add_argument( '--device_target', type=str, default="Ascend", choices=['Ascend', 'CPU'], help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU') parser.add_argument('--epoch_size', type=int, default=5, help='Training epochs.') if __name__ == "__main__": ###请在代码中加入args, unknown = parser.parse_known_args(),可忽略掉--ckpt_url参数报错等参数问题 args, unknown = parser.parse_known_args() device_num = int(os.getenv('RANK_SIZE')) #使用多卡时 # set device_id and init for multi-card training context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) context.reset_auto_parallel_context() context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) init() #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data local_rank=int(os.getenv('RANK_ID')) #初始化导入数据集和预训练模型到容器内,并行任务先让0卡拷贝数据,并用一个缓存文件标记0卡已prepare完成 if local_rank == 0: c2net_context = prepare() f = open("/cache/prepare_completed.txt", 'w') f.close() try: if os.path.exists("/cache/prepare_completed.txt"): print("prepare completed!") except Exception as e: print("prepare failed") while not os.path.exists("/cache/prepare_completed.txt"): time.sleep(1) c2net_context = prepare() #获取数据集路径 MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore" output_path = c2net_context.output_path ds_train = create_dataset_parallel(os.path.join(MnistDataset_mindspore_path, "train"), cfg.batch_size) network = LeNet5(cfg.num_classes) net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum) time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) if args.device_target != "Ascend": model = Model(network, net_loss, net_opt, metrics={"accuracy"}) else: model = Model(network, net_loss, net_opt, metrics={"accuracy"}, amp_level="O2") config_ck = CheckpointConfig( save_checkpoint_steps=cfg.save_checkpoint_steps, keep_checkpoint_max=cfg.keep_checkpoint_max) #Note that this method saves the model file on each card. You need to specify the save path on each card. # In this example, get_rank() is added to distinguish different paths. 
    outputDirectory = output_path + "/" + str(get_rank()) + "/"
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory=outputDirectory,
                                 config=config_ck)

    print("============== Starting Training ==============")
    epoch_size = cfg['epoch_size']
    if args.epoch_size:
        epoch_size = args.epoch_size
        print('epoch_size is: ', epoch_size)
    model.train(epoch_size, ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()])

    # Upload the training results to the Qizhi platform. The model files to be uploaded
    # must be stored under c2net_context.output_path.
    upload_output()
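    # Optional: a minimal evaluation sketch on the "test" split included in the dataset,
    # assuming create_dataset_parallel accepts the test directory with the same signature
    # it uses for the training data above. Each card scores its own data shard.
    ds_eval = create_dataset_parallel(os.path.join(MnistDataset_mindspore_path, "test"), cfg.batch_size)
    acc = model.eval(ds_eval)
    print("============== Accuracy: {} ==============".format(acc))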