
!6137 modify the model_zoo ckpt path

Merge pull request !6137 from TuDouNi/master

tags/v1.0.0
mindspore-ci-bot committed 5 years ago
commit 8b0793eb84
31 changed files with 58 additions and 37 deletions:

  1. model_zoo/official/cv/faster_rcnn/src/config.py (+1 -1)
  2. model_zoo/official/cv/faster_rcnn/train.py (+2 -1)
  3. model_zoo/official/cv/googlenet/train.py (+4 -2)
  4. model_zoo/official/cv/inceptionv3/src/config.py (+2 -2)
  5. model_zoo/official/cv/inceptionv3/train.py (+2 -1)
  6. model_zoo/official/cv/maskrcnn/src/config.py (+1 -1)
  7. model_zoo/official/cv/maskrcnn/train.py (+2 -1)
  8. model_zoo/official/cv/mobilenetv2/src/config.py (+2 -1)
  9. model_zoo/official/cv/mobilenetv2/src/utils.py (+4 -6)
  10. model_zoo/official/cv/nasnet/src/config.py (+1 -1)
  11. model_zoo/official/cv/nasnet/train.py (+2 -1)
  12. model_zoo/official/cv/resnet/train.py (+1 -1)
  13. model_zoo/official/cv/resnet_thor/train.py (+1 -1)
  14. model_zoo/official/cv/resnext50/train.py (+2 -1)
  15. model_zoo/official/cv/shufflenetv2/src/config.py (+1 -1)
  16. model_zoo/official/cv/shufflenetv2/train.py (+2 -1)
  17. model_zoo/official/cv/ssd/train.py (+2 -1)
  18. model_zoo/official/cv/vgg16/train.py (+2 -1)
  19. model_zoo/official/cv/warpctc/src/config.py (+1 -1)
  20. model_zoo/official/cv/warpctc/train.py (+2 -1)
  21. model_zoo/official/cv/yolov3_darknet53/train.py (+2 -1)
  22. model_zoo/official/cv/yolov3_darknet53_quant/train.py (+2 -1)
  23. model_zoo/official/cv/yolov3_resnet18/train.py (+1 -1)
  24. model_zoo/official/nlp/bert/run_pretrain.py (+2 -1)
  25. model_zoo/official/nlp/bert_thor/run_pretrain.py (+2 -1)
  26. model_zoo/official/nlp/transformer/train.py (+4 -1)
  27. model_zoo/official/recommend/deepfm/train.py (+1 -0)
  28. model_zoo/official/recommend/wide_and_deep/train_and_eval_auto_parallel.py (+1 -1)
  29. model_zoo/official/recommend/wide_and_deep/train_and_eval_distribute.py (+2 -1)
  30. model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server.py (+2 -1)
  31. model_zoo/official/recommend/wide_and_deep_multitable/train_and_eval_distribute.py (+2 -1)
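
Every diff below applies the same fix: checkpoints move out of a single shared directory and into a per-rank subdirectory (ckpt_0/, ckpt_1/, ...) so that the ranks of a distributed job do not overwrite each other's files. A minimal sketch of the shared pattern, assuming a working MindSpore install; the prefix, step counts, and base path here are placeholders, and get_rank() requires that init() from mindspore.communication.management has already been called:

    import os
    from mindspore.communication.management import get_rank
    from mindspore.train.callback import CheckpointConfig, ModelCheckpoint

    base_path = "./"   # each script below takes this from its own config
    rank = get_rank()  # global rank of this process in the distributed job

    config_ck = CheckpointConfig(save_checkpoint_steps=1000, keep_checkpoint_max=10)
    # Per-rank directory, e.g. ./ckpt_0/ for rank 0, ./ckpt_1/ for rank 1, ...
    save_ckpt_path = os.path.join(base_path, "ckpt_" + str(rank) + "/")
    ckpt_cb = ModelCheckpoint(prefix="example", directory=save_ckpt_path, config=config_ck)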

model_zoo/official/cv/faster_rcnn/src/config.py (+1 -1)

@@ -132,7 +132,7 @@ config = ed({
     "save_checkpoint": True,
     "save_checkpoint_epochs": 1,
     "keep_checkpoint_max": 10,
-    "save_checkpoint_path": "./checkpoint",
+    "save_checkpoint_path": "./",

     "mindrecord_dir": "../MindRecord_COCO_TRAIN",
     "coco_root": "./cocodataset/",


model_zoo/official/cv/faster_rcnn/train.py (+2 -1)

@@ -136,7 +136,8 @@ if __name__ == '__main__':
     if config.save_checkpoint:
         ckptconfig = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * dataset_size,
                                       keep_checkpoint_max=config.keep_checkpoint_max)
-        ckpoint_cb = ModelCheckpoint(prefix='faster_rcnn', directory=config.save_checkpoint_path, config=ckptconfig)
+        save_checkpoint_path = os.path.join(config.save_checkpoint_path, "ckpt_" + str(rank) + "/")
+        ckpoint_cb = ModelCheckpoint(prefix='faster_rcnn', directory=save_checkpoint_path, config=ckptconfig)
         cb += [ckpoint_cb]

     model = Model(net)


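Note that the PR mixes two spellings of the path construction: os.path.join(...) as above, and plain string concatenation in other files (e.g. the resnet and bert scripts). The two agree only when the base path already ends in a separator; a quick illustration with made-up paths:

    import os

    # os.path.join inserts the missing separator; concatenation does not.
    print(os.path.join("./output", "ckpt_0/"))   # ./output/ckpt_0/
    print("./output" + "ckpt_0/")                # ./outputckpt_0/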
model_zoo/official/cv/googlenet/train.py (+4 -2)

@@ -106,6 +106,7 @@ if __name__ == '__main__':
     context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target)
     device_num = int(os.environ.get("DEVICE_NUM", 1))

+    rank = 0
     if device_target == "Ascend":
         if args_opt.device_id is not None:
             context.set_context(device_id=args_opt.device_id)
@@ -117,6 +118,7 @@ if __name__ == '__main__':
             context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                               gradients_mean=True)
             init()
+            rank = get_rank()
     elif device_target == "GPU":
         init()

@@ -124,6 +126,7 @@ if __name__ == '__main__':
         context.reset_auto_parallel_context()
         context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                           gradients_mean=True)
+        rank = get_rank()
     else:
         raise ValueError("Unsupported platform.")

@@ -200,14 +203,13 @@ if __name__ == '__main__':
     if device_target == "Ascend":
         model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
                       amp_level="O2", keep_batchnorm_fp32=False, loss_scale_manager=loss_scale_manager)
-        ckpt_save_dir = "./"
     else: # GPU
         model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
                       amp_level="O2", keep_batchnorm_fp32=True, loss_scale_manager=loss_scale_manager)
-        ckpt_save_dir = "./ckpt_" + str(get_rank()) + "/"

     config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 5, keep_checkpoint_max=cfg.keep_checkpoint_max)
     time_cb = TimeMonitor(data_size=batch_num)
+    ckpt_save_dir = "./ckpt_" + str(rank) + "/"
     ckpoint_cb = ModelCheckpoint(prefix="train_googlenet_" + args_opt.dataset_name, directory=ckpt_save_dir,
                                  config=config_ck)
     loss_cb = LossMonitor()

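The googlenet change also collapses the two target-specific checkpoint directories into one rank-based path: rank is initialized to 0 and only reassigned via get_rank() inside the distributed branches, so a single-device run, where init() is never called, still writes to a valid ./ckpt_0/ directory.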

model_zoo/official/cv/inceptionv3/src/config.py (+2 -2)

@@ -38,7 +38,7 @@ config_gpu = edict({
    'momentum': 0.9,
    'opt_eps': 1.0,
    'keep_checkpoint_max': 10,
-   'ckpt_path': './checkpoint/',
+   'ckpt_path': './',
    'is_save_on_master': 0,
    'dropout_keep_prob': 0.5,
    'has_bias': True,
@@ -65,7 +65,7 @@ config_ascend = edict({
    'momentum': 0.9,
    'opt_eps': 1.0,
    'keep_checkpoint_max': 10,
-   'ckpt_path': './checkpoint/',
+   'ckpt_path': './',
    'is_save_on_master': 0,
    'dropout_keep_prob': 0.8,
    'has_bias': False,


model_zoo/official/cv/inceptionv3/train.py (+2 -1)

@@ -115,7 +115,8 @@ if __name__ == '__main__':
     time_cb = TimeMonitor(data_size=batches_per_epoch)
     callbacks = [loss_cb, time_cb]
     config_ck = CheckpointConfig(save_checkpoint_steps=batches_per_epoch, keep_checkpoint_max=cfg.keep_checkpoint_max)
-    ckpoint_cb = ModelCheckpoint(prefix=f"inceptionv3-rank{cfg.rank}", directory=cfg.ckpt_path, config=config_ck)
+    save_ckpt_path = os.path.join(cfg.ckpt_path, 'ckpt_' + str(cfg.rank) + '/')
+    ckpoint_cb = ModelCheckpoint(prefix=f"inceptionv3-rank{cfg.rank}", directory=save_ckpt_path, config=config_ck)
     if args_opt.is_distributed & cfg.is_save_on_master:
         if cfg.rank == 0:
             callbacks.append(ckpoint_cb)


model_zoo/official/cv/maskrcnn/src/config.py (+1 -1)

@@ -139,7 +139,7 @@ config = ed({
     "save_checkpoint": True,
     "save_checkpoint_epochs": 1,
     "keep_checkpoint_max": 12,
-    "save_checkpoint_path": "./checkpoint",
+    "save_checkpoint_path": "./",

     "mindrecord_dir": "/home/mask_rcnn/MindRecord_COCO2017_Train",
     "coco_root": "/home/mask_rcnn/coco2017/",


model_zoo/official/cv/maskrcnn/train.py (+2 -1)

@@ -131,7 +131,8 @@ if __name__ == '__main__':
     if config.save_checkpoint:
         ckptconfig = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * dataset_size,
                                       keep_checkpoint_max=config.keep_checkpoint_max)
-        ckpoint_cb = ModelCheckpoint(prefix='mask_rcnn', directory=config.save_checkpoint_path, config=ckptconfig)
+        save_checkpoint_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(rank) + '/')
+        ckpoint_cb = ModelCheckpoint(prefix='mask_rcnn', directory=save_checkpoint_path, config=ckptconfig)
         cb += [ckpoint_cb]

     model = Model(net)


model_zoo/official/cv/mobilenetv2/src/config.py (+2 -1)

@@ -37,7 +37,8 @@ def set_config(args):
        "save_checkpoint_epochs": 1,
        "keep_checkpoint_max": 20,
        "save_checkpoint_path": "./checkpoint",
-       "platform": args.platform
+       "platform": args.platform,
+       "run_distribute": False
    })
    config_gpu = ed({
        "num_classes": 1000,


model_zoo/official/cv/mobilenetv2/src/utils.py (+4 -6)

@@ -76,14 +76,12 @@ def config_ckpoint(config, lr, step_size):
    if config.save_checkpoint:
        config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                                     keep_checkpoint_max=config.keep_checkpoint_max)
-       ckpt_save_dir = config.save_checkpoint_path
-       if config.platform == "GPU":
-           if config.run_distribute:
-               ckpt_save_dir += "ckpt_" + str(get_rank()) + "/"
-           else:
-               ckpt_save_dir += "ckpt_" + "/"
+       rank = 0
+       if config.run_distribute:
+           rank = get_rank()
+       ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(rank) + "/"
        ckpt_cb = ModelCheckpoint(prefix="mobilenetV2", directory=ckpt_save_dir, config=config_ck)
        cb += [ckpt_cb]
    return cb
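
The run_distribute key added to the mobilenetv2 config above is what keeps this refactor safe: config_ckpoint now reads config.run_distribute on every platform rather than only on GPU, and an EasyDict raises AttributeError when an absent key is accessed as an attribute. A two-line illustration (easydict assumed installed, as in these configs):

    from easydict import EasyDict as ed

    cfg = ed({"platform": "CPU"})
    cfg.run_distribute   # raises AttributeError: the key was never set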

model_zoo/official/cv/nasnet/src/config.py (+1 -1)

@@ -25,7 +25,7 @@ nasnet_a_mobile_config_gpu = edict({
    'work_nums': 8,
    'epoch_size': 312,
    'keep_checkpoint_max': 100,
-   'ckpt_path': './nasnet_a_mobile_checkpoint/',
+   'ckpt_path': './',
    'is_save_on_master': 0,

    ### Dataset Config


model_zoo/official/cv/nasnet/train.py (+2 -1)

@@ -102,7 +102,8 @@ if __name__ == '__main__':
     time_cb = TimeMonitor(data_size=batches_per_epoch)
     callbacks = [loss_cb, time_cb]
     config_ck = CheckpointConfig(save_checkpoint_steps=batches_per_epoch, keep_checkpoint_max=cfg.keep_checkpoint_max)
-    ckpoint_cb = ModelCheckpoint(prefix=f"nasnet-a-mobile-rank{cfg.rank}", directory=cfg.ckpt_path, config=config_ck)
+    save_ckpt_path = os.path.join(cfg.ckpt_path, 'ckpt_' + str(cfg.rank) + '/')
+    ckpoint_cb = ModelCheckpoint(prefix=f"nasnet-a-mobile-rank{cfg.rank}", directory=save_ckpt_path, config=config_ck)
     if args_opt.is_distributed & cfg.is_save_on_master:
         if cfg.rank == 0:
             callbacks.append(ckpoint_cb)


model_zoo/official/cv/resnet/train.py (+1 -1)

@@ -90,7 +90,7 @@ if __name__ == '__main__':
                                           gradients_mean=True)
         if args_opt.net == "resnet50":
             context.set_auto_parallel_context(all_reduce_fusion_config=[85, 160])
-        ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"
+        ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"

     # create dataset
     dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=1,


model_zoo/official/cv/resnet_thor/train.py (+1 -1)

@@ -100,7 +100,7 @@ if __name__ == '__main__':
         init()
         context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
                                           gradients_mean=True, all_reduce_fusion_config=[107])
-        ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"
+        ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank()) + "/"

     # create dataset
     dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=1,


model_zoo/official/cv/resnext50/train.py (+2 -1)

@@ -280,8 +280,9 @@ def train(cloud_args=None):
        if args.rank_save_ckpt_flag:
            ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
                                           keep_checkpoint_max=args.ckpt_save_max)
+           save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/')
            ckpt_cb = ModelCheckpoint(config=ckpt_config,
-                                     directory=args.outputs_dir,
+                                     directory=save_ckpt_path,
                                      prefix='{}'.format(args.rank))
            callbacks.append(ckpt_cb)




model_zoo/official/cv/shufflenetv2/src/config.py (+1 -1)

@@ -25,7 +25,7 @@ config_gpu = edict({
    'work_nums': 8,
    'epoch_size': 250,
    'keep_checkpoint_max': 100,
-   'ckpt_path': './checkpoint/',
+   'ckpt_path': './',
    'is_save_on_master': 0,

    ### Dataset Config


model_zoo/official/cv/shufflenetv2/train.py (+2 -1)

@@ -110,7 +110,8 @@ if __name__ == '__main__':
     time_cb = TimeMonitor(data_size=batches_per_epoch)
     callbacks = [loss_cb, time_cb]
     config_ck = CheckpointConfig(save_checkpoint_steps=batches_per_epoch, keep_checkpoint_max=cfg.keep_checkpoint_max)
-    ckpoint_cb = ModelCheckpoint(prefix=f"shufflenet-rank{cfg.rank}", directory=cfg.ckpt_path, config=config_ck)
+    save_ckpt_path = os.path.join(cfg.ckpt_path, 'ckpt_' + str(cfg.rank) + '/')
+    ckpoint_cb = ModelCheckpoint(prefix=f"shufflenet-rank{cfg.rank}", directory=save_ckpt_path, config=config_ck)
     if args_opt.is_distributed & cfg.is_save_on_master:
         if cfg.rank == 0:
             callbacks.append(ckpoint_cb)


model_zoo/official/cv/ssd/train.py (+2 -1)

@@ -118,7 +118,8 @@ def main():

    # checkpoint
    ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
-   ckpoint_cb = ModelCheckpoint(prefix="ssd", directory=None, config=ckpt_config)
+   save_ckpt_path = './ckpt_' + str(rank) + '/'
+   ckpoint_cb = ModelCheckpoint(prefix="ssd", directory=save_ckpt_path, config=ckpt_config)

    if args_opt.pre_trained:
        if args_opt.pre_trained_epoch_size <= 0:


model_zoo/official/cv/vgg16/train.py (+2 -1)

@@ -226,8 +226,9 @@ if __name__ == '__main__':
        if args.rank_save_ckpt_flag:
            ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
                                           keep_checkpoint_max=args.ckpt_save_max)
+           save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/')
            ckpt_cb = ModelCheckpoint(config=ckpt_config,
-                                     directory=args.outputs_dir,
+                                     directory=save_ckpt_path,
                                      prefix='{}'.format(args.rank))
            callbacks.append(ckpt_cb)




model_zoo/official/cv/warpctc/src/config.py (+1 -1)

@@ -27,5 +27,5 @@ config = EasyDict({
    "save_checkpoint": True,
    "save_checkpoint_steps": 97,
    "keep_checkpoint_max": 30,
-   "save_checkpoint_path": "./checkpoint",
+   "save_checkpoint_path": "./",
})

model_zoo/official/cv/warpctc/train.py (+2 -1)

@@ -98,6 +98,7 @@ if __name__ == '__main__':
    if cf.save_checkpoint:
        config_ck = CheckpointConfig(save_checkpoint_steps=cf.save_checkpoint_steps,
                                     keep_checkpoint_max=cf.keep_checkpoint_max)
-       ckpt_cb = ModelCheckpoint(prefix="warpctc", directory=cf.save_checkpoint_path + str(rank), config=config_ck)
+       save_ckpt_path = os.path.join(cf.save_checkpoint_path, 'ckpt_' + str(rank) + '/')
+       ckpt_cb = ModelCheckpoint(prefix="warpctc", directory=save_ckpt_path, config=config_ck)
        callbacks.append(ckpt_cb)
    model.train(cf.epoch_size, dataset, callbacks=callbacks)

model_zoo/official/cv/yolov3_darknet53/train.py (+2 -1)

@@ -287,8 +287,9 @@ def train():
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                       keep_checkpoint_max=ckpt_max_num)
+       save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
-                                 directory=args.outputs_dir,
+                                 directory=save_ckpt_path,
                                  prefix='{}'.format(args.rank))
        cb_params = _InternalCallbackParam()
        cb_params.train_network = network


model_zoo/official/cv/yolov3_darknet53_quant/train.py (+2 -1)

@@ -291,8 +291,9 @@ def train():
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                       keep_checkpoint_max=ckpt_max_num)
+       save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
-                                 directory=args.outputs_dir,
+                                 directory=save_ckpt_path,
                                  prefix='{}'.format(args.rank))
        cb_params = _InternalCallbackParam()
        cb_params.train_network = network


model_zoo/official/cv/yolov3_resnet18/train.py (+1 -1)

@@ -137,7 +137,7 @@ def main():

    # checkpoint
    ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
-   ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory=None, config=ckpt_config)
+   ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory='./ckpt_' + str(rank) + '/', config=ckpt_config)

    if args_opt.pre_trained:
        if args_opt.pre_trained_epoch_size <= 0:


model_zoo/official/nlp/bert/run_pretrain.py (+2 -1)

@@ -20,6 +20,7 @@ python run_pretrain.py
 import os
 import argparse
 import mindspore.communication.management as D
+from mindspore.communication.management import get_rank
 import mindspore.common.dtype as mstype
 from mindspore import context
 from mindspore.train.model import Model
@@ -82,7 +83,7 @@ def run_pretrain():
        D.init()
        device_num = D.get_group_size()
        rank = D.get_rank()
-       ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(rank) + '/'
+       ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/'

        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,


model_zoo/official/nlp/bert_thor/run_pretrain.py (+2 -1)

@@ -28,6 +28,7 @@ from src.model_thor import Model
 from src.utils import LossCallBack, BertLearningRate
 import mindspore.common.dtype as mstype
 import mindspore.communication.management as D
+from mindspore.communication.management import get_rank
 from mindspore import context
 from mindspore import log as logger
 from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecay
@@ -84,7 +85,7 @@ def run_pretrain():
        D.init()
        device_num = D.get_group_size()
        rank = D.get_rank()
-       ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(rank) + '/'
+       ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/'

        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,


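In both BERT scripts the surrounding code already holds the rank in a local variable (rank = D.get_rank()); the rewritten line calls the newly imported get_rank() instead. Both names resolve to the same function in mindspore.communication.management, so the value is identical and the edit only standardizes on the helper used across this PR. A sketch of the equivalence, which assumes a distributed job where D.init() can succeed:

    import mindspore.communication.management as D
    from mindspore.communication.management import get_rank

    D.init()                           # required before querying the rank
    assert D.get_rank() == get_rank()  # same function, same global rank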
model_zoo/official/nlp/transformer/train.py (+4 -1)

@@ -14,6 +14,7 @@
 # ============================================================================
 """Transformer training script."""

+import os
 import time
 import argparse
 import ast
@@ -27,6 +28,7 @@ from mindspore.train.callback import CheckpointConfig, ModelCheckpoint
 from mindspore.train.callback import Callback, TimeMonitor
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 import mindspore.communication.management as D
+from mindspore.communication.management import get_rank
 from mindspore.context import ParallelMode
 from mindspore import context
 from mindspore.common import set_seed
@@ -125,6 +127,7 @@ def run_transformer_train():
                                          parameter_broadcast=True, device_num=device_num)
        D.init()
        rank_id = args.device_id % device_num
+       save_ckpt_path = os.path.join(args.save_checkpoint_path, 'ckpt_' + str(get_rank()) + '/')
    else:
        device_num = 1
        rank_id = 0
@@ -153,7 +156,7 @@ def run_transformer_train():
    if device_num == 1 or (device_num > 1 and rank_id == 0):
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
                                       keep_checkpoint_max=args.save_checkpoint_num)
-       ckpoint_cb = ModelCheckpoint(prefix='transformer', directory=args.save_checkpoint_path, config=ckpt_config)
+       ckpoint_cb = ModelCheckpoint(prefix='transformer', directory=save_ckpt_path, config=ckpt_config)
        callbacks.append(ckpoint_cb)

    if args.enable_lossscale == "true":

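One thing to note in the transformer diff: the new save_ckpt_path is assigned only inside the distributed branch, while the ModelCheckpoint below it also runs when device_num == 1, where the name would be unbound. A hypothetical helper (not part of this PR) that covers both paths, using the rank_id the script sets in either branch:

    import os

    def resolve_ckpt_dir(base_path, rank_id):
        """Per-rank checkpoint directory for single-device (rank_id == 0)
        and distributed runs alike."""
        return os.path.join(base_path, 'ckpt_' + str(rank_id) + '/')

    print(resolve_ckpt_dir('./checkpoints', 0))   # ./checkpoints/ckpt_0/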

model_zoo/official/recommend/deepfm/train.py (+1 -0)

@@ -104,6 +104,7 @@ if __name__ == '__main__':
    if train_config.save_checkpoint:
        if rank_size:
            train_config.ckpt_file_name_prefix = train_config.ckpt_file_name_prefix + str(get_rank())
+           args_opt.ckpt_path = os.path.join(args_opt.ckpt_path, 'ckpt_' + str(get_rank()) + '/')
        if args_opt.device_target == "GPU":
            config_ck = CheckpointConfig(save_checkpoint_steps=steps_size,
                                         keep_checkpoint_max=train_config.keep_checkpoint_max)


model_zoo/official/recommend/wide_and_deep/train_and_eval_auto_parallel.py (+1 -1)

@@ -127,7 +127,7 @@ def train_and_eval(config):
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
                                  keep_checkpoint_max=5, integrated_save=False)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
-                                directory=config.ckpt_path, config=ckptconfig)
+                                directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/', config=ckptconfig)
    context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
    callback_list = [TimeMonitor(
        ds_train.get_dataset_size()), eval_callback, callback]


model_zoo/official/recommend/wide_and_deep/train_and_eval_distribute.py (+2 -1)

@@ -102,7 +102,8 @@ def train_and_eval(config):
    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
-                                directory=config.ckpt_path, config=ckptconfig)
+                                directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
+                                config=ckptconfig)
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]


model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server.py (+2 -1)

@@ -104,7 +104,8 @@ def train_and_eval(config):
    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
-                                directory=config.ckpt_path, config=ckptconfig)
+                                directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
+                                config=ckptconfig)
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
    if get_rank() == 0:
        callback_list.append(ckpoint_cb)


model_zoo/official/recommend/wide_and_deep_multitable/train_and_eval_distribute.py (+2 -1)

@@ -98,7 +98,8 @@ def train_and_eval(config):
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*config.epochs,
                                  keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
-                                directory=config.ckpt_path, config=ckptconfig)
+                                directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
+                                config=ckptconfig)
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
    if int(get_rank()) == 0:
        callback_list.append(ckpoint_cb)

