Browse Source

fix md device_num

tags/v1.1.0
VectorSL 5 years ago
parent
commit
c6b5a7db03
1 changed files with 26 additions and 7 deletions
  1. +26
    -7
      model_zoo/official/cv/resnet/gpu_resnet_benchmark.py

+ 26
- 7
model_zoo/official/cv/resnet/gpu_resnet_benchmark.py View File

@@ -22,7 +22,7 @@ from mindspore import Tensor
from mindspore.nn.optim.momentum import Momentum
from mindspore.train.model import Model
from mindspore.context import ParallelMode
from mindspore.train.callback import Callback, LossMonitor, ModelCheckpoint, CheckpointConfig
from mindspore.train.callback import Callback, ModelCheckpoint, CheckpointConfig
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.communication.management import init, get_rank, get_group_size
from mindspore.train.serialization import load_checkpoint, load_param_into_net
@@ -59,13 +59,33 @@ class MyTimeMonitor(Callback):
def step_begin(self, run_context):
self.step_time = time.time()
def step_end(self, run_context):
cb_params = run_context.original_args()
loss = cb_params.net_outputs

if isinstance(loss, (tuple, list)):
if isinstance(loss[0], Tensor) and isinstance(loss[0].asnumpy(), np.ndarray):
loss = loss[0]

if isinstance(loss, Tensor) and isinstance(loss.asnumpy(), np.ndarray):
loss = np.mean(loss.asnumpy())

cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1

if isinstance(loss, float) and (np.isnan(loss) or np.isinf(loss)):
raise ValueError("epoch: {} step: {}. Invalid loss, terminating training.".format(
cb_params.cur_epoch_num, cur_step_in_epoch))
step_mseconds = (time.time() - self.step_time) * 1000
fps = self.batch_size / step_mseconds *1000 * self.size
print("Epoch time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True, end=" ")

def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="GPU", dtype="fp16"):
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True)
print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num, cur_step_in_epoch, loss),
"Epoch time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True)

def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="GPU", dtype="fp16",
device_num=1):
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True,
num_shards=device_num, shard_id=get_rank())
image_size = 224
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
std = [0.229 * 255, 0.224 * 255, 0.225 * 255]
@@ -185,8 +205,7 @@ def train():
if mode == context.PYNATIVE_MODE:
print_per_steps = 1
time_cb = MyTimeMonitor(total_batch, print_per_steps)
loss_cb = LossMonitor()
cb = [time_cb, loss_cb]
cb = [time_cb]
if save_ckpt:
config_ck = CheckpointConfig(save_checkpoint_steps=5 * step_size, keep_checkpoint_max=5)
ckpt_cb = ModelCheckpoint(prefix="resnet_benchmark", directory=ckpt_save_dir, config=config_ck)


Loading…
Cancel
Save