From f1cec60dc85f63bb2745f221888940f7745f38b2 Mon Sep 17 00:00:00 2001 From: meixiaowei Date: Sun, 26 Apr 2020 15:35:11 +0800 Subject: [PATCH 1/3] upload resnet101 scripts --- example/resnet101_imagenet/README.md | 139 +++++++++++++ example/resnet101_imagenet/config.py | 42 ++++ example/resnet101_imagenet/crossentropy.py | 36 ++++ example/resnet101_imagenet/dataset.py | 89 +++++++++ example/resnet101_imagenet/eval.py | 84 ++++++++ example/resnet101_imagenet/lr_generator.py | 113 +++++++++++ .../run_distribute_train.sh | 54 ++++++ example/resnet101_imagenet/run_infer.sh | 52 +++++ .../run_standalone_train.sh | 46 +++++ example/resnet101_imagenet/train.py | 113 +++++++++++ example/resnet101_imagenet/var_init.py | 183 ++++++++++++++++++ mindspore/model_zoo/resnet.py | 21 ++ 12 files changed, 972 insertions(+) create mode 100644 example/resnet101_imagenet/README.md create mode 100755 example/resnet101_imagenet/config.py create mode 100755 example/resnet101_imagenet/crossentropy.py create mode 100755 example/resnet101_imagenet/dataset.py create mode 100755 example/resnet101_imagenet/eval.py create mode 100755 example/resnet101_imagenet/lr_generator.py create mode 100755 example/resnet101_imagenet/run_distribute_train.sh create mode 100755 example/resnet101_imagenet/run_infer.sh create mode 100755 example/resnet101_imagenet/run_standalone_train.sh create mode 100755 example/resnet101_imagenet/train.py create mode 100755 example/resnet101_imagenet/var_init.py diff --git a/example/resnet101_imagenet/README.md b/example/resnet101_imagenet/README.md new file mode 100644 index 0000000000..bc653675f2 --- /dev/null +++ b/example/resnet101_imagenet/README.md @@ -0,0 +1,139 @@ +# ResNet101 Example + +## Description + +This is an example of training ResNet101 with ImageNet dataset in MindSpore. + +## Requirements + +- Install [MindSpore](https://www.mindspore.cn/install/en). + +- Download the dataset [ImageNet](http://image-net.org/download). 
+ +> Unzip the ImageNet dataset to any path you want, the folder should include train and eval dataset as follows: + +``` +. +└─dataset + ├─ilsvrc + │ + └─validation_preprocess +``` + +## Example structure + +```shell +. +├── crossentropy.py # CrossEntropy loss function +├── var_init.py # weight initial +├── config.py # parameter configuration +├── dataset.py # data preprocessing +├── eval.py # eval net +├── lr_generator.py # generate learning rate +├── run_distribute_train.sh # launch distributed training(8p) +├── run_infer.sh # launch evaluating +├── run_standalone_train.sh # launch standalone training(1p) +└── train.py # train net +``` + +## Parameter configuration + +Parameters for both training and evaluating can be set in config.py. + +``` +"class_num": 1001, # dataset class number +"batch_size": 32, # batch size of input tensor +"loss_scale": 1024, # loss scale +"momentum": 0.9, # momentum optimizer +"weight_decay": 1e-4, # weight decay +"epoch_size": 120, # epoch sizes for training +"buffer_size": 1000, # number of queue size in data preprocessing +"image_height": 224, # image height +"image_width": 224, # image width +"save_checkpoint": True, # whether save checkpoint or not +"save_checkpoint_steps": 500, # the step interval between two checkpoints. 
By default, the last checkpoint will be saved after the last step +"keep_checkpoint_max": 40, # only keep the last keep_checkpoint_max checkpoints +"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path +"lr_init": 0.01, # initial learning rate +"lr_end": 0.00001, # final learning rate +"lr_max": 0.1, # maximum learning rate +"warmup_epochs": 0, # number of warmup epochs +"lr_decay_mode": "cosine", # decay mode for generating learning rate +"label_smooth": 1, # whether to apply label smoothing (0 disables it) +"label_smooth_factor": 0.1, # smoothing factor used when label_smooth is enabled +"lr": 0.1 # base learning rate +``` + +## Running the example + +### Train + +#### Usage + +``` +# distributed training +sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] + +# standalone training +sh run_standalone_train.sh [DATASET_PATH] +``` + +#### Launch + +```bash +# distributed training example(8p) +sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc + +# standalone training example(1p) +sh run_standalone_train.sh dataset/ilsvrc +``` + +> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html). + +#### Result + +Training results are stored in the example path, in a folder whose name begins with "train" or "train_parallel". There you can find the checkpoint files, together with results like the following in the log. + + +``` +# distribute training result(8p) +epoch: 1 step: 5004, loss is 4.805483 +epoch: 2 step: 5004, loss is 3.2121816 +epoch: 3 step: 5004, loss is 3.429647 +epoch: 4 step: 5004, loss is 3.3667371 +epoch: 5 step: 5004, loss is 3.1718972 +... +epoch: 67 step: 5004, loss is 2.2768745 +epoch: 68 step: 5004, loss is 1.7223864 +epoch: 69 step: 5004, loss is 2.0665488 +epoch: 70 step: 5004, loss is 1.8717369 +... 
+``` + +### Infer + +#### Usage + +``` +# infer +sh run_infer.sh [VALIDATION_DATASET_PATH] [CHECKPOINT_PATH] +``` + +#### Launch + +```bash +# infer with checkpoint +sh run_infer.sh dataset/validation_preprocess/ train_parallel0/resnet-120_5004.ckpt + +``` + +> Checkpoints are produced during the training process. + + +#### Result + +Inference results are stored in the example path, in a folder named "infer". There you can find results like the following in the log. + +``` +result: {'top_5_accuracy': 0.9429417413572343, 'top_1_accuracy': 0.7853513124199744} ckpt=train_parallel0/resnet-120_5004.ckpt +``` diff --git a/example/resnet101_imagenet/config.py b/example/resnet101_imagenet/config.py new file mode 100755 index 0000000000..0ad37c8678 --- /dev/null +++ b/example/resnet101_imagenet/config.py @@ -0,0 +1,42 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# ============================================================================
"""
network config setting, will be used in train.py and eval.py
"""
from easydict import EasyDict as ed

# Hyper-parameters shared by the training and evaluation scripts.
# EasyDict exposes every key as an attribute (e.g. config.batch_size).
config = ed({
    "class_num": 1001,              # dataset class number
    "batch_size": 32,               # batch size of input tensor
    "loss_scale": 1024,             # loss scale
    "momentum": 0.9,                # momentum optimizer
    "weight_decay": 1e-4,           # weight decay
    "epoch_size": 120,              # epoch sizes for training
    "buffer_size": 1000,            # shuffle buffer size used in data preprocessing
    "image_height": 224,            # image height
    "image_width": 224,             # image width
    "save_checkpoint": True,        # whether save checkpoint or not
    "save_checkpoint_steps": 500,   # the step interval between two checkpoints
    "keep_checkpoint_max": 40,      # only keep the last keep_checkpoint_max checkpoints
    "save_checkpoint_path": "./",   # path to save checkpoint relative to the executed path
    "lr_init": 0.01,                # initial learning rate (passed to get_lr)
    "lr_end": 0.00001,              # final learning rate (passed to get_lr)
    "lr_max": 0.1,                  # maximum learning rate (passed to get_lr)
    "warmup_epochs": 0,             # number of warmup epochs
    "lr_decay_mode": "cosine",      # decay mode for generating learning rate
    "label_smooth": 1,              # falsy value makes the scripts reset label_smooth_factor to 0.0
    "label_smooth_factor": 0.1,     # smoothing factor handed to the CrossEntropy loss
    "lr": 0.1                       # base learning rate used by the cosine schedule
})
# ============================================================================
from mindspore.nn.loss.loss import _Loss
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore import Tensor
from mindspore.common import dtype as mstype
import mindspore.nn as nn


class CrossEntropy(_Loss):
    """Loss function for the network: softmax cross entropy on smoothed one-hot labels.

    Args:
        smooth_factor (float): label-smoothing amount; the true class gets
            ``1 - smooth_factor`` and the remainder is spread over the other classes.
        num_classes (int): number of classes used to split the smoothed mass.
    """

    def __init__(self, smooth_factor=0., num_classes=1001):
        super(CrossEntropy, self).__init__()
        # Values written into the one-hot encoding for the true / other classes.
        true_class_value = 1.0 - smooth_factor
        other_class_value = 1.0 * smooth_factor / (num_classes -1)
        self.onehot = P.OneHot()
        self.on_value = Tensor(true_class_value, mstype.float32)
        self.off_value = Tensor(other_class_value, mstype.float32)
        self.ce = nn.SoftmaxCrossEntropyWithLogits()
        self.mean = P.ReduceMean(False)

    def construct(self, logit, label):
        """Encode ``label`` as smoothed one-hot, then reduce the cross entropy over axis 0."""
        depth = F.shape(logit)[1]
        smoothed = self.onehot(label, depth, self.on_value, self.off_value)
        return self.mean(self.ce(logit, smoothed), 0)
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset.transforms.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
from config import config


def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
    """
    create a train or evaluate dataset

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32

    Returns:
        dataset
    """
    # The launch scripts export RANK_SIZE / RANK_ID; fall back to a single
    # device (rank 0) instead of crashing on int(None) when they are unset.
    device_num = int(os.getenv("RANK_SIZE", "1"))
    rank_id = int(os.getenv("RANK_ID", "0"))

    if device_num == 1:
        ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True)
    else:
        # Each device reads only its own shard of the image folder.
        ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True,
                                     num_shards=device_num, shard_id=rank_id)
    resize_height = 224
    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    decode_op = C.Decode()
    rescale_op = C.Rescale(rescale, shift)
    normalize_op = C.Normalize((0.475, 0.451, 0.392), (0.275, 0.267, 0.278))
    changeswap_op = C.HWC2CHW()

    if do_train:
        random_resize_crop_op = C.RandomResizedCrop(resize_height, (0.08, 1.0), (0.75, 1.33), max_attempts=100)
        # The flip probability used to be rank_id / (rank_id + 1), i.e. 0 on
        # rank 0 and 0.875 on rank 7; every shard now gets the same 0.5.
        horizontal_flip_op = C.RandomHorizontalFlip(0.5)
        trans = [decode_op,
                 random_resize_crop_op,
                 horizontal_flip_op,
                 rescale_op,
                 normalize_op,
                 changeswap_op]
    else:
        resize_op_256 = C.Resize((256, 256))
        center_crop = C.CenterCrop(224)
        trans = [decode_op,
                 resize_op_256,
                 center_crop,
                 rescale_op,
                 normalize_op,
                 changeswap_op]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(input_columns="image", operations=trans)
    ds = ds.map(input_columns="label", operations=type_cast_op)

    # apply shuffle operations
    ds = ds.shuffle(buffer_size=config.buffer_size)
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)

    return ds
import os
import argparse
import random
import numpy as np
from dataset import create_dataset
from config import config
from mindspore import context
from mindspore.model_zoo.resnet import resnet101
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.train.model import Model, ParallelMode
from mindspore.train.serialization import load_checkpoint, load_param_into_net
import mindspore.dataset.engine as de
from mindspore.communication.management import init
from crossentropy import CrossEntropy

random.seed(1)
np.random.seed(1)
de.config.set_seed(1)


def _str2bool(value):
    """Parse a command-line boolean.

    ``type=bool`` would treat every non-empty string, including "False", as True.
    """
    if isinstance(value, bool):
        return value
    lowered = value.lower()
    if lowered in ('true', '1', 'yes', 'y'):
        return True
    if lowered in ('false', '0', 'no', 'n'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got {!r}'.format(value))


parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--run_distribute', type=_str2bool, default=False, help='Run distribute')
parser.add_argument('--device_num', type=int, default=1, help='Device num.')
parser.add_argument('--do_train', type=_str2bool, default=False, help='Do train or not.')
parser.add_argument('--do_eval', type=_str2bool, default=True, help='Do eval or not.')
parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
args_opt = parser.parse_args()

# run_infer.sh exports DEVICE_ID; default to device 0 rather than failing on int(None).
device_id = int(os.getenv('DEVICE_ID', '0'))

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=device_id)
context.set_context(enable_task_sink=True)
context.set_context(enable_loop_sink=True)
context.set_context(enable_mem_reuse=True)

if __name__ == '__main__':
    if args_opt.do_eval:
        context.set_context(enable_hccl=False)
    else:
        if args_opt.run_distribute:
            context.set_context(enable_hccl=True)
            context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                              mirror_mean=True, parameter_broadcast=True)
            auto_parallel_context().set_all_reduce_fusion_split_indices([140])
            init()
        else:
            context.set_context(enable_hccl=False)

    net = resnet101(class_num=config.class_num)

    # A falsy label_smooth switches smoothing off by zeroing the factor.
    if not config.label_smooth:
        config.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num)

    if args_opt.do_eval:
        dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size)

        if args_opt.checkpoint_path:
            param_dict = load_checkpoint(args_opt.checkpoint_path)
            load_param_into_net(net, param_dict)
        net.set_train(False)

        model = Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
        res = model.eval(dataset)
        print("result:", res, "ckpt=", args_opt.checkpoint_path)
# ============================================================================
"""learning rate generator"""
import numpy as np
import math


def linear_warmup_lr(current_step, warmup_steps, base_lr, init_lr):
    """Return the warmup learning rate at ``current_step`` on a straight line from ``init_lr`` to ``base_lr``."""
    per_step_gain = (float(base_lr) - float(init_lr)) / float(warmup_steps)
    return float(init_lr) + per_step_gain * current_step


def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch):
    """
    generate learning rate array with cosine

    Args:
       lr(float): base learning rate
       steps_per_epoch(int): steps size of one epoch
       warmup_epochs(int): number of warmup epochs
       max_epoch(int): total epochs of training
    Returns:
       np.array, learning rate array
    """
    peak_lr = lr
    n_total = int(max_epoch * steps_per_epoch)
    n_warmup = int(warmup_epochs * steps_per_epoch)
    n_decay = n_total - n_warmup

    def _rate_at(step):
        # Linear ramp from 0 during warmup, then a cosine curve damped by a linear factor.
        if step < n_warmup:
            return linear_warmup_lr(step + 1, n_warmup, peak_lr, 0)
        linear_part = (n_total - step) / n_decay
        cosine_part = 0.5 * (1 + math.cos(math.pi * 2 * 0.47 * step / n_decay))
        scale = linear_part * cosine_part + 0.00001
        return peak_lr * scale

    return np.array([_rate_at(step) for step in range(n_total)]).astype(np.float32)


def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode):
    """
    generate learning rate array

    Args:
       global_step(int): total steps of the training
       lr_init(float): init learning rate
       lr_end(float): end learning rate
       lr_max(float): max learning rate
       warmup_epochs(int): number of warmup epochs
       total_epochs(int): total epoch of training
       steps_per_epoch(int): steps of one epoch
       lr_decay_mode(string): learning rate decay mode, including steps, poly or default

    Returns:
       np.array, learning rate array
    """
    n_total = steps_per_epoch * total_epochs
    n_warmup = steps_per_epoch * warmup_epochs

    if lr_decay_mode == 'steps':
        # Piecewise-constant schedule with drops at 30%, 60% and 80% of training.
        first_drop, second_drop, third_drop = 0.3 * n_total, 0.6 * n_total, 0.8 * n_total

        def _rate_at(step):
            if step < first_drop:
                return lr_max
            if step < second_drop:
                return lr_max * 0.1
            if step < third_drop:
                return lr_max * 0.01
            return lr_max * 0.001
    elif lr_decay_mode == 'poly':
        warmup_gain = (float(lr_max) - float(lr_init)) / float(n_warmup) if n_warmup != 0 else 0

        def _rate_at(step):
            if step < n_warmup:
                return float(lr_init) + warmup_gain * float(step)
            remaining = (1.0 - (float(step) - float(n_warmup)) / (float(n_total) - float(n_warmup)))
            value = float(lr_max) * remaining * remaining
            if value < 0.0:
                value = 0.0
            return value
    else:
        # Default: linear warmup to lr_max followed by linear decay towards lr_end.
        def _rate_at(step):
            if step < n_warmup:
                return lr_init + (lr_max - lr_init) * step / n_warmup
            return lr_max - (lr_max - lr_end) * (step - n_warmup) / (n_total - n_warmup)

    schedule = np.array([_rate_at(step) for step in range(n_total)]).astype(np.float32)
    # Drop the steps that were already consumed before global_step.
    return schedule[global_step:]
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

if [ $# != 2 ]
then
    echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]"
    exit 1
fi

if [ ! -f "$1" ]
then
    echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
    exit 1
fi

if [ ! -d "$2" ]
then
    echo "error: DATASET_PATH=$2 is not a directory"
    exit 1
fi

# Every rank is launched from inside ./train_parallel$i, so relative arguments
# (such as the README's "rank_table_8p.json dataset/ilsvrc") must be turned into
# absolute paths before the cd below, otherwise they no longer resolve.
HCCL_CONFIG_PATH=$(realpath "$1")
DATASET_PATH=$(realpath "$2")

ulimit -u unlimited
export DEVICE_NUM=8
export RANK_SIZE=8
export MINDSPORE_HCCL_CONFIG_PATH=$HCCL_CONFIG_PATH
export RANK_TABLE_FILE=$HCCL_CONFIG_PATH

for((i=0; i<${DEVICE_NUM}; i++))
do
    export DEVICE_ID=$i
    export RANK_ID=$i
    # Start each rank from a fresh working directory holding a copy of the scripts.
    rm -rf ./train_parallel$i
    mkdir ./train_parallel$i
    cp *.py ./train_parallel$i
    cp *.sh ./train_parallel$i
    cd ./train_parallel$i || exit
    echo "start training for rank $RANK_ID, device $DEVICE_ID"
    env > env.log
    python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path="$DATASET_PATH" &> log &
    cd ..
done
# ============================================================================

if [ $# != 2 ]
then
    echo "Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH]"
    exit 1
fi

if [ ! -d "$1" ]
then
    echo "error: DATASET_PATH=$1 is not a directory"
    exit 1
fi

if [ ! -f "$2" ]
then
    echo "error: CHECKPOINT_PATH=$2 is not a file"
    exit 1
fi

# eval.py runs from inside ./infer, so relative arguments (such as the README's
# "dataset/validation_preprocess/ train_parallel0/resnet-120_5004.ckpt") must be
# made absolute before the cd below, otherwise they no longer resolve.
DATASET_PATH=$(realpath "$1")
CHECKPOINT_PATH=$(realpath "$2")

ulimit -u unlimited
export DEVICE_NUM=1
export DEVICE_ID=0
export RANK_SIZE=$DEVICE_NUM
export RANK_ID=0

# Always evaluate from a fresh ./infer directory holding a copy of the scripts.
if [ -d "infer" ];
then
    rm -rf ./infer
fi
mkdir ./infer
cp *.py ./infer
cp *.sh ./infer
cd ./infer || exit
env > env.log
echo "start infering for device $DEVICE_ID"
python eval.py --do_eval=True --dataset_path="$DATASET_PATH" --checkpoint_path="$CHECKPOINT_PATH" &> log &
cd ..
if [ $# != 1 ]
then
    echo "Usage: sh run_standalone_train.sh [DATASET_PATH]"
    exit 1
fi

if [ ! -d "$1" ]
then
    echo "error: DATASET_PATH=$1 is not a directory"
    exit 1
fi

# train.py runs from inside ./train, so a relative argument (such as the README's
# "dataset/ilsvrc") must be made absolute before the cd below, otherwise it no
# longer resolves.
DATASET_PATH=$(realpath "$1")

ulimit -u unlimited
export DEVICE_NUM=1
export DEVICE_ID=0
export RANK_ID=0
export RANK_SIZE=1

# Always train from a fresh ./train directory holding a copy of the scripts.
if [ -d "train" ];
then
    rm -rf ./train
fi
mkdir ./train
cp *.py ./train
cp *.sh ./train
cd ./train || exit
echo "start training for device $DEVICE_ID"
env > env.log
python train.py --do_train=True --dataset_path="$DATASET_PATH" &> log &
cd ..
# ============================================================================
"""train_imagenet."""
import os
import argparse
import math
import random
import numpy as np
from dataset import create_dataset
from lr_generator import get_lr, warmup_cosine_annealing_lr
from config import config
from mindspore import context
from mindspore import Tensor
from mindspore.model_zoo.resnet import resnet101
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.nn.optim.momentum import Momentum
from mindspore.train.model import Model, ParallelMode
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train.loss_scale_manager import FixedLossScaleManager
import mindspore.dataset.engine as de
from mindspore.communication.management import init
import mindspore.nn as nn
from crossentropy import CrossEntropy
from var_init import default_recurisive_init, KaimingNormal
from mindspore.common import initializer as weight_init

random.seed(1)
np.random.seed(1)
de.config.set_seed(1)


def _str2bool(value):
    """Parse a command-line boolean.

    ``type=bool`` would treat every non-empty string, including "False", as True.
    """
    if isinstance(value, bool):
        return value
    lowered = value.lower()
    if lowered in ('true', '1', 'yes', 'y'):
        return True
    if lowered in ('false', '0', 'no', 'n'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got {!r}'.format(value))


parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--run_distribute', type=_str2bool, default=False, help='Run distribute')
parser.add_argument('--device_num', type=int, default=1, help='Device num.')
parser.add_argument('--do_train', type=_str2bool, default=True, help='Do train or not.')
parser.add_argument('--do_eval', type=_str2bool, default=False, help='Do eval or not.')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
args_opt = parser.parse_args()

# The launch scripts export DEVICE_ID; default to device 0 rather than failing on int(None).
device_id = int(os.getenv('DEVICE_ID', '0'))

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=device_id)
context.set_context(enable_task_sink=True)
context.set_context(enable_loop_sink=True)
context.set_context(enable_mem_reuse=True)

if __name__ == '__main__':
    if args_opt.do_eval:
        context.set_context(enable_hccl=False)
    else:
        if args_opt.run_distribute:
            context.set_context(enable_hccl=True)
            context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                              mirror_mean=True, parameter_broadcast=True)
            auto_parallel_context().set_all_reduce_fusion_split_indices([140])
            init()
        else:
            context.set_context(enable_hccl=False)

    epoch_size = config.epoch_size
    net = resnet101(class_num=config.class_num)

    # weight init: apply the default initialisation first, then overwrite every
    # Conv2d weight with a Kaiming-normal (fan_out, relu) initialiser.
    default_recurisive_init(net)
    for _, cell in net.cells_and_names():
        if isinstance(cell, nn.Conv2d):
            cell.weight.default_input = weight_init.initializer(KaimingNormal(a=math.sqrt(5),
                                                                              mode='fan_out', nonlinearity='relu'),
                                                                cell.weight.default_input.shape(),
                                                                cell.weight.default_input.dtype())

    # A falsy label_smooth switches smoothing off by zeroing the factor.
    if not config.label_smooth:
        config.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num)

    if args_opt.do_train:
        dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True,
                                 repeat_num=epoch_size, batch_size=config.batch_size)
        step_size = dataset.get_dataset_size()
        loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)

        # learning rate strategy
        if config.lr_decay_mode == 'cosine':
            lr = Tensor(warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, config.epoch_size))
        else:
            # NOTE(review): every non-cosine mode is generated as 'poly' here,
            # whatever config.lr_decay_mode says -- confirm this is intended.
            lr = Tensor(get_lr(global_step=0, lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max,
                               warmup_epochs=config.warmup_epochs, total_epochs=epoch_size, steps_per_epoch=step_size,
                               lr_decay_mode='poly'))

        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
                       config.weight_decay, config.loss_scale)

        model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', keep_batchnorm_fp32=False,
                      loss_scale_manager=loss_scale, metrics={'acc'})

        time_cb = TimeMonitor(data_size=step_size)
        loss_cb = LossMonitor()
        cb = [time_cb, loss_cb]
        if config.save_checkpoint:
            config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps,
                                         keep_checkpoint_max=config.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(prefix="resnet", directory=config.save_checkpoint_path, config=config_ck)
            cb += [ckpt_cb]
        model.train(epoch_size, dataset, callbacks=cb)
+ The values are as follows: + + ================= ==================================================== + nonlinearity gain + ================= ==================================================== + Linear / Identity :math:`1` + Conv{1,2,3}D :math:`1` + Sigmoid :math:`1` + Tanh :math:`\frac{5}{3}` + ReLU :math:`\sqrt{2}` + Leaky Relu :math:`\sqrt{\frac{2}{1 + \text{negative\_slope}^2}}` + ================= ==================================================== + + Args: + nonlinearity: the non-linear function (`nn.functional` name) + param: optional parameter for the non-linear function + + """ + linear_fns = ['linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', 'conv_transpose2d', 'conv_transpose3d'] + if nonlinearity in linear_fns or nonlinearity == 'sigmoid': + return 1 + elif nonlinearity == 'tanh': + return 5.0 / 3 + elif nonlinearity == 'relu': + return math.sqrt(2.0) + elif nonlinearity == 'leaky_relu': + if param is None: + negative_slope = 0.01 + elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float): + # True/False are instances of int, hence check above + negative_slope = param + else: + raise ValueError("negative_slope {} not a valid number".format(param)) + return math.sqrt(2.0 / (1 + negative_slope ** 2)) + else: + raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) + +def _calculate_correct_fan(array, mode): + mode = mode.lower() + valid_modes = ['fan_in', 'fan_out'] + if mode not in valid_modes: + raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes)) + + fan_in, fan_out = _calculate_fan_in_and_fan_out(array) + return fan_in if mode == 'fan_in' else fan_out + + +def kaiming_uniform_(array, a=0, mode='fan_in', nonlinearity='leaky_relu'): + r"""Fills the input `Tensor` with values according to the method + described in `Delving deep into rectifiers: Surpassing human-level + performance on ImageNet classification` - He, K. et al. 
(2015), using a + uniform distribution. The resulting tensor will have values sampled from + :math:`\mathcal{U}(-\text{bound}, \text{bound})` where + + .. math:: + \text{bound} = \text{gain} \times \sqrt{\frac{3}{\text{fan\_mode}}} + + Also known as He initialization. + + Args: + array: an n-dimensional `tensor` + a: the negative slope of the rectifier used after this layer (only + used with ``'leaky_relu'``) + mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'`` + preserves the magnitude of the variance of the weights in the + forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the + backwards pass. + nonlinearity: the non-linear function (`nn.functional` name), + recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default). + """ + + fan = _calculate_correct_fan(array, mode) + gain = calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + bound = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation + return np.random.uniform(-bound, bound, array.shape) + + +def kaiming_normal_(array, a=0, mode='fan_in', nonlinearity='leaky_relu'): + r"""Fills the input `Tensor` with values according to the method + described in `Delving deep into rectifiers: Surpassing human-level + performance on ImageNet classification` - He, K. et al. (2015), using a + normal distribution. The resulting tensor will have values sampled from + :math:`\mathcal{N}(0, \text{std}^2)` where + + .. math:: + \text{std} = \frac{\text{gain}}{\sqrt{\text{fan\_mode}}} + + Also known as He initialization. + + Args: + array: an n-dimensional `tensor` + a: the negative slope of the rectifier used after this layer (only + used with ``'leaky_relu'``) + mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'`` + preserves the magnitude of the variance of the weights in the + forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the + backwards pass. 
+ nonlinearity: the non-linear function (`nn.functional` name), + recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default). + """ + fan = _calculate_correct_fan(array, mode) + gain = calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + return np.random.normal(0, std, array.shape) + +def _calculate_fan_in_and_fan_out(array): + dimensions = len(array.shape) + if dimensions < 2: + raise ValueError("Fan in and fan out can not be computed for array with fewer than 2 dimensions") + + num_input_fmaps = array.shape[1] + num_output_fmaps = array.shape[0] + receptive_field_size = 1 + if dimensions > 2: + receptive_field_size = array[0][0].size + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + +class KaimingUniform(init.Initializer): + def __init__(self, a=0, mode='fan_in', nonlinearity='leaky_relu'): + super(KaimingUniform, self).__init__() + self.a = a + self.mode = mode + self.nonlinearity = nonlinearity + + def _initialize(self, arr): + tmp = kaiming_uniform_(arr, self.a, self.mode, self.nonlinearity) + init._assignment(arr, tmp) + +class KaimingNormal(init.Initializer): + def __init__(self, a=0, mode='fan_in', nonlinearity='leaky_relu'): + super(KaimingNormal, self).__init__() + self.a = a + self.mode = mode + self.nonlinearity = nonlinearity + + def _initialize(self, arr): + tmp = kaiming_normal_(arr, self.a, self.mode, self.nonlinearity) + init._assignment(arr, tmp) + +def default_recurisive_init(custom_cell): + for name, cell in custom_cell.cells_and_names(): + if isinstance(cell, nn.Conv2d): + cell.weight.default_input = init.initializer(KaimingUniform(a=math.sqrt(5)), cell.weight.default_input.shape(), cell.weight.default_input.dtype()) + if cell.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(cell.weight.default_input.asnumpy()) + bound = 1 / math.sqrt(fan_in) + cell.bias.default_input = Tensor(np.random.uniform(-bound, bound, 
cell.bias.default_input.shape()), cell.bias.default_input.dtype()) + elif isinstance(cell, nn.Dense): + cell.weight.default_input = init.initializer(KaimingUniform(a=math.sqrt(5)), cell.weight.default_input.shape(), cell.weight.default_input.dtype()) + if cell.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(cell.weight.default_input.asnumpy()) + bound = 1 / math.sqrt(fan_in) + cell.bias.default_input = Tensor(np.random.uniform(-bound, bound, cell.bias.default_input.shape()), cell.bias.default_input.dtype()) + elif isinstance(cell, nn.BatchNorm2d) or isinstance(cell, nn.BatchNorm1d): + pass diff --git a/mindspore/model_zoo/resnet.py b/mindspore/model_zoo/resnet.py index 9d010eede1..a243ff5a2a 100755 --- a/mindspore/model_zoo/resnet.py +++ b/mindspore/model_zoo/resnet.py @@ -260,3 +260,24 @@ def resnet50(class_num=10): [256, 512, 1024, 2048], [1, 2, 2, 2], class_num) + +def resnet101(class_num=1001): + """ + Get ResNet101 neural network. + + Args: + class_num (int): Class number. + + Returns: + Cell, cell instance of ResNet101 neural network. 
+ + Examples: + >>> net = resnet101(1001) + """ + return ResNet(ResidualBlock, + [3, 4, 23, 3], + [64, 256, 512, 1024], + [256, 512, 1024, 2048], + [1, 2, 2, 2], + class_num) + From 99bbb3a3b2a0eac1c224256eb2782149733b10ef Mon Sep 17 00:00:00 2001 From: meixiaowei Date: Sun, 26 Apr 2020 17:25:12 +0800 Subject: [PATCH 2/3] modify scripts for pylint --- example/resnet101_imagenet/crossentropy.py | 6 +-- example/resnet101_imagenet/dataset.py | 2 +- example/resnet101_imagenet/lr_generator.py | 5 +-- example/resnet101_imagenet/train.py | 20 ++++------ example/resnet101_imagenet/var_init.py | 43 +++++++++++----------- mindspore/model_zoo/resnet.py | 3 +- 6 files changed, 37 insertions(+), 42 deletions(-) diff --git a/example/resnet101_imagenet/crossentropy.py b/example/resnet101_imagenet/crossentropy.py index e636b8529e..1145a41804 100755 --- a/example/resnet101_imagenet/crossentropy.py +++ b/example/resnet101_imagenet/crossentropy.py @@ -12,15 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================ +"""define loss function for network""" from mindspore.nn.loss.loss import _Loss from mindspore.ops import operations as P from mindspore.ops import functional as F from mindspore import Tensor from mindspore.common import dtype as mstype import mindspore.nn as nn - -"""define loss function for network""" + class CrossEntropy(_Loss): + """the redefined loss function with SoftmaxCrossEntropyWithLogits""" def __init__(self, smooth_factor=0., num_classes=1001): super(CrossEntropy, self).__init__() self.onehot = P.OneHot() @@ -28,7 +29,6 @@ class CrossEntropy(_Loss): self.off_value = Tensor(1.0 * smooth_factor / (num_classes -1), mstype.float32) self.ce = nn.SoftmaxCrossEntropyWithLogits() self.mean = P.ReduceMean(False) - def construct(self, logit, label): one_hot_label = self.onehot(label, F.shape(logit)[1], self.on_value, self.off_value) loss = self.ce(logit, one_hot_label) diff --git a/example/resnet101_imagenet/dataset.py b/example/resnet101_imagenet/dataset.py index 920e1c093c..27d93dc086 100755 --- a/example/resnet101_imagenet/dataset.py +++ b/example/resnet101_imagenet/dataset.py @@ -57,7 +57,7 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): normalize_op = C.Normalize((0.475, 0.451, 0.392), (0.275, 0.267, 0.278)) changeswap_op = C.HWC2CHW() - trans=[] + trans = [] if do_train: trans = [decode_op, random_resize_crop_op, diff --git a/example/resnet101_imagenet/lr_generator.py b/example/resnet101_imagenet/lr_generator.py index b2271a1382..67ff1fef25 100755 --- a/example/resnet101_imagenet/lr_generator.py +++ b/example/resnet101_imagenet/lr_generator.py @@ -13,9 +13,8 @@ # limitations under the License. 
# ============================================================================ """learning rate generator""" -import numpy as np import math - +import numpy as np def linear_warmup_lr(current_step, warmup_steps, base_lr, init_lr): lr_inc = (float(base_lr) - float(init_lr)) / float(warmup_steps) @@ -50,7 +49,7 @@ def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch): decayed = linear_decay * cosine_decay + 0.00001 lr = base_lr * decayed lr_each_step.append(lr) - return np.array(lr_each_step).astype(np.float32) + return np.array(lr_each_step).astype(np.float32) def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode): """ diff --git a/example/resnet101_imagenet/train.py b/example/resnet101_imagenet/train.py index 2df6c3bad4..37f49ec3d7 100755 --- a/example/resnet101_imagenet/train.py +++ b/example/resnet101_imagenet/train.py @@ -14,11 +14,12 @@ # ============================================================================ """train_imagenet.""" import os +import math import argparse import random import numpy as np from dataset import create_dataset -from lr_generator import get_lr +from lr_generator import get_lr, warmup_cosine_annealing_lr from config import config from mindspore import context from mindspore import Tensor @@ -33,7 +34,7 @@ from mindspore.communication.management import init import mindspore.nn as nn from crossentropy import CrossEntropy from var_init import default_recurisive_init, KaimingNormal -from mindspore.common import initializer as weight_init +import mindspore.common.initializer as weight_init random.seed(1) np.random.seed(1) @@ -69,23 +70,20 @@ if __name__ == '__main__': epoch_size = config.epoch_size net = resnet101(class_num=config.class_num) - # weight init default_recurisive_init(net) for name, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): cell.weight.default_input = weight_init.initializer(KaimingNormal(a=math.sqrt(5), - mode='fan_out', 
nonlinearity='relu'), + mode='fan_out', nonlinearity='relu'), cell.weight.default_input.shape(), cell.weight.default_input.dtype()) - if not config.label_smooth: config.label_smooth_factor = 0.0 - loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num) - + loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num) if args_opt.do_train: dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, - repeat_num=epoch_size, batch_size=config.batch_size) + repeat_num=epoch_size, batch_size=config.batch_size) step_size = dataset.get_dataset_size() loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) @@ -96,12 +94,10 @@ if __name__ == '__main__': lr = Tensor(get_lr(global_step=0, lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max, warmup_epochs=config.warmup_epochs, total_epochs=epoch_size, steps_per_epoch=step_size, lr_decay_mode='poly')) - opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, config.weight_decay, config.loss_scale) - - model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', keep_batchnorm_fp32=False, loss_scale_manager=loss_scale, metrics={'acc'}) - + model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', keep_batchnorm_fp32=False, + loss_scale_manager=loss_scale, metrics={'acc'}) time_cb = TimeMonitor(data_size=step_size) loss_cb = LossMonitor() cb = [time_cb, loss_cb] diff --git a/example/resnet101_imagenet/var_init.py b/example/resnet101_imagenet/var_init.py index af4cd64b3b..061ec94fbf 100755 --- a/example/resnet101_imagenet/var_init.py +++ b/example/resnet101_imagenet/var_init.py @@ -18,12 +18,10 @@ import numpy as np from mindspore.common import initializer as init import mindspore.nn as nn from mindspore import Tensor - def calculate_gain(nonlinearity, param=None): r"""Return the recommended gain value for the given nonlinearity function. 
- The values are as follows: - + The values are as follows: ================= ==================================================== nonlinearity gain ================= ==================================================== @@ -34,11 +32,9 @@ def calculate_gain(nonlinearity, param=None): ReLU :math:`\sqrt{2}` Leaky Relu :math:`\sqrt{\frac{2}{1 + \text{negative\_slope}^2}}` ================= ==================================================== - Args: nonlinearity: the non-linear function (`nn.functional` name) param: optional parameter for the non-linear function - """ linear_fns = ['linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', 'conv_transpose2d', 'conv_transpose3d'] if nonlinearity in linear_fns or nonlinearity == 'sigmoid': @@ -57,17 +53,15 @@ def calculate_gain(nonlinearity, param=None): raise ValueError("negative_slope {} not a valid number".format(param)) return math.sqrt(2.0 / (1 + negative_slope ** 2)) else: - raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) - + raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) + def _calculate_correct_fan(array, mode): mode = mode.lower() valid_modes = ['fan_in', 'fan_out'] if mode not in valid_modes: - raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes)) - + raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes)) fan_in, fan_out = _calculate_fan_in_and_fan_out(array) - return fan_in if mode == 'fan_in' else fan_out - + return fan_in if mode == 'fan_in' else fan_out def kaiming_uniform_(array, a=0, mode='fan_in', nonlinearity='leaky_relu'): r"""Fills the input `Tensor` with values according to the method @@ -75,12 +69,10 @@ def kaiming_uniform_(array, a=0, mode='fan_in', nonlinearity='leaky_relu'): performance on ImageNet classification` - He, K. et al. (2015), using a uniform distribution. The resulting tensor will have values sampled from :math:`\mathcal{U}(-\text{bound}, \text{bound})` where - .. 
math:: \text{bound} = \text{gain} \times \sqrt{\frac{3}{\text{fan\_mode}}} - Also known as He initialization. - + Args: array: an n-dimensional `tensor` a: the negative slope of the rectifier used after this layer (only @@ -91,8 +83,7 @@ def kaiming_uniform_(array, a=0, mode='fan_in', nonlinearity='leaky_relu'): backwards pass. nonlinearity: the non-linear function (`nn.functional` name), recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default). - """ - + """ fan = _calculate_correct_fan(array, mode) gain = calculate_gain(nonlinearity, a) std = gain / math.sqrt(fan) @@ -129,6 +120,7 @@ def kaiming_normal_(array, a=0, mode='fan_in', nonlinearity='leaky_relu'): return np.random.normal(0, std, array.shape) def _calculate_fan_in_and_fan_out(array): + """calculate the fan_in and fan_out for input array""" dimensions = len(array.shape) if dimensions < 2: raise ValueError("Fan in and fan out can not be computed for array with fewer than 2 dimensions") @@ -166,18 +158,27 @@ class KaimingNormal(init.Initializer): init._assignment(arr, tmp) def default_recurisive_init(custom_cell): + """weight init for conv2d and dense""" for name, cell in custom_cell.cells_and_names(): if isinstance(cell, nn.Conv2d): - cell.weight.default_input = init.initializer(KaimingUniform(a=math.sqrt(5)), cell.weight.default_input.shape(), cell.weight.default_input.dtype()) + cell.weight.default_input = init.initializer(KaimingUniform(a=math.sqrt(5)), + cell.weight.default_input.shape(), + cell.weight.default_input.dtype()) if cell.bias is not None: fan_in, _ = _calculate_fan_in_and_fan_out(cell.weight.default_input.asnumpy()) bound = 1 / math.sqrt(fan_in) - cell.bias.default_input = Tensor(np.random.uniform(-bound, bound, cell.bias.default_input.shape()), cell.bias.default_input.dtype()) + cell.bias.default_input = Tensor(np.random.uniform(-bound, bound, + cell.bias.default_input.shape()), + cell.bias.default_input.dtype()) elif isinstance(cell, nn.Dense): - cell.weight.default_input = 
init.initializer(KaimingUniform(a=math.sqrt(5)), cell.weight.default_input.shape(), cell.weight.default_input.dtype()) + cell.weight.default_input = init.initializer(KaimingUniform(a=math.sqrt(5)), + cell.weight.default_input.shape(), + cell.weight.default_input.dtype()) if cell.bias is not None: fan_in, _ = _calculate_fan_in_and_fan_out(cell.weight.default_input.asnumpy()) bound = 1 / math.sqrt(fan_in) - cell.bias.default_input = Tensor(np.random.uniform(-bound, bound, cell.bias.default_input.shape()), cell.bias.default_input.dtype()) - elif isinstance(cell, nn.BatchNorm2d) or isinstance(cell, nn.BatchNorm1d): + cell.bias.default_input = Tensor(np.random.uniform(-bound, bound, + cell.bias.default_input.shape()), + cell.bias.default_input.dtype()) + elif isinstance(cell, (nn.BatchNorm2d, nn.BatchNorm1d)): pass diff --git a/mindspore/model_zoo/resnet.py b/mindspore/model_zoo/resnet.py index a243ff5a2a..d67f26814c 100755 --- a/mindspore/model_zoo/resnet.py +++ b/mindspore/model_zoo/resnet.py @@ -279,5 +279,4 @@ def resnet101(class_num=1001): [64, 256, 512, 1024], [256, 512, 1024, 2048], [1, 2, 2, 2], - class_num) - + class_num) \ No newline at end of file From 3cb692bea10e477deffdbe631bc88fc30f93622e Mon Sep 17 00:00:00 2001 From: meixiaowei Date: Sun, 26 Apr 2020 17:57:12 +0800 Subject: [PATCH 3/3] modify resnet101 scripts for pylint --- example/resnet101_imagenet/README.md | 3 - example/resnet101_imagenet/config.py | 3 - example/resnet101_imagenet/lr_generator.py | 60 ----------------- example/resnet101_imagenet/train.py | 17 ++--- example/resnet101_imagenet/var_init.py | 76 ++++++++++++---------- mindspore/model_zoo/resnet.py | 2 +- 6 files changed, 49 insertions(+), 112 deletions(-) diff --git a/example/resnet101_imagenet/README.md b/example/resnet101_imagenet/README.md index bc653675f2..d5729b70db 100644 --- a/example/resnet101_imagenet/README.md +++ b/example/resnet101_imagenet/README.md @@ -54,9 +54,6 @@ Parameters for both training and evaluating can be set 
in config.py. "save_checkpoint_steps": 500, # the step interval between two checkpoints. By default, the last checkpoint will be saved after the last step "keep_checkpoint_max": 40, # only keep the last keep_checkpoint_max checkpoint "save_checkpoint_path": "./", # path to save checkpoint relative to the executed path -"lr_init": 0.01, # initial learning rate -"lr_end": 0.00001, # final learning rate -"lr_max": 0.1, # maximum learning rate "warmup_epochs": 0, # number of warmup epoch "lr_decay_mode": "cosine" # decay mode for generating learning rate "label_smooth": 1, # label_smooth diff --git a/example/resnet101_imagenet/config.py b/example/resnet101_imagenet/config.py index 0ad37c8678..ca58f24da3 100755 --- a/example/resnet101_imagenet/config.py +++ b/example/resnet101_imagenet/config.py @@ -31,9 +31,6 @@ config = ed({ "save_checkpoint_steps": 500, "keep_checkpoint_max": 40, "save_checkpoint_path": "./", - "lr_init": 0.01, - "lr_end": 0.00001, - "lr_max": 0.1, "warmup_epochs": 0, "lr_decay_mode": "cosine", "label_smooth": 1, diff --git a/example/resnet101_imagenet/lr_generator.py b/example/resnet101_imagenet/lr_generator.py index 67ff1fef25..88cb85cc5b 100755 --- a/example/resnet101_imagenet/lr_generator.py +++ b/example/resnet101_imagenet/lr_generator.py @@ -50,63 +50,3 @@ def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch): lr = base_lr * decayed lr_each_step.append(lr) return np.array(lr_each_step).astype(np.float32) - -def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode): - """ - generate learning rate array - - Args: - global_step(int): total steps of the training - lr_init(float): init learning rate - lr_end(float): end learning rate - lr_max(float): max learning rate - warmup_epochs(int): number of warmup epochs - total_epochs(int): total epoch of training - steps_per_epoch(int): steps of one epoch - lr_decay_mode(string): learning rate decay mode, including steps, poly or 
default - - Returns: - np.array, learning rate array - """ - lr_each_step = [] - total_steps = steps_per_epoch * total_epochs - warmup_steps = steps_per_epoch * warmup_epochs - if lr_decay_mode == 'steps': - decay_epoch_index = [0.3 * total_steps, 0.6 * total_steps, 0.8 * total_steps] - for i in range(total_steps): - if i < decay_epoch_index[0]: - lr = lr_max - elif i < decay_epoch_index[1]: - lr = lr_max * 0.1 - elif i < decay_epoch_index[2]: - lr = lr_max * 0.01 - else: - lr = lr_max * 0.001 - lr_each_step.append(lr) - elif lr_decay_mode == 'poly': - if warmup_steps != 0: - inc_each_step = (float(lr_max) - float(lr_init)) / float(warmup_steps) - else: - inc_each_step = 0 - for i in range(total_steps): - if i < warmup_steps: - lr = float(lr_init) + inc_each_step * float(i) - else: - base = (1.0 - (float(i) - float(warmup_steps)) / (float(total_steps) - float(warmup_steps))) - lr = float(lr_max) * base * base - if lr < 0.0: - lr = 0.0 - lr_each_step.append(lr) - else: - for i in range(total_steps): - if i < warmup_steps: - lr = lr_init + (lr_max - lr_init) * i / warmup_steps - else: - lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps) - lr_each_step.append(lr) - - current_step = global_step - lr_each_step = np.array(lr_each_step).astype(np.float32) - learning_rate = lr_each_step[current_step:] - - return learning_rate diff --git a/example/resnet101_imagenet/train.py b/example/resnet101_imagenet/train.py index 37f49ec3d7..0f20637595 100755 --- a/example/resnet101_imagenet/train.py +++ b/example/resnet101_imagenet/train.py @@ -19,7 +19,7 @@ import argparse import random import numpy as np from dataset import create_dataset -from lr_generator import get_lr, warmup_cosine_annealing_lr +from lr_generator import warmup_cosine_annealing_lr from config import config from mindspore import context from mindspore import Tensor @@ -32,9 +32,9 @@ from mindspore.train.loss_scale_manager import FixedLossScaleManager import mindspore.dataset.engine 
as de from mindspore.communication.management import init import mindspore.nn as nn +import mindspore.common.initializer as weight_init from crossentropy import CrossEntropy from var_init import default_recurisive_init, KaimingNormal -import mindspore.common.initializer as weight_init random.seed(1) np.random.seed(1) @@ -72,7 +72,7 @@ if __name__ == '__main__': net = resnet101(class_num=config.class_num) # weight init default_recurisive_init(net) - for name, cell in net.cells_and_names(): + for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): cell.weight.default_input = weight_init.initializer(KaimingNormal(a=math.sqrt(5), mode='fan_out', nonlinearity='relu'), @@ -83,17 +83,12 @@ if __name__ == '__main__': loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num) if args_opt.do_train: dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, - repeat_num=epoch_size, batch_size=config.batch_size) + repeat_num=epoch_size, batch_size=config.batch_size) step_size = dataset.get_dataset_size() loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) - # learning rate strategy - if config.lr_decay_mode == 'cosine': - lr = Tensor(warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, config.epoch_size)) - else: - lr = Tensor(get_lr(global_step=0, lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max, - warmup_epochs=config.warmup_epochs, total_epochs=epoch_size, steps_per_epoch=step_size, - lr_decay_mode='poly')) + # learning rate strategy with cosine + lr = Tensor(warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, config.epoch_size)) opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, config.weight_decay, config.loss_scale) model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', keep_batchnorm_fp32=False, diff --git a/example/resnet101_imagenet/var_init.py 
b/example/resnet101_imagenet/var_init.py index 061ec94fbf..34d8664a49 100755 --- a/example/resnet101_imagenet/var_init.py +++ b/example/resnet101_imagenet/var_init.py @@ -18,10 +18,10 @@ import numpy as np from mindspore.common import initializer as init import mindspore.nn as nn from mindspore import Tensor - + def calculate_gain(nonlinearity, param=None): r"""Return the recommended gain value for the given nonlinearity function. - The values are as follows: + The values are as follows: ================= ==================================================== nonlinearity gain ================= ==================================================== @@ -37,12 +37,13 @@ def calculate_gain(nonlinearity, param=None): param: optional parameter for the non-linear function """ linear_fns = ['linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', 'conv_transpose2d', 'conv_transpose3d'] + gain = 0 if nonlinearity in linear_fns or nonlinearity == 'sigmoid': - return 1 + gain = 1 elif nonlinearity == 'tanh': - return 5.0 / 3 + gain = 5.0 / 3 elif nonlinearity == 'relu': - return math.sqrt(2.0) + gain = math.sqrt(2.0) elif nonlinearity == 'leaky_relu': if param is None: negative_slope = 0.01 @@ -51,15 +52,16 @@ def calculate_gain(nonlinearity, param=None): negative_slope = param else: raise ValueError("negative_slope {} not a valid number".format(param)) - return math.sqrt(2.0 / (1 + negative_slope ** 2)) + gain = math.sqrt(2.0 / (1 + negative_slope ** 2)) else: raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) - + return gain + def _calculate_correct_fan(array, mode): mode = mode.lower() valid_modes = ['fan_in', 'fan_out'] if mode not in valid_modes: - raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes)) + raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes)) fan_in, fan_out = _calculate_fan_in_and_fan_out(array) return fan_in if mode == 'fan_in' else fan_out @@ -83,13 +85,12 @@ def 
kaiming_uniform_(array, a=0, mode='fan_in', nonlinearity='leaky_relu'): backwards pass. nonlinearity: the non-linear function (`nn.functional` name), recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default). - """ + """ fan = _calculate_correct_fan(array, mode) gain = calculate_gain(nonlinearity, a) std = gain / math.sqrt(fan) bound = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation return np.random.uniform(-bound, bound, array.shape) - def kaiming_normal_(array, a=0, mode='fan_in', nonlinearity='leaky_relu'): r"""Fills the input `Tensor` with values according to the method @@ -97,12 +98,10 @@ def kaiming_normal_(array, a=0, mode='fan_in', nonlinearity='leaky_relu'): performance on ImageNet classification` - He, K. et al. (2015), using a normal distribution. The resulting tensor will have values sampled from :math:`\mathcal{N}(0, \text{std}^2)` where - .. math:: \text{std} = \frac{\text{gain}}{\sqrt{\text{fan\_mode}}} - Also known as He initialization. 
- + Args: array: an n-dimensional `tensor` a: the negative slope of the rectifier used after this layer (only @@ -118,13 +117,12 @@ def kaiming_normal_(array, a=0, mode='fan_in', nonlinearity='leaky_relu'): gain = calculate_gain(nonlinearity, a) std = gain / math.sqrt(fan) return np.random.normal(0, std, array.shape) - + def _calculate_fan_in_and_fan_out(array): """calculate the fan_in and fan_out for input array""" dimensions = len(array.shape) if dimensions < 2: raise ValueError("Fan in and fan out can not be computed for array with fewer than 2 dimensions") - num_input_fmaps = array.shape[1] num_output_fmaps = array.shape[0] receptive_field_size = 1 @@ -132,19 +130,30 @@ def _calculate_fan_in_and_fan_out(array): receptive_field_size = array[0][0].size fan_in = num_input_fmaps * receptive_field_size fan_out = num_output_fmaps * receptive_field_size - return fan_in, fan_out - + +def assignment(arr, num): + """Assign the value of num to arr""" + if arr.shape == (): + arr = arr.reshape((1)) + arr[:] = num + arr = arr.reshape(()) + else: + if isinstance(num, np.ndarray): + arr[:] = num[:] + else: + arr[:] = num + return arr + class KaimingUniform(init.Initializer): def __init__(self, a=0, mode='fan_in', nonlinearity='leaky_relu'): super(KaimingUniform, self).__init__() self.a = a self.mode = mode self.nonlinearity = nonlinearity - def _initialize(self, arr): tmp = kaiming_uniform_(arr, self.a, self.mode, self.nonlinearity) - init._assignment(arr, tmp) + assignment(arr, tmp) class KaimingNormal(init.Initializer): def __init__(self, a=0, mode='fan_in', nonlinearity='leaky_relu'): @@ -152,33 +161,32 @@ class KaimingNormal(init.Initializer): self.a = a self.mode = mode self.nonlinearity = nonlinearity - def _initialize(self, arr): tmp = kaiming_normal_(arr, self.a, self.mode, self.nonlinearity) - init._assignment(arr, tmp) + assignment(arr, tmp) def default_recurisive_init(custom_cell): """weight init for conv2d and dense""" - for name, cell in 
custom_cell.cells_and_names(): + for _, cell in custom_cell.cells_and_names(): if isinstance(cell, nn.Conv2d): - cell.weight.default_input = init.initializer(KaimingUniform(a=math.sqrt(5)), - cell.weight.default_input.shape(), - cell.weight.default_input.dtype()) + cell.weight.default_input = init.initializer(KaimingUniform(a=math.sqrt(5)), + cell.weight.default_input.shape(), + cell.weight.default_input.dtype()) if cell.bias is not None: fan_in, _ = _calculate_fan_in_and_fan_out(cell.weight.default_input.asnumpy()) bound = 1 / math.sqrt(fan_in) - cell.bias.default_input = Tensor(np.random.uniform(-bound, bound, - cell.bias.default_input.shape()), - cell.bias.default_input.dtype()) + cell.bias.default_input = Tensor(np.random.uniform(-bound, bound, + cell.bias.default_input.shape()), + cell.bias.default_input.dtype()) elif isinstance(cell, nn.Dense): - cell.weight.default_input = init.initializer(KaimingUniform(a=math.sqrt(5)), - cell.weight.default_input.shape(), - cell.weight.default_input.dtype()) + cell.weight.default_input = init.initializer(KaimingUniform(a=math.sqrt(5)), + cell.weight.default_input.shape(), + cell.weight.default_input.dtype()) if cell.bias is not None: fan_in, _ = _calculate_fan_in_and_fan_out(cell.weight.default_input.asnumpy()) bound = 1 / math.sqrt(fan_in) - cell.bias.default_input = Tensor(np.random.uniform(-bound, bound, - cell.bias.default_input.shape()), - cell.bias.default_input.dtype()) + cell.bias.default_input = Tensor(np.random.uniform(-bound, bound, + cell.bias.default_input.shape()), + cell.bias.default_input.dtype()) elif isinstance(cell, (nn.BatchNorm2d, nn.BatchNorm1d)): pass diff --git a/mindspore/model_zoo/resnet.py b/mindspore/model_zoo/resnet.py index d67f26814c..3055026718 100755 --- a/mindspore/model_zoo/resnet.py +++ b/mindspore/model_zoo/resnet.py @@ -279,4 +279,4 @@ def resnet101(class_num=1001): [64, 256, 512, 1024], [256, 512, 1024, 2048], [1, 2, 2, 2], - class_num) \ No newline at end of file + class_num)