2. add relative path support in resnet50 example 3. optimize allreduce split strategy tags/v0.3.0-alpha
| @@ -20,22 +20,33 @@ then | |||||
| exit 1 | exit 1 | ||||
| fi | fi | ||||
| if [ ! -f $1 ] | |||||
| get_real_path(){ | |||||
| if [ "${1:0:1}" == "/" ]; then | |||||
| echo "$1" | |||||
| else | |||||
| echo "$(realpath -m $PWD/$1)" | |||||
| fi | |||||
| } | |||||
| PATH1=$(get_real_path $1) | |||||
| PATH2=$(get_real_path $2) | |||||
| if [ ! -f "$PATH1" ] | |||||
| then | then | ||||
| echo "error: DMINDSPORE_HCCL_CONFIG_PATH=$1 is not a file" | |||||
| echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file" | |||||
| exit 1 | exit 1 | ||||
| fi | fi | ||||
| if [ ! -d $2 ] | |||||
| if [ ! -d "$PATH2" ] | |||||
| then | then | ||||
| echo "error: DATASET_PATH=$2 is not a directory" | |||||
| echo "error: DATASET_PATH=$PATH2 is not a directory" | |||||
| exit 1 | exit 1 | ||||
| fi | fi | ||||
| ulimit -u unlimited | ulimit -u unlimited | ||||
| export DEVICE_NUM=8 | export DEVICE_NUM=8 | ||||
| export RANK_SIZE=8 | export RANK_SIZE=8 | ||||
| export MINDSPORE_HCCL_CONFIG_PATH=$1 | |||||
| export MINDSPORE_HCCL_CONFIG_PATH=$PATH1 | |||||
| for((i=0; i<${DEVICE_NUM}; i++)) | for((i=0; i<${DEVICE_NUM}; i++)) | ||||
| do | do | ||||
| @@ -48,6 +59,6 @@ do | |||||
| cd ./train_parallel$i || exit | cd ./train_parallel$i || exit | ||||
| echo "start training for rank $RANK_ID, device $DEVICE_ID" | echo "start training for rank $RANK_ID, device $DEVICE_ID" | ||||
| env > env.log | env > env.log | ||||
| python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$2 &> log & | |||||
| python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log & | |||||
| cd .. | cd .. | ||||
| done | done | ||||
| @@ -20,9 +20,19 @@ then | |||||
| exit 1 | exit 1 | ||||
| fi | fi | ||||
| if [ ! -d $1 ] | |||||
| get_real_path(){ | |||||
| if [ "${1:0:1}" == "/" ]; then | |||||
| echo "$1" | |||||
| else | |||||
| echo "$(realpath -m $PWD/$1)" | |||||
| fi | |||||
| } | |||||
| PATH1=$(get_real_path $1) | |||||
| if [ ! -d "$PATH1" ] | |||||
| then | then | ||||
| echo "error: DATASET_PATH=$1 is not a directory" | |||||
| echo "error: DATASET_PATH=$PATH1 is not a directory" | |||||
| exit 1 | exit 1 | ||||
| fi | fi | ||||
| @@ -41,5 +51,5 @@ cp *.sh ./train | |||||
| cd ./train || exit | cd ./train || exit | ||||
| echo "start training for device $DEVICE_ID" | echo "start training for device $DEVICE_ID" | ||||
| env > env.log | env > env.log | ||||
| python train.py --do_train=True --dataset_path=$1 &> log & | |||||
| python train.py --do_train=True --dataset_path=$PATH1 &> log & | |||||
| cd .. | cd .. | ||||
| @@ -57,12 +57,12 @@ if __name__ == '__main__': | |||||
| if not args_opt.do_eval and args_opt.run_distribute: | if not args_opt.do_eval and args_opt.run_distribute: | ||||
| context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, | context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, | ||||
| mirror_mean=True) | mirror_mean=True) | ||||
| auto_parallel_context().set_all_reduce_fusion_split_indices([140]) | |||||
| auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160]) | |||||
| init() | init() | ||||
| epoch_size = config.epoch_size | epoch_size = config.epoch_size | ||||
| net = resnet50(class_num=config.class_num) | net = resnet50(class_num=config.class_num) | ||||
| loss = SoftmaxCrossEntropyWithLogits(sparse=True) | |||||
| loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') | |||||
| if args_opt.do_train: | if args_opt.do_train: | ||||
| @@ -168,7 +168,7 @@ class ResNet(nn.Cell): | |||||
| self.conv1 = _conv7x7(3, 64, stride=2) | self.conv1 = _conv7x7(3, 64, stride=2) | ||||
| self.bn1 = _bn(64) | self.bn1 = _bn(64) | ||||
| self.relu = P.ReLU() | self.relu = P.ReLU() | ||||
| self.maxpool = P.MaxPoolWithArgmax(padding="same", ksize=3, strides=2) | |||||
| self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same") | |||||
| self.layer1 = self._make_layer(block, | self.layer1 = self._make_layer(block, | ||||
| layer_nums[0], | layer_nums[0], | ||||
| @@ -227,7 +227,7 @@ class ResNet(nn.Cell): | |||||
| x = self.conv1(x) | x = self.conv1(x) | ||||
| x = self.bn1(x) | x = self.bn1(x) | ||||
| x = self.relu(x) | x = self.relu(x) | ||||
| c1, argmax = self.maxpool(x) | |||||
| c1 = self.maxpool(x) | |||||
| c2 = self.layer1(c1) | c2 = self.layer1(c1) | ||||
| c3 = self.layer2(c2) | c3 = self.layer2(c2) | ||||