@@ -1,58 +0,0 @@
-#!/bin/bash
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-execute_path=$(pwd)
-script_self=$(readlink -f "$0")
-self_path=$(dirname "${script_self}")
-export RANK_SIZE=$1
-export EPOCH_SIZE=$2
-export DATASET=$3
-export MS_COMM_TYPE=zmq
-export MS_SCHED_NUM=1
-export MS_WORKER_NUM=$RANK_SIZE
-export MS_SERVER_NUM=$4
-export MS_SCHED_HOST=$5
-export MS_SCHED_PORT=$6
-
-export MS_ROLE=MS_SCHED
-for((i=0;i<1;i++));
-do
-  rm -rf ${execute_path}/sched_$i/
-  mkdir ${execute_path}/sched_$i/
-  cd ${execute_path}/sched_$i/ || exit
-  python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
-    --device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
-    --vocab_cache_size=300000 >sched_$i.log 2>&1 &
-done
-
-export MS_ROLE=MS_PSERVER
-for((i=0;i<$MS_SERVER_NUM;i++));
-do
-  rm -rf ${execute_path}/server_$i/
-  mkdir ${execute_path}/server_$i/
-  cd ${execute_path}/server_$i/ || exit
-  python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
-    --device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
-    --vocab_cache_size=300000 >server_$i.log 2>&1 &
-done
-
-export MS_ROLE=MS_WORKER
-rm -rf ${execute_path}/worker/
-mkdir ${execute_path}/worker/
-cd ${execute_path}/worker/ || exit
-mpirun --allow-run-as-root -n $RANK_SIZE python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
-  --device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
-  --vocab_cache_size=300000 --full_batch=1 --dropout_flag=1 >worker.log 2>&1 &
@@ -1,68 +0,0 @@
-#!/bin/bash
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-execute_path=$(pwd)
-script_self=$(readlink -f "$0")
-self_path=$(dirname "${script_self}")
-export RANK_SIZE=$1
-export EPOCH_SIZE=$2
-export DATASET=$3
-export RANK_TABLE_FILE=$4
-export MS_COMM_TYPE=zmq
-export MS_SCHED_NUM=1
-export MS_WORKER_NUM=$RANK_SIZE
-export MS_SERVER_NUM=$5
-export MS_SCHED_HOST=$6
-export MS_SCHED_PORT=$7
-
-export MS_ROLE=MS_SCHED
-for((i=0;i<1;i++));
-do
-  rm -rf ${execute_path}/sched_$i/
-  mkdir ${execute_path}/sched_$i/
-  cd ${execute_path}/sched_$i/ || exit
-  export RANK_ID=$i
-  export DEVICE_ID=$i
-  python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
-    --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
-    --vocab_cache_size=300000 >sched_$i.log 2>&1 &
-done
-
-export MS_ROLE=MS_PSERVER
-for((i=0;i<$MS_SERVER_NUM;i++));
-do
-  rm -rf ${execute_path}/server_$i/
-  mkdir ${execute_path}/server_$i/
-  cd ${execute_path}/server_$i/ || exit
-  export RANK_ID=$i
-  export DEVICE_ID=$i
-  python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
-    --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
-    --vocab_cache_size=300000 >server_$i.log 2>&1 &
-done
-
-export MS_ROLE=MS_WORKER
-for((i=0;i<$MS_WORKER_NUM;i++));
-do
-  rm -rf ${execute_path}/worker_$i/
-  mkdir ${execute_path}/worker_$i/
-  cd ${execute_path}/worker_$i/ || exit
-  export RANK_ID=$i
-  export DEVICE_ID=$i
-  python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
-    --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
-    --vocab_cache_size=300000 --full_batch=1 --dropout_flag=1 >worker_$i.log 2>&1 &
-done
@@ -1,56 +0,0 @@
-#!/bin/bash
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-execute_path=$(pwd)
-script_self=$(readlink -f "$0")
-self_path=$(dirname "${script_self}")
-export EPOCH_SIZE=$1
-export DEVICE_TARGET=$2
-export DATASET=$3
-export MS_COMM_TYPE=zmq
-export MS_SCHED_NUM=1
-export MS_WORKER_NUM=1
-export MS_SERVER_NUM=$4
-export MS_SCHED_HOST=$5
-export MS_SCHED_PORT=$6
-
-export MS_ROLE=MS_SCHED
-rm -rf ${execute_path}/sched/
-mkdir ${execute_path}/sched/
-cd ${execute_path}/sched/ || exit
-export DEVICE_ID=$i
-python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
-  --parameter_server=1 --vocab_cache_size=300000 >sched.log 2>&1 &
-
-export MS_ROLE=MS_PSERVER
-for((i=0;i<$MS_SERVER_NUM;i++));
-do
-  rm -rf ${execute_path}/server_$i/
-  mkdir ${execute_path}/server_$i/
-  cd ${execute_path}/server_$i/ || exit
-  export DEVICE_ID=$i
-  python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
-    --parameter_server=1 --vocab_cache_size=300000 >server_$i.log 2>&1 &
-done
-
-export MS_ROLE=MS_WORKER
-rm -rf ${execute_path}/worker/
-mkdir ${execute_path}/worker/
-cd ${execute_path}/worker/ || exit
-export DEVICE_ID=$i
-python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
-  --parameter_server=1 --vocab_cache_size=300000 \
-  --dropout_flag=1 >worker.log 2>&1 &
@@ -1,63 +0,0 @@
-#!/bin/bash
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-execute_path=$(pwd)
-script_self=$(readlink -f "$0")
-self_path=$(dirname "${script_self}")
-export RANK_SIZE=$1
-export EPOCH_SIZE=$2
-export DATASET=$3
-export RANK_TABLE_FILE=$4
-export MS_COMM_TYPE=zmq
-export MS_SCHED_NUM=1
-export MS_WORKER_NUM=$RANK_SIZE
-export MS_SERVER_NUM=$5
-export MS_SCHED_HOST=$6
-export MS_SCHED_PORT=$7
-
-export MS_ROLE=MS_SCHED
-for((i=0;i<1;i++));
-do
-  rm -rf ${execute_path}/sched_$i/
-  mkdir ${execute_path}/sched_$i/
-  cd ${execute_path}/sched_$i/ || exit
-  export RANK_ID=$i
-  export DEVICE_ID=$i
-  python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >sched_$i.log 2>&1 &
-done
-
-export MS_ROLE=MS_PSERVER
-for((i=0;i<$MS_SERVER_NUM;i++));
-do
-  rm -rf ${execute_path}/server_$i/
-  mkdir ${execute_path}/server_$i/
-  cd ${execute_path}/server_$i/ || exit
-  export RANK_ID=$i
-  export DEVICE_ID=$i
-  python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >server_$i.log 2>&1 &
-done
-
-export MS_ROLE=MS_WORKER
-for((i=0;i<$MS_WORKER_NUM;i++));
-do
-  rm -rf ${execute_path}/worker_$i/
-  mkdir ${execute_path}/worker_$i/
-  cd ${execute_path}/worker_$i/ || exit
-  export RANK_ID=$i
-  export DEVICE_ID=$i
-  python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >worker_$i.log 2>&1 &
-done
@@ -14,17 +14,17 @@
 # limitations under the License.
 # ============================================================================
+#bash run_parameter_server_train_cluster.sh RANK_SIZE EPOCHS DEVICE_TARGET DATASET
+#                                           LOCAL_WORKER_NUM LOCAL_SERVER_NUM SERVER_NUM
+#                                           SCHED_HOST SCHED_PORT ROLE RANK_TABLE_FILE VOCAB_CACHE_SIZE
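+# Example (illustrative values only; substitute your own dataset path, rank
+# table file, scheduler host/port, and run the script on each node with that
+# node's ROLE):
+#   bash run_parameter_server_train_cluster.sh 16 15 Ascend /path/to/dataset \
+#     8 1 2 192.168.1.1 8081 MS_WORKER /path/to/rank_table.json 300000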
 execute_path=$(pwd)
 script_self=$(readlink -f "$0")
 self_path=$(dirname "${script_self}")
-#bash run_parameter_server_train_cluster.sh RANK_SIZE EPOCHS DATASET RANK_TABLE_FILE
-#                                           LOCAL_WORKER_NUM LOCAL_SERVER_NUM SERVER_NUM
-#                                           SCHED_HOST SCHED_PORT ROLE
 export RANK_SIZE=$1
 export EPOCH_SIZE=$2
-export DATASET=$3
-export RANK_TABLE_FILE=$4
+export DEVICE_TARGET=$3
+export DATASET=$4
 export MS_COMM_TYPE=zmq
 export MS_SCHED_NUM=1
@@ -35,41 +35,56 @@ export MS_SERVER_NUM=$7
 export MS_SCHED_HOST=$8
 export MS_SCHED_PORT=$9
 export MS_ROLE=${10}
-echo "=====Role is $MS_ROLE======"
+export RANK_TABLE_FILE=${11}
+export VOCAB_CACHE_SIZE=${12}
+if [[ ! -n "${12}" ]]; then
+  export VOCAB_CACHE_SIZE=0
+fi
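+# VOCAB_CACHE_SIZE (${12}) is optional; when omitted it defaults to 0, which
+# disables the embedding cache.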
| echo "=====Role is $MS_ROLE======" | |||||
| if [ "$MS_ROLE" == "MS_SCHED" ];then | |||||
| for((i=0;i<1;i++)); | |||||
| do | |||||
| rm -rf ${execute_path}/sched_$i/ | |||||
| mkdir ${execute_path}/sched_$i/ | |||||
| cd ${execute_path}/sched_$i/ || exit | |||||
| export RANK_ID=$i | |||||
| export DEVICE_ID=$i | |||||
| python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >sched_$i.log 2>&1 & | |||||
| done | |||||
| if [[ "$MS_ROLE" == "MS_SCHED" ]]; then | |||||
| rm -rf ${execute_path}/sched/ | |||||
| mkdir ${execute_path}/sched/ | |||||
| cd ${execute_path}/sched/ || exit | |||||
| python -s ${self_path}/../train_and_eval_parameter_server_distribute.py --device_target=$DEVICE_TARGET \ | |||||
| --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \ | |||||
| --vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 & | |||||
| fi | fi | ||||
| if [ "$MS_ROLE" == "MS_PSERVER" ];then | |||||
| for((i=0;i<$LOCAL_SERVER_NUM;i++)); | |||||
| do | |||||
| rm -rf ${execute_path}/server_$i/ | |||||
| mkdir ${execute_path}/server_$i/ | |||||
| cd ${execute_path}/server_$i/ || exit | |||||
| export RANK_ID=$i | |||||
| export DEVICE_ID=$i | |||||
| python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >server_$i.log 2>&1 & | |||||
| done | |||||
| if [[ "$MS_ROLE" == "MS_PSERVER" ]]; then | |||||
| for((i=0;i<$LOCAL_SERVER_NUM;i++)); | |||||
| do | |||||
| rm -rf ${execute_path}/server_$i/ | |||||
| mkdir ${execute_path}/server_$i/ | |||||
| cd ${execute_path}/server_$i/ || exit | |||||
| python -s ${self_path}/../train_and_eval_parameter_server_distribute.py --device_target=$DEVICE_TARGET \ | |||||
| --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \ | |||||
| --vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 & | |||||
| done | |||||
| fi | fi | ||||
| if [ "$MS_ROLE" == "MS_WORKER" ];then | |||||
| for((i=0;i<$LOCAL_WORKER_NUM;i++)); | |||||
| do | |||||
| rm -rf ${execute_path}/worker_$i/ | |||||
| mkdir ${execute_path}/worker_$i/ | |||||
| cd ${execute_path}/worker_$i/ || exit | |||||
| export RANK_ID=$i | |||||
| export DEVICE_ID=$i | |||||
| python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >worker_$i.log 2>&1 & | |||||
| done | |||||
| if [[ "$MS_ROLE" == "MS_WORKER" ]]; then | |||||
| if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then | |||||
| rm -rf ${execute_path}/worker/ | |||||
| mkdir ${execute_path}/worker/ | |||||
| cd ${execute_path}/worker/ || exit | |||||
| mpirun --allow-run-as-root -n $LOCAL_WORKER_NUM \ | |||||
| python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \ | |||||
| --device_target=$DEVICE --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \ | |||||
| --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 & | |||||
| else | |||||
| for((i=0;i<$LOCAL_WORKER_NUM;i++)); | |||||
| do | |||||
| rm -rf ${execute_path}/worker_$i/ | |||||
| mkdir ${execute_path}/worker_$i/ | |||||
| cd ${execute_path}/worker_$i/ || exit | |||||
| export RANK_ID=$i | |||||
| export DEVICE_ID=$i | |||||
| python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \ | |||||
| --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \ | |||||
| --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker_$i.log 2>&1 & | |||||
| done | |||||
| fi | |||||
| fi | fi | ||||
@@ -0,0 +1,82 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#bash run_parameter_server_train_distribute.sh RANK_SIZE EPOCHS DEVICE_TARGET DATASET
+#                                              SERVER_NUM SCHED_HOST SCHED_PORT RANK_TABLE_FILE
+#                                              VOCAB_CACHE_SIZE
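+# Example (illustrative values only; replace the dataset and rank table paths,
+# scheduler host and port with your own):
+#   bash run_parameter_server_train_distribute.sh 8 15 Ascend /path/to/dataset \
+#     1 127.0.0.1 8081 /path/to/rank_table.json 300000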
+execute_path=$(pwd)
+script_self=$(readlink -f "$0")
+self_path=$(dirname "${script_self}")
+export RANK_SIZE=$1
+export EPOCH_SIZE=$2
+export DEVICE_TARGET=$3
+export DATASET=$4
+export MS_COMM_TYPE=zmq
+export MS_SCHED_NUM=1
+export MS_WORKER_NUM=$RANK_SIZE
+export MS_SERVER_NUM=$5
+export MS_SCHED_HOST=$6
+export MS_SCHED_PORT=$7
+export RANK_TABLE_FILE=$8
+export VOCAB_CACHE_SIZE=$9
+if [[ ! -n "$9" ]]; then
+  export VOCAB_CACHE_SIZE=0
+fi
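+
+# Launch the single scheduler process in the background, then one parameter
+# server process per $MS_SERVER_NUM, each in its own working directory.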
+export MS_ROLE=MS_SCHED
+rm -rf ${execute_path}/sched/
+mkdir ${execute_path}/sched/
+cd ${execute_path}/sched/ || exit
+python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
+  --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
+  --vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &
+
+export MS_ROLE=MS_PSERVER
+for((i=0;i<$MS_SERVER_NUM;i++));
+do
+  rm -rf ${execute_path}/server_$i/
+  mkdir ${execute_path}/server_$i/
+  cd ${execute_path}/server_$i/ || exit
+  python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
+    --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
+    --vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
+done
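+
+# Workers: on GPU a single mpirun launches all $RANK_SIZE worker processes; on
+# other targets (e.g. Ascend) each worker is started separately with its own
+# RANK_ID and DEVICE_ID.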
+export MS_ROLE=MS_WORKER
+if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
+  rm -rf ${execute_path}/worker/
+  mkdir ${execute_path}/worker/
+  cd ${execute_path}/worker/ || exit
+  mpirun --allow-run-as-root -n $RANK_SIZE \
+    python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
+    --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
+    --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
+else
+  for((i=0;i<$MS_WORKER_NUM;i++));
+  do
+    rm -rf ${execute_path}/worker_$i/
+    mkdir ${execute_path}/worker_$i/
+    cd ${execute_path}/worker_$i/ || exit
+    export RANK_ID=$i
+    export DEVICE_ID=$i
+    python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
+      --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
+      --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker_$i.log 2>&1 &
+  done
+fi
@@ -0,0 +1,79 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#bash run_parameter_server_train_standalone.sh EPOCHS DEVICE_TARGET DATASET SERVER_NUM SCHED_HOST
+#                                              SCHED_PORT DEVICE_ID VOCAB_CACHE_SIZE
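+# Example (illustrative values only; replace the dataset path, host and port
+# with your own):
+#   bash run_parameter_server_train_standalone.sh 15 GPU /path/to/dataset 1 127.0.0.1 8081 0 300000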
+execute_path=$(pwd)
+script_self=$(readlink -f "$0")
+self_path=$(dirname "${script_self}")
+export EPOCH_SIZE=$1
+export DEVICE_TARGET=$2
+export DATASET=$3
+export MS_COMM_TYPE=zmq
+export MS_SCHED_NUM=1
+export MS_WORKER_NUM=1
+export MS_SERVER_NUM=$4
+export MS_SCHED_HOST=$5
+export MS_SCHED_PORT=$6
+DEVICE_ID=$7
+export VOCAB_CACHE_SIZE=$8
+if [[ ! -n "$8" ]]; then
+  export VOCAB_CACHE_SIZE=0
+fi
+
+# Set device id
+if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
+  if [[ ! -n "$DEVICE_ID" ]]; then
+    export CUDA_VISIBLE_DEVICES=0
+  else
+    export CUDA_VISIBLE_DEVICES=$DEVICE_ID
+  fi
+else
+  if [[ ! -n "$DEVICE_ID" ]]; then
+    export DEVICE_ID=0
+  else
+    export DEVICE_ID=$DEVICE_ID
+  fi
+fi
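+
+# The scheduler, the parameter servers and the single worker all run on this
+# host, each in its own working directory.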
+export MS_ROLE=MS_SCHED
+rm -rf ${execute_path}/sched/
+mkdir ${execute_path}/sched/
+cd ${execute_path}/sched/ || exit
+python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
+  --epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
+  --vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &
+
+export MS_ROLE=MS_PSERVER
+for((i=0;i<$MS_SERVER_NUM;i++));
+do
+  rm -rf ${execute_path}/server_$i/
+  mkdir ${execute_path}/server_$i/
+  cd ${execute_path}/server_$i/ || exit
+  python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
+    --epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
+    --vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
+done
+
+export MS_ROLE=MS_WORKER
+rm -rf ${execute_path}/worker/
+mkdir ${execute_path}/worker/
+cd ${execute_path}/worker/ || exit
+python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
+  --epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
+  --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
@@ -238,6 +238,8 @@ class WideDeepModel(nn.Cell):
         elif parameter_server:
             cache_enable = self.vocab_cache_size > 0
             target = 'DEVICE' if cache_enable else 'CPU'
+            if not cache_enable:
+                sparse = True
             if is_auto_parallel and config.full_batch and cache_enable:
                 self.deep_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, self.emb_dim, target=target,
                                                                slice_mode=nn.EmbeddingLookup.TABLE_ROW_SLICE,
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-"""train_multinpu."""
+"""train distribute on parameter server."""


 import os
@@ -23,6 +23,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMoni
 from mindspore.context import ParallelMode
 from mindspore.communication.management import get_rank, get_group_size, init
 from mindspore.nn.wrap.cell_wrapper import VirtualDatasetCellTriple
+from mindspore.common import set_seed
 from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
 from src.callbacks import LossCallBack, EvalCallBack
@@ -32,18 +33,19 @@ from src.config import WideDeepConfig
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


-def get_WideDeep_net(config):
+def get_wide_deep_net(config):
     """
     Get network of wide&deep model.
     """
-    WideDeep_net = WideDeepModel(config)
-    loss_net = NetWithLossClass(WideDeep_net, config)
-    loss_net = VirtualDatasetCellTriple(loss_net)
+    wide_deep_net = WideDeepModel(config)
+    loss_net = NetWithLossClass(wide_deep_net, config)
+    if cache_enable:
+        loss_net = VirtualDatasetCellTriple(loss_net)
     train_net = TrainStepWrap(loss_net, parameter_server=bool(config.parameter_server),
-                              cache_enable=bool(config.vocab_cache_size > 0))
-    eval_net = PredictWithSigmoid(WideDeep_net)
-    eval_net = VirtualDatasetCellTriple(eval_net)
+                              cache_enable=(config.vocab_cache_size > 0))
+    eval_net = PredictWithSigmoid(wide_deep_net)
+    if cache_enable:
+        eval_net = VirtualDatasetCellTriple(eval_net)
     return train_net, eval_net
@@ -51,7 +53,6 @@ class ModelBuilder():
     """
     ModelBuilder
     """
-
     def __init__(self):
         pass
@@ -67,13 +68,14 @@ class ModelBuilder():
         return hooks

     def get_net(self, config):
-        return get_WideDeep_net(config)
+        return get_wide_deep_net(config)


 def train_and_eval(config):
     """
     test_train_eval
     """
+    set_seed(1000)
     data_path = config.data_path
     batch_size = config.batch_size
     epochs = config.epochs
@@ -83,6 +85,9 @@ def train_and_eval(config):
         dataset_type = DataType.MINDRECORD
     else:
         dataset_type = DataType.H5
+    parameter_server = bool(config.parameter_server)
+    if cache_enable:
+        config.full_batch = True
     print("epochs is {}".format(epochs))
     if config.full_batch:
         context.set_auto_parallel_context(full_batch=True)
@@ -107,35 +112,46 @@ def train_and_eval(config):
     train_net.set_train()
     auc_metric = AUCMetric()
-    model = Model(train_net, eval_network=eval_net,
-                  metrics={"auc": auc_metric})
+    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

-    eval_callback = EvalCallBack(
-        model, ds_eval, auc_metric, config)
+    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

-    callback = LossCallBack(config=config, per_print_times=20)
-    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
-                                  keep_checkpoint_max=5, integrated_save=False)
+    callback = LossCallBack(config=config)
+    if cache_enable:
+        ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
+                                      keep_checkpoint_max=5, integrated_save=False)
+    else:
+        ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
     ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
-                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/', config=ckptconfig)
-    context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
-    callback_list = [TimeMonitor(
-        ds_train.get_dataset_size()), eval_callback, callback]
-    callback_list.append(ckpoint_cb)
-    model.train(epochs, ds_train, callbacks=callback_list, dataset_sink_mode=True)
+                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
+                                 config=ckptconfig)
+    if cache_enable:
+        context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
+    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
+    if get_rank() == 0:
+        callback_list.append(ckpoint_cb)
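+    # Sink data only when training runs through the parameter server with the
+    # embedding cache enabled.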
+    model.train(epochs, ds_train,
+                callbacks=callback_list,
+                dataset_sink_mode=bool(parameter_server and cache_enable))


 if __name__ == "__main__":
     wide_deep_config = WideDeepConfig()
     wide_deep_config.argparse_init()
-    context.set_context(mode=context.GRAPH_MODE,
-                        device_target=wide_deep_config.device_target, save_graphs=True)
-    context.set_context(variable_memory_max_size="24GB")
+    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
+    cache_enable = wide_deep_config.vocab_cache_size > 0
+    if cache_enable and wide_deep_config.device_target != "GPU":
+        context.set_context(variable_memory_max_size="24GB")
     context.set_context(enable_sparse=True)
     context.set_ps_context(enable_ps=True)
     init()
     context.set_context(save_graphs_path='./graphs_of_device_id_'+str(get_rank()))
-    context.set_auto_parallel_context(
-        parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True)
+    if cache_enable:
+        context.set_auto_parallel_context(
+            parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True)
+    else:
+        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
+                                          device_num=get_group_size())
     train_and_eval(wide_deep_config)
@@ -12,15 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-"""train_multinpu."""
+"""train standalone on parameter server."""


 import os
 import sys

 from mindspore import Model, context
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
-from mindspore.context import ParallelMode
-from mindspore.communication.management import get_rank, get_group_size, init
 from mindspore.common import set_seed
 from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
@@ -33,15 +31,15 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 context.set_context(enable_sparse=True)


-def get_WideDeep_net(config):
+def get_wide_deep_net(config):
     """
     Get network of wide&deep model.
     """
-    WideDeep_net = WideDeepModel(config)
-    loss_net = NetWithLossClass(WideDeep_net, config)
+    wide_deep_net = WideDeepModel(config)
+    loss_net = NetWithLossClass(wide_deep_net, config)
     train_net = TrainStepWrap(loss_net, parameter_server=bool(config.parameter_server),
-                              cache_enable=bool(config.vocab_cache_size > 0))
-    eval_net = PredictWithSigmoid(WideDeep_net)
+                              cache_enable=(config.vocab_cache_size > 0))
+    eval_net = PredictWithSigmoid(wide_deep_net)
     return train_net, eval_net
@@ -64,7 +62,7 @@ class ModelBuilder():
         return hooks

     def get_net(self, config):
-        return get_WideDeep_net(config)
+        return get_wide_deep_net(config)


 def train_and_eval(config):
@@ -82,14 +80,12 @@ def train_and_eval(config):
     else:
         dataset_type = DataType.H5
     parameter_server = bool(config.parameter_server)
-    cache_enable = bool(config.vocab_cache_size > 0)
+    cache_enable = config.vocab_cache_size > 0
     print("epochs is {}".format(epochs))
     ds_train = create_dataset(data_path, train_mode=True, epochs=1,
-                              batch_size=batch_size, rank_id=get_rank(),
-                              rank_size=get_group_size(), data_type=dataset_type)
+                              batch_size=batch_size, data_type=dataset_type)
     ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
-                             batch_size=batch_size, rank_id=get_rank(),
-                             rank_size=get_group_size(), data_type=dataset_type)
+                             batch_size=batch_size, data_type=dataset_type)
     print("ds_train.size: {}".format(ds_train.get_dataset_size()))
     print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))
@@ -102,15 +98,11 @@ def train_and_eval(config):
     model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
     eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
     callback = LossCallBack(config=config)
     ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
-    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
-                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
-                                 config=ckptconfig)
-    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
-    if get_rank() == 0:
-        callback_list.append(ckpoint_cb)
+    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=config.ckpt_path, config=ckptconfig)
+    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb]
     model.train(epochs, ds_train,
                 callbacks=callback_list,
                 dataset_sink_mode=(parameter_server and cache_enable))
@@ -120,10 +112,7 @@ def train_and_eval(config):

 if __name__ == "__main__":
     wide_deep_config = WideDeepConfig()
     wide_deep_config.argparse_init()
-    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target)
+    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
     context.set_ps_context(enable_ps=True)
-    init()
-    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
-                                      device_num=get_group_size())
     train_and_eval(wide_deep_config)