@@ -1,58 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
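# Usage (positional arguments, consumed by the exports below):
#   bash <this_script>.sh RANK_SIZE EPOCH_SIZE DATASET SERVER_NUM SCHED_HOST SCHED_PORT
# Hypothetical example (paths and addresses are illustrative, not from the repo):
#   bash <this_script>.sh 8 15 /path/to/criteo_mindrecord 1 127.0.0.1 8081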
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6
export MS_ROLE=MS_SCHED
for((i=0;i<1;i++));
do
  rm -rf ${execute_path}/sched_$i/
  mkdir ${execute_path}/sched_$i/
  cd ${execute_path}/sched_$i/ || exit
  python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
    --device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
    --vocab_cache_size=300000 >sched_$i.log 2>&1 &
done
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
  rm -rf ${execute_path}/server_$i/
  mkdir ${execute_path}/server_$i/
  cd ${execute_path}/server_$i/ || exit
  python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
    --device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
    --vocab_cache_size=300000 >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
mpirun --allow-run-as-root -n $RANK_SIZE python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
  --device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
  --vocab_cache_size=300000 --full_batch=1 --dropout_flag=1 >worker.log 2>&1 &
@@ -1,68 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
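# Usage (positional arguments, consumed by the exports below):
#   bash <this_script>.sh RANK_SIZE EPOCH_SIZE DATASET RANK_TABLE_FILE SERVER_NUM SCHED_HOST SCHED_PORT
# Hypothetical example (paths and addresses are illustrative, not from the repo):
#   bash <this_script>.sh 8 15 /path/to/criteo_mindrecord /path/to/rank_table_8p.json 1 127.0.0.1 8081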
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$5
export MS_SCHED_HOST=$6
export MS_SCHED_PORT=$7
export MS_ROLE=MS_SCHED
for((i=0;i<1;i++));
do
  rm -rf ${execute_path}/sched_$i/
  mkdir ${execute_path}/sched_$i/
  cd ${execute_path}/sched_$i/ || exit
  export RANK_ID=$i
  export DEVICE_ID=$i
  python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
    --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
    --vocab_cache_size=300000 >sched_$i.log 2>&1 &
done
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
  rm -rf ${execute_path}/server_$i/
  mkdir ${execute_path}/server_$i/
  cd ${execute_path}/server_$i/ || exit
  export RANK_ID=$i
  export DEVICE_ID=$i
  python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
    --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
    --vocab_cache_size=300000 >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
for((i=0;i<$MS_WORKER_NUM;i++));
do
  rm -rf ${execute_path}/worker_$i/
  mkdir ${execute_path}/worker_$i/
  cd ${execute_path}/worker_$i/ || exit
  export RANK_ID=$i
  export DEVICE_ID=$i
  python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
    --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
    --vocab_cache_size=300000 --full_batch=1 --dropout_flag=1 >worker_$i.log 2>&1 &
done
@@ -1,56 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
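# Usage (positional arguments, consumed by the exports below):
#   bash <this_script>.sh EPOCH_SIZE DEVICE_TARGET DATASET SERVER_NUM SCHED_HOST SCHED_PORT
# Hypothetical example (paths and addresses are illustrative, not from the repo):
#   bash <this_script>.sh 15 Ascend /path/to/criteo_mindrecord 1 127.0.0.1 8081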
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export EPOCH_SIZE=$1
export DEVICE_TARGET=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=1
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6
export MS_ROLE=MS_SCHED
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
export DEVICE_ID=0  # the single scheduler process uses device 0 (no loop variable exists here)
python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
  --parameter_server=1 --vocab_cache_size=300000 >sched.log 2>&1 &
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
  rm -rf ${execute_path}/server_$i/
  mkdir ${execute_path}/server_$i/
  cd ${execute_path}/server_$i/ || exit
  export DEVICE_ID=$i
  python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
    --parameter_server=1 --vocab_cache_size=300000 >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
export DEVICE_ID=0  # the single worker process uses device 0 (not the stale server-loop index)
python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
  --parameter_server=1 --vocab_cache_size=300000 \
  --dropout_flag=1 >worker.log 2>&1 &
@@ -1,63 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
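# Usage (positional arguments, consumed by the exports below):
#   bash <this_script>.sh RANK_SIZE EPOCH_SIZE DATASET RANK_TABLE_FILE SERVER_NUM SCHED_HOST SCHED_PORT
# Hypothetical example (paths and addresses are illustrative, not from the repo):
#   bash <this_script>.sh 8 15 /path/to/criteo_mindrecord /path/to/rank_table_8p.json 1 127.0.0.1 8081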
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$5
export MS_SCHED_HOST=$6
export MS_SCHED_PORT=$7
export MS_ROLE=MS_SCHED
for((i=0;i<1;i++));
do
  rm -rf ${execute_path}/sched_$i/
  mkdir ${execute_path}/sched_$i/
  cd ${execute_path}/sched_$i/ || exit
  export RANK_ID=$i
  export DEVICE_ID=$i
  python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >sched_$i.log 2>&1 &
done
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
  rm -rf ${execute_path}/server_$i/
  mkdir ${execute_path}/server_$i/
  cd ${execute_path}/server_$i/ || exit
  export RANK_ID=$i
  export DEVICE_ID=$i
  python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
for((i=0;i<$MS_WORKER_NUM;i++));
do
  rm -rf ${execute_path}/worker_$i/
  mkdir ${execute_path}/worker_$i/
  cd ${execute_path}/worker_$i/ || exit
  export RANK_ID=$i
  export DEVICE_ID=$i
  python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >worker_$i.log 2>&1 &
done
@@ -14,17 +14,17 @@
# limitations under the License.
# ============================================================================
#bash run_parameter_server_train_cluster.sh RANK_SIZE EPOCHS DEVICE_TARGET DATASET
#                                           LOCAL_WORKER_NUM LOCAL_SERVER_NUM SERVER_NUM
#                                           SCHED_HOST SCHED_PORT ROLE RANK_TABLE_FILE VOCAB_CACHE_SIZE
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
#bash run_parameter_server_train_cluster.sh RANK_SIZE EPOCHS DATASET RANK_TABLE_FILE
#                                           LOCAL_WORKER_NUM LOCAL_SERVER_NUM SERVER_NUM
#                                           SCHED_HOST SCHED_PORT ROLE
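# Hypothetical example for the revised argument order (RANK_SIZE EPOCHS
# DEVICE_TARGET DATASET LOCAL_WORKER_NUM LOCAL_SERVER_NUM SERVER_NUM SCHED_HOST
# SCHED_PORT ROLE RANK_TABLE_FILE VOCAB_CACHE_SIZE); all values are illustrative
# only. The same command line is repeated on every node, changing only ROLE:
#   bash run_parameter_server_train_cluster.sh 8 15 Ascend /data/criteo 8 8 8 10.0.0.1 8081 MS_SCHED /data/rank_table.json 300000
# with MS_PSERVER on the server nodes and MS_WORKER on the worker nodes.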
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export DEVICE_TARGET=$3
export DATASET=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
@@ -35,41 +35,56 @@ export MS_SERVER_NUM=$7
export MS_SCHED_HOST=$8
export MS_SCHED_PORT=$9
export MS_ROLE=${10}
echo "=====Role is $MS_ROLE======"
export RANK_TABLE_FILE=${11}
export VOCAB_CACHE_SIZE=${12}
if [[ ! -n "${12}" ]]; then
  export VOCAB_CACHE_SIZE=0
fi
echo "=====Role is $MS_ROLE======"
if [ "$MS_ROLE" == "MS_SCHED" ];then
  for((i=0;i<1;i++));
  do
    rm -rf ${execute_path}/sched_$i/
    mkdir ${execute_path}/sched_$i/
    cd ${execute_path}/sched_$i/ || exit
    export RANK_ID=$i
    export DEVICE_ID=$i
    python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >sched_$i.log 2>&1 &
  done
if [[ "$MS_ROLE" == "MS_SCHED" ]]; then
  rm -rf ${execute_path}/sched/
  mkdir ${execute_path}/sched/
  cd ${execute_path}/sched/ || exit
  python -s ${self_path}/../train_and_eval_parameter_server_distribute.py --device_target=$DEVICE_TARGET \
    --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
    --vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &
fi
if [ "$MS_ROLE" == "MS_PSERVER" ];then
  for((i=0;i<$LOCAL_SERVER_NUM;i++));
  do
    rm -rf ${execute_path}/server_$i/
    mkdir ${execute_path}/server_$i/
    cd ${execute_path}/server_$i/ || exit
    export RANK_ID=$i
    export DEVICE_ID=$i
    python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >server_$i.log 2>&1 &
  done
if [[ "$MS_ROLE" == "MS_PSERVER" ]]; then
  for((i=0;i<$LOCAL_SERVER_NUM;i++));
  do
    rm -rf ${execute_path}/server_$i/
    mkdir ${execute_path}/server_$i/
    cd ${execute_path}/server_$i/ || exit
    python -s ${self_path}/../train_and_eval_parameter_server_distribute.py --device_target=$DEVICE_TARGET \
      --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
      --vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
  done
fi
if [ "$MS_ROLE" == "MS_WORKER" ];then
  for((i=0;i<$LOCAL_WORKER_NUM;i++));
  do
    rm -rf ${execute_path}/worker_$i/
    mkdir ${execute_path}/worker_$i/
    cd ${execute_path}/worker_$i/ || exit
    export RANK_ID=$i
    export DEVICE_ID=$i
    python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >worker_$i.log 2>&1 &
  done
if [[ "$MS_ROLE" == "MS_WORKER" ]]; then
  if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
    rm -rf ${execute_path}/worker/
    mkdir ${execute_path}/worker/
    cd ${execute_path}/worker/ || exit
    mpirun --allow-run-as-root -n $LOCAL_WORKER_NUM \
      python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
      --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
      --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
  else
    for((i=0;i<$LOCAL_WORKER_NUM;i++));
    do
      rm -rf ${execute_path}/worker_$i/
      mkdir ${execute_path}/worker_$i/
      cd ${execute_path}/worker_$i/ || exit
      export RANK_ID=$i
      export DEVICE_ID=$i
      python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
        --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
        --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker_$i.log 2>&1 &
    done
  fi
fi
@@ -0,0 +1,82 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
#bash run_parameter_server_train_distribute.sh RANK_SIZE EPOCHS DEVICE_TARGET DATASET
#                                              SERVER_NUM SCHED_HOST SCHED_PORT RANK_TABLE_FILE
#                                              VOCAB_CACHE_SIZE
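# Hypothetical example (paths and addresses are illustrative, not from the
# repo): 8 workers, 15 epochs on GPU; RANK_TABLE_FILE is only consumed on
# Ascend, and VOCAB_CACHE_SIZE defaults to 0 when omitted:
#   bash run_parameter_server_train_distribute.sh 8 15 GPU /data/criteo_mindrecord 1 127.0.0.1 8081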
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DEVICE_TARGET=$3
export DATASET=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$5
export MS_SCHED_HOST=$6
export MS_SCHED_PORT=$7
export RANK_TABLE_FILE=$8
export VOCAB_CACHE_SIZE=$9
if [[ ! -n "$9" ]]; then
  export VOCAB_CACHE_SIZE=0
fi

export MS_ROLE=MS_SCHED
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
  --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
  --vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &

export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
  rm -rf ${execute_path}/server_$i/
  mkdir ${execute_path}/server_$i/
  cd ${execute_path}/server_$i/ || exit
  python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
    --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
    --vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
done

export MS_ROLE=MS_WORKER
if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
  rm -rf ${execute_path}/worker/
  mkdir ${execute_path}/worker/
  cd ${execute_path}/worker/ || exit
  mpirun --allow-run-as-root -n $RANK_SIZE \
    python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
    --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
    --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
else
  for((i=0;i<$MS_WORKER_NUM;i++));
  do
    rm -rf ${execute_path}/worker_$i/
    mkdir ${execute_path}/worker_$i/
    cd ${execute_path}/worker_$i/ || exit
    export RANK_ID=$i
    export DEVICE_ID=$i
    python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
      --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
      --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker_$i.log 2>&1 &
  done
fi
@@ -0,0 +1,79 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
#bash run_parameter_server_train_standalone.sh EPOCHS DEVICE_TARGET DATASET SERVER_NUM SCHED_HOST
#                                              SCHED_PORT DEVICE_ID VOCAB_CACHE_SIZE
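# Hypothetical example (paths and addresses are illustrative, not from the
# repo): 15 epochs on GPU device 0, one server, a 300000-row embedding cache:
#   bash run_parameter_server_train_standalone.sh 15 GPU /data/criteo_mindrecord 1 127.0.0.1 8081 0 300000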
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export EPOCH_SIZE=$1
export DEVICE_TARGET=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=1
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6
DEVICE_ID=$7
export VOCAB_CACHE_SIZE=$8
if [[ ! -n "$8" ]]; then
  export VOCAB_CACHE_SIZE=0
fi

# Set device id
if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
  if [[ ! -n "$DEVICE_ID" ]]; then
    export CUDA_VISIBLE_DEVICES=0
  else
    export CUDA_VISIBLE_DEVICES=$DEVICE_ID
  fi
else
  if [[ ! -n "$DEVICE_ID" ]]; then
    export DEVICE_ID=0
  else
    export DEVICE_ID=$DEVICE_ID
  fi
fi

export MS_ROLE=MS_SCHED
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
  --epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
  --vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &

export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
  rm -rf ${execute_path}/server_$i/
  mkdir ${execute_path}/server_$i/
  cd ${execute_path}/server_$i/ || exit
  python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
    --epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
    --vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
done

export MS_ROLE=MS_WORKER
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
  --epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
  --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
@@ -238,6 +238,8 @@ class WideDeepModel(nn.Cell):
        elif parameter_server:
            cache_enable = self.vocab_cache_size > 0
            target = 'DEVICE' if cache_enable else 'CPU'
            if not cache_enable:
                sparse = True
            if is_auto_parallel and config.full_batch and cache_enable:
                self.deep_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, self.emb_dim, target=target,
                                                               slice_mode=nn.EmbeddingLookup.TABLE_ROW_SLICE,
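# A minimal sketch (illustration only, not part of the patch) of what the hunk
# above decides in parameter-server mode. The helper name is hypothetical; the
# model inlines this logic:
def ps_embedding_settings(vocab_cache_size, sparse):
    cache_enable = vocab_cache_size > 0           # cache embedding rows on the device?
    target = 'DEVICE' if cache_enable else 'CPU'  # where the lookup table lives
    if not cache_enable:
        sparse = True                             # no cache: force sparse lookup, as the new lines do
    return cache_enable, target, sparse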
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train_multinpu."""
"""Distributed training and evaluation with a parameter server."""
import os
@@ -23,6 +23,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMoni
from mindspore.context import ParallelMode
from mindspore.communication.management import get_rank, get_group_size, init
from mindspore.nn.wrap.cell_wrapper import VirtualDatasetCellTriple
from mindspore.common import set_seed
from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
from src.callbacks import LossCallBack, EvalCallBack
@@ -32,18 +33,19 @@ from src.config import WideDeepConfig
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


def get_WideDeep_net(config):
def get_wide_deep_net(config):
    """
    Get network of wide&deep model.
    """
    WideDeep_net = WideDeepModel(config)
    loss_net = NetWithLossClass(WideDeep_net, config)
    loss_net = VirtualDatasetCellTriple(loss_net)
    wide_deep_net = WideDeepModel(config)
    loss_net = NetWithLossClass(wide_deep_net, config)
    if cache_enable:
        loss_net = VirtualDatasetCellTriple(loss_net)
    train_net = TrainStepWrap(loss_net, parameter_server=bool(config.parameter_server),
                              cache_enable=bool(config.vocab_cache_size > 0))
    eval_net = PredictWithSigmoid(WideDeep_net)
    eval_net = VirtualDatasetCellTriple(eval_net)
                              cache_enable=(config.vocab_cache_size > 0))
    eval_net = PredictWithSigmoid(wide_deep_net)
    if cache_enable:
        eval_net = VirtualDatasetCellTriple(eval_net)
    return train_net, eval_net
@@ -51,7 +53,6 @@ class ModelBuilder():
    """
    ModelBuilder
    """

    def __init__(self):
        pass
@@ -67,13 +68,14 @@ class ModelBuilder():
        return hooks

    def get_net(self, config):
        return get_WideDeep_net(config)
        return get_wide_deep_net(config)


def train_and_eval(config):
    """
    test_train_eval
    """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
@@ -83,6 +85,9 @@ def train_and_eval(config):
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    parameter_server = bool(config.parameter_server)
    if cache_enable:
        config.full_batch = True
    print("epochs is {}".format(epochs))
    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
@@ -107,35 +112,46 @@ def train_and_eval(config):
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(
        model, ds_eval, auc_metric, config)
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config, per_print_times=20)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
                                  keep_checkpoint_max=5, integrated_save=False)
    callback = LossCallBack(config=config)
    if cache_enable:
        ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
                                      keep_checkpoint_max=5, integrated_save=False)
    else:
        ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/', config=ckptconfig)
    context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
    callback_list = [TimeMonitor(
        ds_train.get_dataset_size()), eval_callback, callback]
    callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train, callbacks=callback_list, dataset_sink_mode=True)
                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
                                 config=ckptconfig)
    if cache_enable:
        context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
    if get_rank() == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train,
                callbacks=callback_list,
                dataset_sink_mode=bool(parameter_server and cache_enable))


if __name__ == "__main__":
    wide_deep_config = WideDeepConfig()
    wide_deep_config.argparse_init()
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=wide_deep_config.device_target, save_graphs=True)
    context.set_context(variable_memory_max_size="24GB")
    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
    cache_enable = wide_deep_config.vocab_cache_size > 0
    if cache_enable and wide_deep_config.device_target != "GPU":
        context.set_context(variable_memory_max_size="24GB")
    context.set_context(enable_sparse=True)
    context.set_ps_context(enable_ps=True)
    init()
    context.set_context(save_graphs_path='./graphs_of_device_id_'+str(get_rank()))
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True)
    if cache_enable:
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True)
    else:
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                          device_num=get_group_size())
    train_and_eval(wide_deep_config)
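# A minimal sketch (illustration only, not part of the patch): every branch the
# revised distribute script takes hangs off the same cache predicate. The helper
# name below is hypothetical; the script inlines each decision where needed.
def ps_run_mode(vocab_cache_size):
    cache_enable = vocab_cache_size > 0
    return {
        'full_batch': cache_enable,            # forced on when the embedding cache is enabled
        'parallel_mode': 'AUTO_PARALLEL' if cache_enable else 'DATA_PARALLEL',
        'virtual_dataset_wrap': cache_enable,  # VirtualDatasetCellTriple only with the cache
        'dataset_sink_mode': cache_enable,     # sinking only for cached parameter-server runs
    }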
@@ -12,15 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train_multinpu."""
"""Standalone training and evaluation with a parameter server."""
import os
import sys
from mindspore import Model, context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
from mindspore.context import ParallelMode
from mindspore.communication.management import get_rank, get_group_size, init
from mindspore.common import set_seed
from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
@@ -33,15 +31,15 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
context.set_context(enable_sparse=True)


def get_WideDeep_net(config):
def get_wide_deep_net(config):
    """
    Get network of wide&deep model.
    """
    WideDeep_net = WideDeepModel(config)
    loss_net = NetWithLossClass(WideDeep_net, config)
    wide_deep_net = WideDeepModel(config)
    loss_net = NetWithLossClass(wide_deep_net, config)
    train_net = TrainStepWrap(loss_net, parameter_server=bool(config.parameter_server),
                              cache_enable=bool(config.vocab_cache_size > 0))
    eval_net = PredictWithSigmoid(WideDeep_net)
                              cache_enable=(config.vocab_cache_size > 0))
    eval_net = PredictWithSigmoid(wide_deep_net)
    return train_net, eval_net
@@ -64,7 +62,7 @@ class ModelBuilder():
        return hooks

    def get_net(self, config):
        return get_WideDeep_net(config)
        return get_wide_deep_net(config)


def train_and_eval(config):
@@ -82,14 +80,12 @@ def train_and_eval(config):
    else:
        dataset_type = DataType.H5
    parameter_server = bool(config.parameter_server)
    cache_enable = bool(config.vocab_cache_size > 0)
    cache_enable = config.vocab_cache_size > 0
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=batch_size, rank_id=get_rank(),
                              rank_size=get_group_size(), data_type=dataset_type)
                              batch_size=batch_size, data_type=dataset_type)
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, rank_id=get_rank(),
                             rank_size=get_group_size(), data_type=dataset_type)
                             batch_size=batch_size, data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))
@@ -102,15 +98,11 @@ def train_and_eval(config):
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
                                 config=ckptconfig)
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
    if get_rank() == 0:
        callback_list.append(ckpoint_cb)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=config.ckpt_path, config=ckptconfig)
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb]
    model.train(epochs, ds_train,
                callbacks=callback_list,
                dataset_sink_mode=(parameter_server and cache_enable))
@@ -120,10 +112,7 @@ if __name__ == "__main__":
    wide_deep_config = WideDeepConfig()
    wide_deep_config.argparse_init()
    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target)
    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
    context.set_ps_context(enable_ps=True)
    init()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                      device_num=get_group_size())
    train_and_eval(wide_deep_config)