
Refine model zoo scripts for parameter server cache

Tag: v1.1.0
Author: lizhenyu
Commit: b70bc5b9d0
10 changed files with 274 additions and 336 deletions

  1. +0   -58  model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_cache_multigpu_train.sh
  2. +0   -68  model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_cache_multinpu_train.sh
  3. +0   -56  model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_cache_standalone_train.sh
  4. +0   -63  model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train.sh
  5. +52  -37  model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train_cluster.sh
  6. +82  -0   model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train_distribute.sh
  7. +79  -0   model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train_standalone.sh
  8. +2   -0   model_zoo/official/recommend/wide_and_deep/src/wide_and_deep.py
  9. +45  -29  model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_distribute.py
  10. +14  -25  model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_standalone.py

+0  -58  model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_cache_multigpu_train.sh

@@ -1,58 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6

export MS_ROLE=MS_SCHED
for((i=0;i<1;i++));
do
rm -rf ${execute_path}/sched_$i/
mkdir ${execute_path}/sched_$i/
cd ${execute_path}/sched_$i/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 >sched_$i.log 2>&1 &
done

export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 >server_$i.log 2>&1 &
done

export MS_ROLE=MS_WORKER
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
mpirun --allow-run-as-root -n $RANK_SIZE python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 --full_batch=1 --dropout_flag=1 >worker.log 2>&1 &

+0  -68  model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_cache_multinpu_train.sh

@@ -1,68 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$5
export MS_SCHED_HOST=$6
export MS_SCHED_PORT=$7

export MS_ROLE=MS_SCHED
for((i=0;i<1;i++));
do
rm -rf ${execute_path}/sched_$i/
mkdir ${execute_path}/sched_$i/
cd ${execute_path}/sched_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 >sched_$i.log 2>&1 &
done

export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 >server_$i.log 2>&1 &
done

export MS_ROLE=MS_WORKER
for((i=0;i<$MS_WORKER_NUM;i++));
do
rm -rf ${execute_path}/worker_$i/
mkdir ${execute_path}/worker_$i/
cd ${execute_path}/worker_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
--data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=300000 --full_batch=1 --dropout_flag=1 >worker_$i.log 2>&1 &
done

+0  -56  model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_cache_standalone_train.sh

@@ -1,56 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export EPOCH_SIZE=$1
export DEVICE_TARGET=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=1
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6

export MS_ROLE=MS_SCHED
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
--parameter_server=1 --vocab_cache_size=300000 >sched.log 2>&1 &

export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
--parameter_server=1 --vocab_cache_size=300000 >server_$i.log 2>&1 &
done

export MS_ROLE=MS_WORKER
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
--parameter_server=1 --vocab_cache_size=300000 \
--dropout_flag=1 >worker.log 2>&1 &

+0  -63  model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train.sh

@@ -1,63 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4

export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$5
export MS_SCHED_HOST=$6
export MS_SCHED_PORT=$7

export MS_ROLE=MS_SCHED
for((i=0;i<1;i++));
do
rm -rf ${execute_path}/sched_$i/
mkdir ${execute_path}/sched_$i/
cd ${execute_path}/sched_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >sched_$i.log 2>&1 &
done

export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >server_$i.log 2>&1 &
done

export MS_ROLE=MS_WORKER
for((i=0;i<$MS_WORKER_NUM;i++));
do
rm -rf ${execute_path}/worker_$i/
mkdir ${execute_path}/worker_$i/
cd ${execute_path}/worker_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >worker_$i.log 2>&1 &
done

+52  -37  model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train_cluster.sh

@@ -14,17 +14,17 @@
# limitations under the License.
# ============================================================================

+#bash run_parameter_server_train_cluster.sh RANK_SIZE EPOCHS DEVICE_TARGET DATASET
+# LOCAL_WORKER_NUM LOCAL_SERVER_NUM SERVER_NUM
+# SCHED_HOST SCHED_PORT ROLE RANK_TABLE_FILE VOCAB_CACHE_SIZE
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")

-#bash run_parameter_server_train_cluster.sh RANK_SIZE EPOCHS DATASET RANK_TABLE_FILE
-# LOCAL_WORKER_NUM LOCAL_SERVER_NUM SERVER_NUM
-# SCHED_HOST SCHED_PORT ROLE
export RANK_SIZE=$1
export EPOCH_SIZE=$2
-export DATASET=$3
-export RANK_TABLE_FILE=$4
+export DEVICE_TARGET=$3
+export DATASET=$4

export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
@@ -35,41 +35,56 @@ export MS_SERVER_NUM=$7
export MS_SCHED_HOST=$8
export MS_SCHED_PORT=$9
export MS_ROLE=${10}
-echo "=====Role is $MS_ROLE======"
+export RANK_TABLE_FILE=${11}
+export VOCAB_CACHE_SIZE=${12}
+
+if [[ ! -n "${12}" ]]; then
+export VOCAB_CACHE_SIZE=0
+fi
+
+echo "=====Role is $MS_ROLE======"

-if [ "$MS_ROLE" == "MS_SCHED" ];then
-for((i=0;i<1;i++));
-do
-rm -rf ${execute_path}/sched_$i/
-mkdir ${execute_path}/sched_$i/
-cd ${execute_path}/sched_$i/ || exit
-export RANK_ID=$i
-export DEVICE_ID=$i
-python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >sched_$i.log 2>&1 &
-done
+if [[ "$MS_ROLE" == "MS_SCHED" ]]; then
+rm -rf ${execute_path}/sched/
+mkdir ${execute_path}/sched/
+cd ${execute_path}/sched/ || exit
+python -s ${self_path}/../train_and_eval_parameter_server_distribute.py --device_target=$DEVICE_TARGET \
+--data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
+--vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &
fi

-if [ "$MS_ROLE" == "MS_PSERVER" ];then
-for((i=0;i<$LOCAL_SERVER_NUM;i++));
-do
-rm -rf ${execute_path}/server_$i/
-mkdir ${execute_path}/server_$i/
-cd ${execute_path}/server_$i/ || exit
-export RANK_ID=$i
-export DEVICE_ID=$i
-python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >server_$i.log 2>&1 &
-done
+if [[ "$MS_ROLE" == "MS_PSERVER" ]]; then
+for((i=0;i<$LOCAL_SERVER_NUM;i++));
+do
+rm -rf ${execute_path}/server_$i/
+mkdir ${execute_path}/server_$i/
+cd ${execute_path}/server_$i/ || exit
+python -s ${self_path}/../train_and_eval_parameter_server_distribute.py --device_target=$DEVICE_TARGET \
+--data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
+--vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
+done
fi

-if [ "$MS_ROLE" == "MS_WORKER" ];then
-for((i=0;i<$LOCAL_WORKER_NUM;i++));
-do
-rm -rf ${execute_path}/worker_$i/
-mkdir ${execute_path}/worker_$i/
-cd ${execute_path}/worker_$i/ || exit
-export RANK_ID=$i
-export DEVICE_ID=$i
-python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >worker_$i.log 2>&1 &
-done
+if [[ "$MS_ROLE" == "MS_WORKER" ]]; then
+if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
+rm -rf ${execute_path}/worker/
+mkdir ${execute_path}/worker/
+cd ${execute_path}/worker/ || exit
+mpirun --allow-run-as-root -n $LOCAL_WORKER_NUM \
+python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
+--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
+--vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
+else
+for((i=0;i<$LOCAL_WORKER_NUM;i++));
+do
+rm -rf ${execute_path}/worker_$i/
+mkdir ${execute_path}/worker_$i/
+cd ${execute_path}/worker_$i/ || exit
+export RANK_ID=$i
+export DEVICE_ID=$i
+python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
+--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
+--vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker_$i.log 2>&1 &
+done
+fi
fi
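
A hypothetical per-node launch of the refactored cluster script, for orientation only (every value below is an illustrative placeholder, not taken from this commit; the argument order follows the usage comment above):

bash run_parameter_server_train_cluster.sh 16 10 Ascend /path/to/criteo_mindrecord 8 4 8 10.0.0.1 8081 MS_SCHED /path/to/rank_table.json 300000    # scheduler node
bash run_parameter_server_train_cluster.sh 16 10 Ascend /path/to/criteo_mindrecord 8 4 8 10.0.0.1 8081 MS_PSERVER /path/to/rank_table.json 300000   # parameter server node
bash run_parameter_server_train_cluster.sh 16 10 Ascend /path/to/criteo_mindrecord 8 4 8 10.0.0.1 8081 MS_WORKER /path/to/rank_table.json 300000    # worker node

The same command runs on each machine; only the ROLE argument changes, and VOCAB_CACHE_SIZE falls back to 0 when the last argument is omitted.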

+82  -0  model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train_distribute.sh

@@ -0,0 +1,82 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================


#bash run_parameter_server_train_distribute.sh RANK_SIZE EPOCHS DEVICE_TARGET DATASET
# SERVER_NUM SCHED_HOST SCHED_PORT RANK_TABLE_FILE
# VOCAB_CACHE_SIZE
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DEVICE_TARGET=$3
export DATASET=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$5
export MS_SCHED_HOST=$6
export MS_SCHED_PORT=$7
export RANK_TABLE_FILE=$8
export VOCAB_CACHE_SIZE=$9

if [[ ! -n "$9" ]]; then
export VOCAB_CACHE_SIZE=0
fi

export MS_ROLE=MS_SCHED
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &

export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
done

export MS_ROLE=MS_WORKER
if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
mpirun --allow-run-as-root -n $RANK_SIZE \
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
else
for((i=0;i<$MS_WORKER_NUM;i++));
do
rm -rf ${execute_path}/worker_$i/
mkdir ${execute_path}/worker_$i/
cd ${execute_path}/worker_$i/ || exit
export RANK_ID=$i
export DEVICE_ID=$i
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
--device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker_$i.log 2>&1 &
done
fi
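
A hypothetical invocation of this new distribute script (placeholder values, not taken from this commit; argument order per the usage comment at the top of the script):

bash run_parameter_server_train_distribute.sh 8 10 Ascend /path/to/criteo_mindrecord 8 127.0.0.1 8081 /path/to/rank_table_8pcs.json 300000

The trailing VOCAB_CACHE_SIZE is optional and defaults to 0; on a GPU target the script launches the workers through mpirun instead of one process per DEVICE_ID.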


+79  -0  model_zoo/official/recommend/wide_and_deep/script/run_parameter_server_train_standalone.sh

@@ -0,0 +1,79 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================


#bash run_parameter_server_train_standalone.sh EPOCHS DEVICE_TARGET DATASET SERVER_NUM SCHED_HOST
# SCHED_PORT DEVICE_ID VOCAB_CACHE_SIZE
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export EPOCH_SIZE=$1
export DEVICE_TARGET=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=1
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6
DEVICE_ID=$7
export VOCAB_CACHE_SIZE=$8

if [[ ! -n "$8" ]]; then
export VOCAB_CACHE_SIZE=0
fi

# Set device id
if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
if [[ ! -n "$DEVICE_ID" ]]; then
export CUDA_VISIBLE_DEVICES=0
else
export CUDA_VISIBLE_DEVICES=$DEVICE_ID
fi
else
if [[ ! -n "$DEVICE_ID" ]]; then
export DEVICE_ID=0
else
export DEVICE_ID=$DEVICE_ID
fi
fi

export MS_ROLE=MS_SCHED
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
--epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &

export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
rm -rf ${execute_path}/server_$i/
mkdir ${execute_path}/server_$i/
cd ${execute_path}/server_$i/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
--epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
done

export MS_ROLE=MS_WORKER
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
--epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
--vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
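
A hypothetical invocation of this new standalone script (placeholder values, not taken from this commit):

bash run_parameter_server_train_standalone.sh 10 GPU /path/to/criteo_mindrecord 1 127.0.0.1 8081 0 300000

Here DEVICE_ID maps to CUDA_VISIBLE_DEVICES on GPU and to DEVICE_ID on Ascend, and both DEVICE_ID and VOCAB_CACHE_SIZE fall back to 0 when omitted.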

+2  -0  model_zoo/official/recommend/wide_and_deep/src/wide_and_deep.py

@@ -238,6 +238,8 @@ class WideDeepModel(nn.Cell):
        elif parameter_server:
            cache_enable = self.vocab_cache_size > 0
            target = 'DEVICE' if cache_enable else 'CPU'
+            if not cache_enable:
+                sparse = True
        if is_auto_parallel and config.full_batch and cache_enable:
            self.deep_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, self.emb_dim, target=target,
                                                           slice_mode=nn.EmbeddingLookup.TABLE_ROW_SLICE,


model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_cache_distribute.py → model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_distribute.py

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
-"""train_multinpu."""
+"""train distribute on parameter server."""


import os
@@ -23,6 +23,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMoni
from mindspore.context import ParallelMode
from mindspore.communication.management import get_rank, get_group_size, init
from mindspore.nn.wrap.cell_wrapper import VirtualDatasetCellTriple
+from mindspore.common import set_seed

from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
from src.callbacks import LossCallBack, EvalCallBack
@@ -32,18 +33,19 @@ from src.config import WideDeepConfig


sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


-def get_WideDeep_net(config):
+def get_wide_deep_net(config):
    """
    Get network of wide&deep model.
    """
-    WideDeep_net = WideDeepModel(config)
-    loss_net = NetWithLossClass(WideDeep_net, config)
-    loss_net = VirtualDatasetCellTriple(loss_net)
+    wide_deep_net = WideDeepModel(config)
+    loss_net = NetWithLossClass(wide_deep_net, config)
+    if cache_enable:
+        loss_net = VirtualDatasetCellTriple(loss_net)
    train_net = TrainStepWrap(loss_net, parameter_server=bool(config.parameter_server),
-                              cache_enable=bool(config.vocab_cache_size > 0))
-    eval_net = PredictWithSigmoid(WideDeep_net)
-    eval_net = VirtualDatasetCellTriple(eval_net)
+                              cache_enable=(config.vocab_cache_size > 0))
+    eval_net = PredictWithSigmoid(wide_deep_net)
+    if cache_enable:
+        eval_net = VirtualDatasetCellTriple(eval_net)
    return train_net, eval_net




@@ -51,7 +53,6 @@ class ModelBuilder():
""" """
ModelBuilder ModelBuilder
""" """

def __init__(self): def __init__(self):
pass pass


@@ -67,13 +68,14 @@ class ModelBuilder():
        return hooks

    def get_net(self, config):
-        return get_WideDeep_net(config)
+        return get_wide_deep_net(config)


def train_and_eval(config):
    """
    test_train_eval
    """
+    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
@@ -83,6 +85,9 @@ def train_and_eval(config):
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
+    parameter_server = bool(config.parameter_server)
+    if cache_enable:
+        config.full_batch = True
    print("epochs is {}".format(epochs))
    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
@@ -107,35 +112,46 @@ def train_and_eval(config):
    train_net.set_train()
    auc_metric = AUCMetric()

-    model = Model(train_net, eval_network=eval_net,
-                  metrics={"auc": auc_metric})
+    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

-    eval_callback = EvalCallBack(
-        model, ds_eval, auc_metric, config)
+    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

-    callback = LossCallBack(config=config, per_print_times=20)
-    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
-                                  keep_checkpoint_max=5, integrated_save=False)
+    callback = LossCallBack(config=config)
+    if cache_enable:
+        ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
+                                      keep_checkpoint_max=5, integrated_save=False)
+    else:
+        ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
-                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/', config=ckptconfig)
-    context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
-    callback_list = [TimeMonitor(
-        ds_train.get_dataset_size()), eval_callback, callback]
-    callback_list.append(ckpoint_cb)
-    model.train(epochs, ds_train, callbacks=callback_list, dataset_sink_mode=True)
+                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
+                                 config=ckptconfig)
+    if cache_enable:
+        context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
+    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
+    if get_rank() == 0:
+        callback_list.append(ckpoint_cb)
+    model.train(epochs, ds_train,
+                callbacks=callback_list,
+                dataset_sink_mode=bool(parameter_server and cache_enable))


if __name__ == "__main__":
    wide_deep_config = WideDeepConfig()
    wide_deep_config.argparse_init()
-    context.set_context(mode=context.GRAPH_MODE,
-                        device_target=wide_deep_config.device_target, save_graphs=True)
-    context.set_context(variable_memory_max_size="24GB")
+    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
+    cache_enable = wide_deep_config.vocab_cache_size > 0
+    if cache_enable and wide_deep_config.device_target != "GPU":
+        context.set_context(variable_memory_max_size="24GB")
    context.set_context(enable_sparse=True)
    context.set_ps_context(enable_ps=True)
    init()
    context.set_context(save_graphs_path='./graphs_of_device_id_'+str(get_rank()))

-    context.set_auto_parallel_context(
-        parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True)
+    if cache_enable:
+        context.set_auto_parallel_context(
+            parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True)
+    else:
+        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
+                                          device_num=get_group_size())

    train_and_eval(wide_deep_config)

model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server.py → model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_standalone.py

@@ -12,15 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
-"""train_multinpu."""
+"""train standalone on parameter server."""


import os
import sys
from mindspore import Model, context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
-from mindspore.context import ParallelMode
-from mindspore.communication.management import get_rank, get_group_size, init
from mindspore.common import set_seed

from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
@@ -33,15 +31,15 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
context.set_context(enable_sparse=True)


-def get_WideDeep_net(config):
+def get_wide_deep_net(config):
    """
    Get network of wide&deep model.
    """
-    WideDeep_net = WideDeepModel(config)
-    loss_net = NetWithLossClass(WideDeep_net, config)
+    wide_deep_net = WideDeepModel(config)
+    loss_net = NetWithLossClass(wide_deep_net, config)
    train_net = TrainStepWrap(loss_net, parameter_server=bool(config.parameter_server),
-                              cache_enable=bool(config.vocab_cache_size > 0))
-    eval_net = PredictWithSigmoid(WideDeep_net)
+                              cache_enable=(config.vocab_cache_size > 0))
+    eval_net = PredictWithSigmoid(wide_deep_net)
    return train_net, eval_net




@@ -64,7 +62,7 @@ class ModelBuilder():
        return hooks

    def get_net(self, config):
-        return get_WideDeep_net(config)
+        return get_wide_deep_net(config)


def train_and_eval(config):
@@ -82,14 +80,12 @@ def train_and_eval(config):
    else:
        dataset_type = DataType.H5
    parameter_server = bool(config.parameter_server)
-    cache_enable = bool(config.vocab_cache_size > 0)
+    cache_enable = config.vocab_cache_size > 0
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
-                              batch_size=batch_size, rank_id=get_rank(),
-                              rank_size=get_group_size(), data_type=dataset_type)
+                              batch_size=batch_size, data_type=dataset_type)
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
-                             batch_size=batch_size, rank_id=get_rank(),
-                             rank_size=get_group_size(), data_type=dataset_type)
+                             batch_size=batch_size, data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))


@@ -102,15 +98,11 @@ def train_and_eval(config):
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
-
    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
-    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
-                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
-                                 config=ckptconfig)
-    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
-    if get_rank() == 0:
-        callback_list.append(ckpoint_cb)
+    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=config.ckpt_path, config=ckptconfig)
+    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb]

    model.train(epochs, ds_train,
                callbacks=callback_list,
                dataset_sink_mode=(parameter_server and cache_enable))
@@ -120,10 +112,7 @@ if __name__ == "__main__":
    wide_deep_config = WideDeepConfig()
    wide_deep_config.argparse_init()

-    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target)
+    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
    context.set_ps_context(enable_ps=True)
-    init()
-    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
-                                      device_num=get_group_size())

    train_and_eval(wide_deep_config)
