@@ -1,58 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
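# Usage (positional arguments, consumed by the exports below):
#   bash <this_script>.sh RANK_SIZE EPOCH_SIZE DATASET SERVER_NUM SCHED_HOST SCHED_PORT
# Hypothetical example (paths and addresses are illustrative, not from the repo):
#   bash <this_script>.sh 8 15 /path/to/criteo_mindrecord 1 127.0.0.1 8081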
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6
export MS_ROLE=MS_SCHED
for((i=0;i<1;i++));
do
  rm -rf ${execute_path}/sched_$i/
  mkdir ${execute_path}/sched_$i/
  cd ${execute_path}/sched_$i/ || exit
  python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
    --device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
    --vocab_cache_size=300000 >sched_$i.log 2>&1 &
done
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
  rm -rf ${execute_path}/server_$i/
  mkdir ${execute_path}/server_$i/
  cd ${execute_path}/server_$i/ || exit
  python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
    --device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
    --vocab_cache_size=300000 >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
mpirun --allow-run-as-root -n $RANK_SIZE python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
  --device_target='GPU' --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
  --vocab_cache_size=300000 --full_batch=1 --dropout_flag=1 >worker.log 2>&1 &
@@ -1,68 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
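# Usage (positional arguments, consumed by the exports below):
#   bash <this_script>.sh RANK_SIZE EPOCH_SIZE DATASET RANK_TABLE_FILE SERVER_NUM SCHED_HOST SCHED_PORT
# Hypothetical example (paths and addresses are illustrative, not from the repo):
#   bash <this_script>.sh 8 15 /path/to/criteo_mindrecord /path/to/rank_table_8p.json 1 127.0.0.1 8081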
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$5
export MS_SCHED_HOST=$6
export MS_SCHED_PORT=$7
export MS_ROLE=MS_SCHED
for((i=0;i<1;i++));
do
  rm -rf ${execute_path}/sched_$i/
  mkdir ${execute_path}/sched_$i/
  cd ${execute_path}/sched_$i/ || exit
  export RANK_ID=$i
  export DEVICE_ID=$i
  python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
    --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
    --vocab_cache_size=300000 >sched_$i.log 2>&1 &
done
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
  rm -rf ${execute_path}/server_$i/
  mkdir ${execute_path}/server_$i/
  cd ${execute_path}/server_$i/ || exit
  export RANK_ID=$i
  export DEVICE_ID=$i
  python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
    --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
    --vocab_cache_size=300000 >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
for((i=0;i<$MS_WORKER_NUM;i++));
do
  rm -rf ${execute_path}/worker_$i/
  mkdir ${execute_path}/worker_$i/
  cd ${execute_path}/worker_$i/ || exit
  export RANK_ID=$i
  export DEVICE_ID=$i
  python -s ${self_path}/../train_and_eval_parameter_server_cache_distribute.py \
    --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
    --vocab_cache_size=300000 --full_batch=1 --dropout_flag=1 >worker_$i.log 2>&1 &
done
@@ -1,56 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
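# Usage (positional arguments, consumed by the exports below):
#   bash <this_script>.sh EPOCH_SIZE DEVICE_TARGET DATASET SERVER_NUM SCHED_HOST SCHED_PORT
# Hypothetical example (paths and addresses are illustrative, not from the repo):
#   bash <this_script>.sh 15 Ascend /path/to/criteo_mindrecord 1 127.0.0.1 8081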
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export EPOCH_SIZE=$1
export DEVICE_TARGET=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=1
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6
export MS_ROLE=MS_SCHED
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
export DEVICE_ID=0  # the single scheduler process uses device 0 (no loop variable exists here)
python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
  --parameter_server=1 --vocab_cache_size=300000 >sched.log 2>&1 &
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
  rm -rf ${execute_path}/server_$i/
  mkdir ${execute_path}/server_$i/
  cd ${execute_path}/server_$i/ || exit
  export DEVICE_ID=$i
  python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
    --parameter_server=1 --vocab_cache_size=300000 >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
export DEVICE_ID=0  # the single worker process uses device 0 (not the stale server-loop index)
python -s ${self_path}/../train_and_eval_parameter_server.py --epochs=$EPOCH_SIZE --device_target=$DEVICE_TARGET --data_path=$DATASET \
  --parameter_server=1 --vocab_cache_size=300000 \
  --dropout_flag=1 >worker.log 2>&1 &
@@ -1,63 +0,0 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
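# Usage (positional arguments, consumed by the exports below):
#   bash <this_script>.sh RANK_SIZE EPOCH_SIZE DATASET RANK_TABLE_FILE SERVER_NUM SCHED_HOST SCHED_PORT
# Hypothetical example (paths and addresses are illustrative, not from the repo):
#   bash <this_script>.sh 8 15 /path/to/criteo_mindrecord /path/to/rank_table_8p.json 1 127.0.0.1 8081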
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$5
export MS_SCHED_HOST=$6
export MS_SCHED_PORT=$7
export MS_ROLE=MS_SCHED
for((i=0;i<1;i++));
do
  rm -rf ${execute_path}/sched_$i/
  mkdir ${execute_path}/sched_$i/
  cd ${execute_path}/sched_$i/ || exit
  export RANK_ID=$i
  export DEVICE_ID=$i
  python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >sched_$i.log 2>&1 &
done
export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
  rm -rf ${execute_path}/server_$i/
  mkdir ${execute_path}/server_$i/
  cd ${execute_path}/server_$i/ || exit
  export RANK_ID=$i
  export DEVICE_ID=$i
  python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >server_$i.log 2>&1 &
done
export MS_ROLE=MS_WORKER
for((i=0;i<$MS_WORKER_NUM;i++));
do
  rm -rf ${execute_path}/worker_$i/
  mkdir ${execute_path}/worker_$i/
  cd ${execute_path}/worker_$i/ || exit
  export RANK_ID=$i
  export DEVICE_ID=$i
  python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >worker_$i.log 2>&1 &
done
@@ -14,17 +14,17 @@
# limitations under the License.
# ============================================================================
#bash run_parameter_server_train_cluster.sh RANK_SIZE EPOCHS DEVICE_TARGET DATASET
#                                           LOCAL_WORKER_NUM LOCAL_SERVER_NUM SERVER_NUM
#                                           SCHED_HOST SCHED_PORT ROLE RANK_TABLE_FILE VOCAB_CACHE_SIZE
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
#bash run_parameter_server_train_cluster.sh RANK_SIZE EPOCHS DATASET RANK_TABLE_FILE
#                                           LOCAL_WORKER_NUM LOCAL_SERVER_NUM SERVER_NUM
#                                           SCHED_HOST SCHED_PORT ROLE
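# Hypothetical example for the revised argument order (RANK_SIZE EPOCHS
# DEVICE_TARGET DATASET LOCAL_WORKER_NUM LOCAL_SERVER_NUM SERVER_NUM SCHED_HOST
# SCHED_PORT ROLE RANK_TABLE_FILE VOCAB_CACHE_SIZE); all values are illustrative
# only. The same command line is repeated on every node, changing only ROLE:
#   bash run_parameter_server_train_cluster.sh 8 15 Ascend /data/criteo 8 8 8 10.0.0.1 8081 MS_SCHED /data/rank_table.json 300000
# with MS_PSERVER on the server nodes and MS_WORKER on the worker nodes.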
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export DEVICE_TARGET=$3
export DATASET=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
@@ -35,41 +35,56 @@ export MS_SERVER_NUM=$7
export MS_SCHED_HOST=$8
export MS_SCHED_PORT=$9
export MS_ROLE=${10}
echo "=====Role is $MS_ROLE======"
export RANK_TABLE_FILE=${11}
export VOCAB_CACHE_SIZE=${12}
if [[ ! -n "${12}" ]]; then
  export VOCAB_CACHE_SIZE=0
fi
echo "=====Role is $MS_ROLE======"
if [ "$MS_ROLE" == "MS_SCHED" ];then
  for((i=0;i<1;i++));
  do
    rm -rf ${execute_path}/sched_$i/
    mkdir ${execute_path}/sched_$i/
    cd ${execute_path}/sched_$i/ || exit
    export RANK_ID=$i
    export DEVICE_ID=$i
    python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >sched_$i.log 2>&1 &
  done
if [[ "$MS_ROLE" == "MS_SCHED" ]]; then
  rm -rf ${execute_path}/sched/
  mkdir ${execute_path}/sched/
  cd ${execute_path}/sched/ || exit
  python -s ${self_path}/../train_and_eval_parameter_server_distribute.py --device_target=$DEVICE_TARGET \
    --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
    --vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &
fi
if [ "$MS_ROLE" == "MS_PSERVER" ];then
  for((i=0;i<$LOCAL_SERVER_NUM;i++));
  do
    rm -rf ${execute_path}/server_$i/
    mkdir ${execute_path}/server_$i/
    cd ${execute_path}/server_$i/ || exit
    export RANK_ID=$i
    export DEVICE_ID=$i
    python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >server_$i.log 2>&1 &
  done
if [[ "$MS_ROLE" == "MS_PSERVER" ]]; then
  for((i=0;i<$LOCAL_SERVER_NUM;i++));
  do
    rm -rf ${execute_path}/server_$i/
    mkdir ${execute_path}/server_$i/
    cd ${execute_path}/server_$i/ || exit
    python -s ${self_path}/../train_and_eval_parameter_server_distribute.py --device_target=$DEVICE_TARGET \
      --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
      --vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
  done
fi
if [ "$MS_ROLE" == "MS_WORKER" ];then
  for((i=0;i<$LOCAL_WORKER_NUM;i++));
  do
    rm -rf ${execute_path}/worker_$i/
    mkdir ${execute_path}/worker_$i/
    cd ${execute_path}/worker_$i/ || exit
    export RANK_ID=$i
    export DEVICE_ID=$i
    python -s ${self_path}/../train_and_eval_parameter_server.py --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 >worker_$i.log 2>&1 &
  done
if [[ "$MS_ROLE" == "MS_WORKER" ]]; then
  if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
    rm -rf ${execute_path}/worker/
    mkdir ${execute_path}/worker/
    cd ${execute_path}/worker/ || exit
    mpirun --allow-run-as-root -n $LOCAL_WORKER_NUM \
      python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
      --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
      --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
  else
    for((i=0;i<$LOCAL_WORKER_NUM;i++));
    do
      rm -rf ${execute_path}/worker_$i/
      mkdir ${execute_path}/worker_$i/
      cd ${execute_path}/worker_$i/ || exit
      export RANK_ID=$i
      export DEVICE_ID=$i
      python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
        --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
        --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker_$i.log 2>&1 &
    done
  fi
fi
@@ -0,0 +1,82 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
#bash run_parameter_server_train_distribute.sh RANK_SIZE EPOCHS DEVICE_TARGET DATASET
#                                              SERVER_NUM SCHED_HOST SCHED_PORT RANK_TABLE_FILE
#                                              VOCAB_CACHE_SIZE
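# Hypothetical example (paths and addresses are illustrative, not from the
# repo): 8 workers, 15 epochs on GPU; RANK_TABLE_FILE is only consumed on
# Ascend, and VOCAB_CACHE_SIZE defaults to 0 when omitted:
#   bash run_parameter_server_train_distribute.sh 8 15 GPU /data/criteo_mindrecord 1 127.0.0.1 8081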
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DEVICE_TARGET=$3
export DATASET=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$5
export MS_SCHED_HOST=$6
export MS_SCHED_PORT=$7
export RANK_TABLE_FILE=$8
export VOCAB_CACHE_SIZE=$9
if [[ ! -n "$9" ]]; then
  export VOCAB_CACHE_SIZE=0
fi

export MS_ROLE=MS_SCHED
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
  --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
  --vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &

export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
  rm -rf ${execute_path}/server_$i/
  mkdir ${execute_path}/server_$i/
  cd ${execute_path}/server_$i/ || exit
  python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
    --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
    --vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
done

export MS_ROLE=MS_WORKER
if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
  rm -rf ${execute_path}/worker/
  mkdir ${execute_path}/worker/
  cd ${execute_path}/worker/ || exit
  mpirun --allow-run-as-root -n $RANK_SIZE \
    python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
    --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
    --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
else
  for((i=0;i<$MS_WORKER_NUM;i++));
  do
    rm -rf ${execute_path}/worker_$i/
    mkdir ${execute_path}/worker_$i/
    cd ${execute_path}/worker_$i/ || exit
    export RANK_ID=$i
    export DEVICE_ID=$i
    python -s ${self_path}/../train_and_eval_parameter_server_distribute.py \
      --device_target=$DEVICE_TARGET --data_path=$DATASET --epochs=$EPOCH_SIZE --parameter_server=1 \
      --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker_$i.log 2>&1 &
  done
fi
@@ -0,0 +1,79 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
#bash run_parameter_server_train_standalone.sh EPOCHS DEVICE_TARGET DATASET SERVER_NUM SCHED_HOST
#                                              SCHED_PORT DEVICE_ID VOCAB_CACHE_SIZE
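# Hypothetical example (paths and addresses are illustrative, not from the
# repo): 15 epochs on GPU device 0, one server, a 300000-row embedding cache:
#   bash run_parameter_server_train_standalone.sh 15 GPU /data/criteo_mindrecord 1 127.0.0.1 8081 0 300000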
execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
export EPOCH_SIZE=$1
export DEVICE_TARGET=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=1
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6
DEVICE_ID=$7
export VOCAB_CACHE_SIZE=$8
if [[ ! -n "$8" ]]; then
  export VOCAB_CACHE_SIZE=0
fi

# Set device id
if [[ "X$DEVICE_TARGET" == "XGPU" ]]; then
  if [[ ! -n "$DEVICE_ID" ]]; then
    export CUDA_VISIBLE_DEVICES=0
  else
    export CUDA_VISIBLE_DEVICES=$DEVICE_ID
  fi
else
  if [[ ! -n "$DEVICE_ID" ]]; then
    export DEVICE_ID=0
  else
    export DEVICE_ID=$DEVICE_ID
  fi
fi

export MS_ROLE=MS_SCHED
rm -rf ${execute_path}/sched/
mkdir ${execute_path}/sched/
cd ${execute_path}/sched/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
  --epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
  --vocab_cache_size=$VOCAB_CACHE_SIZE >sched.log 2>&1 &

export MS_ROLE=MS_PSERVER
for((i=0;i<$MS_SERVER_NUM;i++));
do
  rm -rf ${execute_path}/server_$i/
  mkdir ${execute_path}/server_$i/
  cd ${execute_path}/server_$i/ || exit
  python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
    --epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
    --vocab_cache_size=$VOCAB_CACHE_SIZE >server_$i.log 2>&1 &
done

export MS_ROLE=MS_WORKER
rm -rf ${execute_path}/worker/
mkdir ${execute_path}/worker/
cd ${execute_path}/worker/ || exit
python -s ${self_path}/../train_and_eval_parameter_server_standalone.py --device_target=$DEVICE_TARGET \
  --epochs=$EPOCH_SIZE --data_path=$DATASET --parameter_server=1 \
  --vocab_cache_size=$VOCAB_CACHE_SIZE --dropout_flag=1 >worker.log 2>&1 &
@@ -238,6 +238,8 @@ class WideDeepModel(nn.Cell):
        elif parameter_server:
            cache_enable = self.vocab_cache_size > 0
            target = 'DEVICE' if cache_enable else 'CPU'
            if not cache_enable:
                sparse = True
            if is_auto_parallel and config.full_batch and cache_enable:
                self.deep_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, self.emb_dim, target=target,
                                                               slice_mode=nn.EmbeddingLookup.TABLE_ROW_SLICE,
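# A minimal sketch (illustration only, not part of the patch) of what the hunk
# above decides in parameter-server mode. The helper name is hypothetical; the
# model inlines this logic:
def ps_embedding_settings(vocab_cache_size, sparse):
    cache_enable = vocab_cache_size > 0           # cache embedding rows on the device?
    target = 'DEVICE' if cache_enable else 'CPU'  # where the lookup table lives
    if not cache_enable:
        sparse = True                             # no cache: force sparse lookup, as the new lines do
    return cache_enable, target, sparse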
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train_multinpu."""
"""Distributed training and evaluation with a parameter server."""
import os
@@ -23,6 +23,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMoni
from mindspore.context import ParallelMode
from mindspore.communication.management import get_rank, get_group_size, init
from mindspore.nn.wrap.cell_wrapper import VirtualDatasetCellTriple
from mindspore.common import set_seed
from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
from src.callbacks import LossCallBack, EvalCallBack
@@ -32,18 +33,19 @@ from src.config import WideDeepConfig
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


def get_WideDeep_net(config):
def get_wide_deep_net(config):
    """
    Get network of wide&deep model.
    """
    WideDeep_net = WideDeepModel(config)
    loss_net = NetWithLossClass(WideDeep_net, config)
    loss_net = VirtualDatasetCellTriple(loss_net)
    wide_deep_net = WideDeepModel(config)
    loss_net = NetWithLossClass(wide_deep_net, config)
    if cache_enable:
        loss_net = VirtualDatasetCellTriple(loss_net)
    train_net = TrainStepWrap(loss_net, parameter_server=bool(config.parameter_server),
                              cache_enable=bool(config.vocab_cache_size > 0))
    eval_net = PredictWithSigmoid(WideDeep_net)
    eval_net = VirtualDatasetCellTriple(eval_net)
                              cache_enable=(config.vocab_cache_size > 0))
    eval_net = PredictWithSigmoid(wide_deep_net)
    if cache_enable:
        eval_net = VirtualDatasetCellTriple(eval_net)
    return train_net, eval_net
@@ -51,7 +53,6 @@ class ModelBuilder():
    """
    ModelBuilder
    """

    def __init__(self):
        pass
@@ -67,13 +68,14 @@ class ModelBuilder():
        return hooks

    def get_net(self, config):
        return get_WideDeep_net(config)
        return get_wide_deep_net(config)


def train_and_eval(config):
    """
    test_train_eval
    """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
@@ -83,6 +85,9 @@ def train_and_eval(config):
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    parameter_server = bool(config.parameter_server)
    if cache_enable:
        config.full_batch = True
    print("epochs is {}".format(epochs))
    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
@@ -107,35 +112,46 @@ def train_and_eval(config):
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(
        model, ds_eval, auc_metric, config)
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config, per_print_times=20)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
                                  keep_checkpoint_max=5, integrated_save=False)
    callback = LossCallBack(config=config)
    if cache_enable:
        ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
                                      keep_checkpoint_max=5, integrated_save=False)
    else:
        ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/', config=ckptconfig)
    context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
    callback_list = [TimeMonitor(
        ds_train.get_dataset_size()), eval_callback, callback]
    callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train, callbacks=callback_list, dataset_sink_mode=True)
                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
                                 config=ckptconfig)
    if cache_enable:
        context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
    if get_rank() == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train,
                callbacks=callback_list,
                dataset_sink_mode=bool(parameter_server and cache_enable))


if __name__ == "__main__":
    wide_deep_config = WideDeepConfig()
    wide_deep_config.argparse_init()
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=wide_deep_config.device_target, save_graphs=True)
    context.set_context(variable_memory_max_size="24GB")
    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
    cache_enable = wide_deep_config.vocab_cache_size > 0
    if cache_enable and wide_deep_config.device_target != "GPU":
        context.set_context(variable_memory_max_size="24GB")
    context.set_context(enable_sparse=True)
    context.set_ps_context(enable_ps=True)
    init()
    context.set_context(save_graphs_path='./graphs_of_device_id_'+str(get_rank()))
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True)
    if cache_enable:
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.AUTO_PARALLEL, gradients_mean=True)
    else:
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                          device_num=get_group_size())
    train_and_eval(wide_deep_config)
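# A minimal sketch (illustration only, not part of the patch): every branch the
# revised distribute script takes hangs off the same cache predicate. The helper
# name below is hypothetical; the script inlines each decision where needed.
def ps_run_mode(vocab_cache_size):
    cache_enable = vocab_cache_size > 0
    return {
        'full_batch': cache_enable,            # forced on when the embedding cache is enabled
        'parallel_mode': 'AUTO_PARALLEL' if cache_enable else 'DATA_PARALLEL',
        'virtual_dataset_wrap': cache_enable,  # VirtualDatasetCellTriple only with the cache
        'dataset_sink_mode': cache_enable,     # sinking only for cached parameter-server runs
    }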
@@ -12,15 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train_multinpu."""
"""Standalone training and evaluation with a parameter server."""
import os
import sys
from mindspore import Model, context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
from mindspore.context import ParallelMode
from mindspore.communication.management import get_rank, get_group_size, init
from mindspore.common import set_seed
from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
@@ -33,15 +31,15 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
context.set_context(enable_sparse=True)


def get_WideDeep_net(config):
def get_wide_deep_net(config):
    """
    Get network of wide&deep model.
    """
    WideDeep_net = WideDeepModel(config)
    loss_net = NetWithLossClass(WideDeep_net, config)
    wide_deep_net = WideDeepModel(config)
    loss_net = NetWithLossClass(wide_deep_net, config)
    train_net = TrainStepWrap(loss_net, parameter_server=bool(config.parameter_server),
                              cache_enable=bool(config.vocab_cache_size > 0))
    eval_net = PredictWithSigmoid(WideDeep_net)
                              cache_enable=(config.vocab_cache_size > 0))
    eval_net = PredictWithSigmoid(wide_deep_net)
    return train_net, eval_net
@@ -64,7 +62,7 @@ class ModelBuilder():
        return hooks

    def get_net(self, config):
        return get_WideDeep_net(config)
        return get_wide_deep_net(config)


def train_and_eval(config):
@@ -82,14 +80,12 @@ def train_and_eval(config):
    else:
        dataset_type = DataType.H5
    parameter_server = bool(config.parameter_server)
    cache_enable = bool(config.vocab_cache_size > 0)
    cache_enable = config.vocab_cache_size > 0
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=batch_size, rank_id=get_rank(),
                              rank_size=get_group_size(), data_type=dataset_type)
                              batch_size=batch_size, data_type=dataset_type)
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, rank_id=get_rank(),
                             rank_size=get_group_size(), data_type=dataset_type)
                             batch_size=batch_size, data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))
@@ -102,15 +98,11 @@ def train_and_eval(config):
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
                                 config=ckptconfig)
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
    if get_rank() == 0:
        callback_list.append(ckpoint_cb)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=config.ckpt_path, config=ckptconfig)
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb]
    model.train(epochs, ds_train,
                callbacks=callback_list,
                dataset_sink_mode=(parameter_server and cache_enable))
@@ -120,10 +112,7 @@ if __name__ == "__main__":
    wide_deep_config = WideDeepConfig()
    wide_deep_config.argparse_init()
    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target)
    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
    context.set_ps_context(enable_ps=True)
    init()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                      device_num=get_group_size())
    train_and_eval(wide_deep_config)