| @@ -0,0 +1,58 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
# Launch a parameter-server training job for
# train_and_eval_parameter_server_cache_distribute.py on GPU: one scheduler,
# SERVER_NUM servers and an mpirun-managed group of RANK_SIZE workers.
#
# Usage:
#   bash <this script> RANK_SIZE EPOCH_SIZE DATASET SERVER_NUM SCHED_HOST SCHED_PORT
#
# Every process is started in the background from its own freshly created
# directory under the caller's working directory (sched_<i>/, server_<i>/,
# worker/) and writes stdout and stderr to a log file inside that directory.

if (( $# < 6 )); then
  printf 'Usage: bash %s RANK_SIZE EPOCH_SIZE DATASET SERVER_NUM SCHED_HOST SCHED_PORT\n' \
    "${0##*/}" >&2
  exit 1
fi

# RANK_SIZE feeds `mpirun -n` and SERVER_NUM bounds an arithmetic loop, so
# reject anything that is not a plain non-negative integer up front.
for count_arg in "$1" "$4"; do
  if ! [[ "${count_arg}" =~ ^[0-9]+$ ]]; then
    printf 'error: RANK_SIZE and SERVER_NUM must be non-negative integers, got "%s"\n' \
      "${count_arg}" >&2
    exit 1
  fi
done

execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
train_script="${self_path}/../train_and_eval_parameter_server_cache_distribute.py"

export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6

# Command-line options shared by the scheduler, the servers and the workers.
common_args=(
  --device_target='GPU'
  "--data_path=${DATASET}"
  "--epochs=${EPOCH_SIZE}"
  --parameter_server=1
  --vocab_cache_size=300000
)

#######################################
# Recreate a directory from scratch and make it the working directory.
# Arguments: $1 - directory path
# Returns:   non-zero if the directory cannot be created or entered
#######################################
enter_fresh_dir() {
  local dir=$1
  rm -rf -- "${dir:?}"
  mkdir -- "${dir}" && cd -- "${dir}"
}

export MS_ROLE=MS_SCHED
for ((i = 0; i < MS_SCHED_NUM; i++)); do
  enter_fresh_dir "${execute_path}/sched_${i}" || exit 1
  python -s "${train_script}" "${common_args[@]}" >"sched_${i}.log" 2>&1 &
done

export MS_ROLE=MS_PSERVER
for ((i = 0; i < MS_SERVER_NUM; i++)); do
  enter_fresh_dir "${execute_path}/server_${i}" || exit 1
  python -s "${train_script}" "${common_args[@]}" >"server_${i}.log" 2>&1 &
done

# All workers are spawned by a single mpirun and share one directory and log.
export MS_ROLE=MS_WORKER
enter_fresh_dir "${execute_path}/worker" || exit 1
mpirun --allow-run-as-root -n "${RANK_SIZE}" \
  python -s "${train_script}" "${common_args[@]}" \
  --full_batch=1 --dropout_flag=1 >worker.log 2>&1 &
| @@ -0,0 +1,68 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
# Launch a parameter-server training job for
# train_and_eval_parameter_server_cache_distribute.py with one Python process
# per role instance: one scheduler, SERVER_NUM servers and RANK_SIZE workers.
#
# Usage:
#   bash <this script> RANK_SIZE EPOCH_SIZE DATASET RANK_TABLE_FILE \
#        SERVER_NUM SCHED_HOST SCHED_PORT
#
# Each process runs in the background from a freshly created directory under
# the caller's working directory (sched_<i>/, server_<i>/, worker_<i>/), with
# RANK_ID and DEVICE_ID both set to <i>, and logs to <name>_<i>.log there.

if (( $# < 7 )); then
  printf 'Usage: bash %s RANK_SIZE EPOCH_SIZE DATASET RANK_TABLE_FILE SERVER_NUM SCHED_HOST SCHED_PORT\n' \
    "${0##*/}" >&2
  exit 1
fi

# Both counts bound arithmetic loops; an empty or non-numeric value would
# otherwise surface as an obscure arithmetic syntax error mid-launch.
for count_arg in "$1" "$5"; do
  if ! [[ "${count_arg}" =~ ^[0-9]+$ ]]; then
    printf 'error: RANK_SIZE and SERVER_NUM must be non-negative integers, got "%s"\n' \
      "${count_arg}" >&2
    exit 1
  fi
done

execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
train_script="${self_path}/../train_and_eval_parameter_server_cache_distribute.py"

export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=$RANK_SIZE
export MS_SERVER_NUM=$5
export MS_SCHED_HOST=$6
export MS_SCHED_PORT=$7

#######################################
# Start every instance of one role in the background.
# Globals:   execute_path, train_script, DATASET, EPOCH_SIZE (read);
#            MS_ROLE, RANK_ID, DEVICE_ID (exported)
# Arguments: $1 - value exported as MS_ROLE
#            $2 - directory/log name prefix (sched, server, worker)
#            $3 - number of instances to start
#            $4... - extra options appended to the training command
# Returns:   exits the script if a working directory cannot be prepared
#######################################
launch_group() {
  local role=$1 prefix=$2 count=$3
  shift 3
  local i work_dir

  export MS_ROLE=${role}
  for ((i = 0; i < count; i++)); do
    work_dir="${execute_path}/${prefix}_${i}"
    rm -rf -- "${work_dir:?}"
    mkdir -- "${work_dir}" || exit 1
    cd -- "${work_dir}" || exit 1
    export RANK_ID=$i
    export DEVICE_ID=$i
    python -s "${train_script}" \
      "--data_path=${DATASET}" "--epochs=${EPOCH_SIZE}" --parameter_server=1 \
      --vocab_cache_size=300000 "$@" >"${prefix}_${i}.log" 2>&1 &
  done
}

launch_group MS_SCHED sched "${MS_SCHED_NUM}"
launch_group MS_PSERVER server "${MS_SERVER_NUM}"
launch_group MS_WORKER worker "${MS_WORKER_NUM}" --full_batch=1 --dropout_flag=1
| @@ -0,0 +1,56 @@ | |||
| #!/bin/bash | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
# Launch a single-worker parameter-server training job for
# train_and_eval_parameter_server.py: one scheduler, SERVER_NUM servers and
# one worker.
#
# Usage:
#   bash <this script> EPOCH_SIZE DEVICE_TARGET DATASET SERVER_NUM SCHED_HOST SCHED_PORT
#
# Each process runs in the background from a freshly created directory under
# the caller's working directory (sched/, server_<i>/, worker/) and logs to
# <name>.log inside that directory.

if (( $# < 6 )); then
  printf 'Usage: bash %s EPOCH_SIZE DEVICE_TARGET DATASET SERVER_NUM SCHED_HOST SCHED_PORT\n' \
    "${0##*/}" >&2
  exit 1
fi

# SERVER_NUM bounds an arithmetic loop and doubles as the worker's DEVICE_ID.
if ! [[ "$4" =~ ^[0-9]+$ ]]; then
  printf 'error: SERVER_NUM must be a non-negative integer, got "%s"\n' "$4" >&2
  exit 1
fi

execute_path=$(pwd)
script_self=$(readlink -f "$0")
self_path=$(dirname "${script_self}")
train_script="${self_path}/../train_and_eval_parameter_server.py"

export EPOCH_SIZE=$1
export DEVICE_TARGET=$2
export DATASET=$3
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
export MS_WORKER_NUM=1
export MS_SERVER_NUM=$4
export MS_SCHED_HOST=$5
export MS_SCHED_PORT=$6

#######################################
# Start one training process in the background from a fresh directory.
# Globals:   execute_path, train_script, EPOCH_SIZE, DEVICE_TARGET, DATASET
#            (read); DEVICE_ID (exported)
# Arguments: $1 - directory and log base name
#            $2 - value exported as DEVICE_ID for the process
#            $3... - extra options appended to the training command
# Returns:   exits the script if the working directory cannot be prepared
#######################################
start_process() {
  local name=$1 device_id=$2
  shift 2
  local work_dir="${execute_path}/${name}"

  rm -rf -- "${work_dir:?}"
  mkdir -- "${work_dir}" || exit 1
  cd -- "${work_dir}" || exit 1
  export DEVICE_ID=${device_id}
  python -s "${train_script}" \
    "--epochs=${EPOCH_SIZE}" "--device_target=${DEVICE_TARGET}" "--data_path=${DATASET}" \
    --parameter_server=1 --vocab_cache_size=300000 "$@" >"${name}.log" 2>&1 &
}

# The scheduler used to inherit DEVICE_ID from a loop variable that had not
# been assigned yet, i.e. an empty string; give it an explicit device 0.
# NOTE(review): assumes the scheduler does not need a dedicated device - confirm.
export MS_ROLE=MS_SCHED
start_process sched 0

export MS_ROLE=MS_PSERVER
for ((i = 0; i < MS_SERVER_NUM; i++)); do
  start_process "server_${i}" "${i}"
done

# Servers take device ids 0..SERVER_NUM-1; the worker gets the next id
# (the value the leftover loop counter supplied implicitly before).
export MS_ROLE=MS_WORKER
start_process worker "${MS_SERVER_NUM}" --dropout_flag=1
| @@ -0,0 +1,141 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| """train_multinpu.""" | |||
| import os | |||
| import sys | |||
| import mindspore.dataset.engine as de | |||
| from mindspore import Model, context | |||
| from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor | |||
| from mindspore.context import ParallelMode | |||
| from mindspore.communication.management import get_rank, get_group_size, init | |||
| from mindspore.nn.wrap.cell_wrapper import VirtualDatasetCellTriple | |||
| from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel | |||
| from src.callbacks import LossCallBack, EvalCallBack | |||
| from src.datasets import create_dataset, DataType | |||
| from src.metrics import AUCMetric | |||
| from src.config import WideDeepConfig | |||
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |||
def get_WideDeep_net(config):
    """
    Build the training and evaluation networks of the wide&deep model.

    Args:
        config: configuration object. It is passed to ``WideDeepModel`` and
            ``NetWithLossClass``; ``config.parameter_server`` and
            ``config.vocab_cache_size`` are read here to configure
            ``TrainStepWrap``.

    Returns:
        tuple: ``(train_net, eval_net)``. Both share one ``WideDeepModel``
        instance; the loss network and the prediction network are each
        wrapped in ``VirtualDatasetCellTriple``.
    """
    backbone = WideDeepModel(config)

    # Training path: backbone -> loss -> virtual dataset -> train step.
    net_with_loss = VirtualDatasetCellTriple(NetWithLossClass(backbone, config))
    use_parameter_server = bool(config.parameter_server)
    use_vocab_cache = bool(config.vocab_cache_size > 0)
    train_net = TrainStepWrap(net_with_loss,
                              parameter_server=use_parameter_server,
                              cache_enable=use_vocab_cache)

    # Evaluation path: the same backbone -> sigmoid prediction -> virtual dataset.
    eval_net = VirtualDatasetCellTriple(PredictWithSigmoid(backbone))
    return train_net, eval_net
class ModelBuilder():
    """
    Builder that hands out the wide&deep networks and the training hooks.
    """
    def __init__(self):
        pass

    def get_hook(self):
        """Placeholder; no hooks are provided and None is returned."""

    def get_train_hook(self):
        """
        Return the list of callbacks used during training.

        Returns:
            list: a single default-constructed ``LossCallBack``.
        """
        # DEVICE_ID is deliberately not read here: no per-device hook is
        # registered, and parsing the variable only made this method raise
        # when DEVICE_ID was unset or empty.
        return [LossCallBack()]

    def get_net(self, config):
        """
        Return the result of ``get_WideDeep_net(config)``.

        Args:
            config: configuration object forwarded unchanged.
        """
        return get_WideDeep_net(config)
def train_and_eval(config):
    """
    Create the datasets and networks, then run training with evaluation,
    loss-reporting, timing and checkpoint callbacks.

    Args:
        config: configuration object. The attributes read here are
            ``data_path``, ``batch_size``, ``epochs``, ``dataset_type``,
            ``full_batch``, ``ckpt_path`` and ``stra_ckpt``; the object is
            also passed on to the network builder and to the callbacks.
    """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs

    # Anything other than "tfrecord" / "mindrecord" falls back to H5.
    dataset_type = {
        "tfrecord": DataType.TFRECORD,
        "mindrecord": DataType.MINDRECORD,
    }.get(config.dataset_type, DataType.H5)
    print("epochs is {}".format(epochs))

    use_full_batch = bool(config.full_batch)

    def _make_dataset(train_mode):
        """Create the train or eval dataset for the selected batching mode."""
        if use_full_batch:
            # Full batch: the batch is scaled by the group size and no
            # rank_id / rank_size is passed.
            return create_dataset(data_path, train_mode=train_mode, epochs=1,
                                  batch_size=batch_size*get_group_size(),
                                  data_type=dataset_type)
        return create_dataset(data_path, train_mode=train_mode, epochs=1,
                              batch_size=batch_size, rank_id=get_rank(),
                              rank_size=get_group_size(), data_type=dataset_type)

    if use_full_batch:
        context.set_auto_parallel_context(full_batch=True)
        de.config.set_seed(1)
    ds_train = _make_dataset(True)
    ds_eval = _make_dataset(False)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_cb = EvalCallBack(model, ds_eval, auc_metric, config)
    loss_cb = LossCallBack(config=config, per_print_times=20)

    # save_checkpoint_steps spans all epochs' worth of steps.
    ckpt_cfg = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
                                keep_checkpoint_max=5, integrated_save=False)
    ckpt_dir = '{}/ckpt_{}/'.format(config.ckpt_path, get_rank())
    ckpt_cb = ModelCheckpoint(prefix='widedeep_train', directory=ckpt_dir, config=ckpt_cfg)
    context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)

    callbacks = [TimeMonitor(ds_train.get_dataset_size()), eval_cb, loss_cb, ckpt_cb]
    model.train(epochs, ds_train, callbacks=callbacks, dataset_sink_mode=True)
if __name__ == "__main__":
    # Read the run configuration from the command line.
    wide_deep_config = WideDeepConfig()
    wide_deep_config.argparse_init()

    # Execution context: graph mode on the requested device, with graph
    # dumping switched on. The calls are kept separate and in this order.
    base_options = dict(mode=context.GRAPH_MODE,
                        device_target=wide_deep_config.device_target,
                        save_graphs=True)
    context.set_context(**base_options)
    for extra_option in ({"variable_memory_max_size": "24GB"},
                         {"enable_sparse": True}):
        context.set_context(**extra_option)
    context.set_ps_context(enable_ps=True)

    # get_rank() is only called after init(); it names the graph dump directory.
    init()
    graphs_dir = './graphs_of_device_id_{}'.format(get_rank())
    context.set_context(save_graphs_path=graphs_dir)
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      gradients_mean=True)

    train_and_eval(wide_deep_config)