Browse Source

support startup bert_thor script with relative path

tags/v1.0.0
wangmin 5 years ago
parent
commit
d63b1f16bd
5 changed files with 11 additions and 5 deletions
  1. +2
    -2
      model_zoo/official/nlp/bert_thor/README.md
  2. +0
    -2
      model_zoo/official/nlp/bert_thor/pretrain_eval.py
  3. +4
    -0
      model_zoo/official/nlp/bert_thor/scripts/run_distribute_pretrain.sh
  4. +4
    -0
      model_zoo/official/nlp/bert_thor/scripts/run_standalone_pretrain.sh
  5. +1
    -1
      model_zoo/official/nlp/bert_thor/src/lr_generator.py

+ 2
- 2
model_zoo/official/nlp/bert_thor/README.md View File

@@ -128,12 +128,12 @@ Parameters for both training and inference can be set in config.py.
```
sh run_distribute_pretrain.sh [DEVICE_NUM] [EPOCH_SIZE] [DATA_DIR] [SCHEMA_DIR] [RANK_TABLE_FILE]
```
We need three parameters for this script.
We need five parameters for this script.
- `DEVICE_NUM`: the device number for distributed train.
- `EPOCH_SIZE`: Epoch size used in the model
- `DATA_DIR`: Data path; an absolute path is recommended.
- `SCHEMA_DIR`: Schema path; an absolute path is recommended.
- `RANK_TABLE_FILE`: the path of rank_table.json
- `RANK_TABLE_FILE`: rank table file in JSON format
Training results are stored in the current path, in a folder whose name begins with the file name that the user defines. In that folder you can find the checkpoint files, together with results like the following in the log.
```


+ 0
- 2
model_zoo/official/nlp/bert_thor/pretrain_eval.py View File

@@ -153,10 +153,8 @@ def MLM_eval():
net = Model(net_for_pretraining, eval_network=net_for_pretraining, eval_indexes=[0, 1, 2],
metrics={'name': myMetric()})
res = net.eval(dataset, dataset_sink_mode=False)
print("==============================================================")
for _, v in res.items():
print("Accuracy is: ", v)
print("==============================================================")


if __name__ == "__main__":


+ 4
- 0
model_zoo/official/nlp/bert_thor/scripts/run_distribute_pretrain.sh View File

@@ -25,6 +25,9 @@ EPOCH_SIZE=$2
DATA_DIR=$3
SCHEMA_DIR=$4

BASE_PATH=$(cd "`dirname $0`" || exit; pwd)
cd $BASE_PATH/ || exit

ulimit -u unlimited
export RANK_TABLE_FILE=$5
export RANK_SIZE=$1
@@ -55,6 +58,7 @@ do
--load_checkpoint_path="" \
--save_checkpoint_path='./' \
--save_checkpoint_steps=1000 \
--train_steps=3000 \
--save_checkpoint_num=30 \
--data_dir=$DATA_DIR \
--schema_dir=$SCHEMA_DIR > log.txt 2>&1 &


+ 4
- 0
model_zoo/official/nlp/bert_thor/scripts/run_standalone_pretrain.sh View File

@@ -24,6 +24,9 @@ EPOCH_SIZE=$2
DATA_DIR=$3
SCHEMA_DIR=$4

BASE_PATH=$(cd "`dirname $0`" || exit; pwd)
cd $BASE_PATH/ || exit

ulimit -u unlimited
export DEVICE_ID=$1
export RANK_SIZE=1
@@ -51,6 +54,7 @@ python run_pretrain.py \
--load_checkpoint_path="" \
--save_checkpoint_path='./' \
--save_checkpoint_steps=5000 \
--train_steps=-1 \
--save_checkpoint_num=20 \
--data_dir=$DATA_DIR \
--schema_dir=$SCHEMA_DIR > log.txt 2>&1 &


+ 1
- 1
model_zoo/official/nlp/bert_thor/src/lr_generator.py View File

@@ -55,7 +55,7 @@ def get_poly_lr(global_step, lr_init, lr_end, lr_max, warmup_steps, total_steps,
return learning_rate


# bert kfac hyperparam setting
# bert thor hyperparam setting
def get_bert_lr():
learning_rate = Tensor(
get_poly_lr(global_step=0, lr_init=0.0, lr_end=1e-6, lr_max=3.1e-3, warmup_steps=0, total_steps=30000,


Loading…
Cancel
Save