# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# Url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# Path for local
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path"
device_target: "Ascend"
enable_profiling: False

# ==============================================================================
description: 'run_pretrain'
distribute: 'false'
epoch_size: 40
device_id: 0
device_num: 1
enable_save_ckpt: 'true'
enable_lossscale: 'false'
do_shuffle: 'true'
enable_data_sink: 'true'
data_sink_steps: 100
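# With accumulation_steps > 1, gradients are presumably accumulated over that many steps
# before each weight update, so the effective global batch size is roughly
# batch_size * accumulation_steps * device_num.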
accumulation_steps: 1
allreduce_post_accumulation: 'true'
save_checkpoint_path: ''
load_checkpoint_path: ''
save_checkpoint_steps: 500
train_steps: 2500
save_checkpoint_num: 5
data_dir: ''
schema_dir: ''

# ==============================================================================
# pretrain related
batch_size: 20
# Available: [base, nezha, large, large_acc]
bert_network: 'large_acc'
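# Dynamic loss scale settings (only active when enable_lossscale is 'true'): training
# presumably starts at loss_scale_value, and the scale is increased by scale_factor after
# every scale_window overflow-free steps and reduced on overflow.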
loss_scale_value: 65536
scale_factor: 2
scale_window: 1000
optimizer: 'Thor'
enable_global_norm: False
# pretrain_eval related
data_file: ""
schema_file: null
finetune_ckpt: ""
# optimizer related
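# Each block below holds the hyperparameters for one optimizer; presumably only the block
# matching the 'optimizer' key above is read at runtime.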
AdamWeightDecay:
    learning_rate: 0.00003 # 3e-5
    end_learning_rate: 0.0
    power: 5.0
    weight_decay: 0.00001 # 1e-5
    decay_filter: ['layernorm', 'bias']
    eps: 0.000001 # 1e-6
    warmup_steps: 10000

Lamb:
    learning_rate: 0.0003 # 3e-4
    end_learning_rate: 0.0
    power: 2.0
    warmup_steps: 10000
    weight_decay: 0.01
    decay_filter: ['layernorm', 'bias']
    eps: 0.00000001 # 1e-8

Momentum:
    learning_rate: 0.00002 # 2e-5
    momentum: 0.9
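
# THOR is a second-order optimizer: lr and damping presumably decay from *_max to *_min
# following a polynomial schedule over *_total_steps, and 'frequency' appears to control
# how often the second-order (Fisher) information is refreshed.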
Thor:
    lr_max: 0.006464
    lr_min: 0.000001 # 1e-6
    lr_power: 2.0
    lr_total_steps: 30000
    damping_max: 0.007035
    damping_min: 0.000001 # 1e-6
    damping_power: 4.0
    damping_total_steps: 30000
    momentum: 0.9
    weight_decay: 0.00001 # 1e-5
    loss_scale: 1024.0
    frequency: 100
# ==============================================================================
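# Per-network settings: presumably only the *_batch_size / *_net_cfg pair selected by
# 'bert_network' above is used for pretraining.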
# base
base_batch_size: 256
base_net_cfg:
    seq_length: 128
    vocab_size: 21128
    hidden_size: 768
    num_hidden_layers: 12
    num_attention_heads: 12
    intermediate_size: 3072
    hidden_act: "gelu"
    hidden_dropout_prob: 0.1
    attention_probs_dropout_prob: 0.1
    max_position_embeddings: 512
    type_vocab_size: 2
    initializer_range: 0.02
    use_relative_positions: False
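    # Note: dtype/compute_type are written as strings here; presumably the config loader
    # maps "mstype.float32"/"mstype.float16" to the corresponding MindSpore dtypes.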
    dtype: mstype.float32
    compute_type: mstype.float16
# nezha
nezha_batch_size: 96
nezha_net_cfg:
    seq_length: 128
    vocab_size: 21128
    hidden_size: 1024
    num_hidden_layers: 24
    num_attention_heads: 16
    intermediate_size: 4096
    hidden_act: "gelu"
    hidden_dropout_prob: 0.1
    attention_probs_dropout_prob: 0.1
    max_position_embeddings: 512
    type_vocab_size: 2
    initializer_range: 0.02
    use_relative_positions: True
    dtype: mstype.float32
    compute_type: mstype.float16
# large
large_batch_size: 20
large_net_cfg:
    seq_length: 512
    vocab_size: 30522
    hidden_size: 1024
    num_hidden_layers: 24
    num_attention_heads: 16
    intermediate_size: 4096
    hidden_act: "gelu"
    hidden_dropout_prob: 0.1
    attention_probs_dropout_prob: 0.1
    max_position_embeddings: 512
    type_vocab_size: 2
    initializer_range: 0.02
    use_relative_positions: False
    dtype: mstype.float32
    compute_type: mstype.float16
# Accelerated large network, currently only supported on Ascend.
large_acc_batch_size: 20
large_acc_net_cfg:
    seq_length: 512
    vocab_size: 30522
    hidden_size: 1024
    num_hidden_layers: 24
    num_attention_heads: 16
    intermediate_size: 4096
    hidden_act: "fast_gelu"
    hidden_dropout_prob: 0.1
    attention_probs_dropout_prob: 0.1
    max_position_embeddings: 512
    type_vocab_size: 2
    initializer_range: 0.02
    use_relative_positions: False
    dtype: mstype.float32
    compute_type: mstype.float16

---
# Help description for each configuration
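# (These strings are presumably surfaced as command-line help text by the config parser.)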
enable_modelarts: "Whether to train on ModelArts, default: False"
data_url: "Url for modelarts"
train_url: "Url for modelarts"
data_path: "The location of the input data."
output_path: "The location of the output file."
device_target: "Running platform, choose from Ascend or GPU; default is Ascend."
enable_profiling: "Whether to enable profiling while training, default: False"

distribute: "Run distributed training, default is 'false'."
epoch_size: "Epoch size, default is 40."
enable_save_ckpt: "Enable checkpoint saving, default is 'true'."
enable_lossscale: "Whether to use loss scaling, default is 'false'."
do_shuffle: "Enable dataset shuffling, default is 'true'."
enable_data_sink: "Enable data sink, default is 'true'."
data_sink_steps: "Sink steps for each epoch, default is 100."
accumulation_steps: "Number of steps to accumulate gradients before each weight update, default is 1."
allreduce_post_accumulation: "Whether to allreduce after accumulating N steps or after each step, default is 'true'."
save_checkpoint_path: "Save checkpoint path"
load_checkpoint_path: "Load checkpoint file path"
save_checkpoint_steps: "Save checkpoint steps, default is 500."
train_steps: "Training steps, default is 2500; -1 means run all steps according to epoch_size."
save_checkpoint_num: "Number of checkpoints to keep, default is 5."
data_dir: "Data path; an absolute path is recommended."
schema_dir: "Schema path; an absolute path is recommended."
---
# choices
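# (Allowed values; presumably used to validate command-line overrides of the options above.)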
device_target: ['Ascend', 'GPU']
distribute: ["true", "false"]
enable_save_ckpt: ["true", "false"]
enable_lossscale: ["true", "false"]
do_shuffle: ["true", "false"]
enable_data_sink: ["true", "false"]
allreduce_post_accumulation: ["true", "false"]