python tokenize_corpus.py --corpus_folder /{path}/corpus --output_folder /{path}/tokenized_corpus --tokenizer nltk --pool_size 16

cd tokenized_corpus/

# build bpe codes
cat *.txt | subword-nmt learn-bpe -s 46000 -o all.bpe.codes

# build bpe dict
subword-nmt get-vocab -i tokenized.txt -o vocab_en.dict.bin

# apply bpe encoding
python apply_bpe_encoding.py --codes ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/all.bpe.codes \
    --src_folder ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/ \
    --output_folder ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/bpe \
    --vocab_path ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/vocab_en.dict.bin \
    --processes 32

# build dataset news crawl
python news_crawl.py --src_folder ./news_crawl \
    --dict_folder ./news_crawl \
    --existed_vocab ./tokenized_corpus/vocab_en.dict.bin \
    --mask_ratio 0.5 \
    --output_folder ./news_crawl/dataset/tf_small_pretrain \
    --max_len 128 \
    --processes 32 \
    --ngram 2

# build dataset cnndm
python cnn_dm.py --test_src ./cnndm_data_prophetnet/prophetnet_tokenized/test.src.txt \
    --test_ref ./cnndm_data_prophetnet/prophetnet_tokenized/test.tgt.txt \
    --existed_vocab ./cnndm_data_prophetnet/cnndm_torch_prophetnet_30522.bin \
    --noise_prob 0.0 \
    --output_folder ./cnndm_data_prophetnet/dataset_hugging_face_tokenized/ \
    --max_len 512

# train
bash run_gpu.sh --task t --device_num 1 --device_id 3 --config ./config/config.json

# inference
bash run_gpu.sh --task i \
    --device_num 1 \
    --device_id 3 \
    --config ./config/test.json \
    --output output \
    --metric rouge \
    --vocab ./cnndm_data_prophetnet/cnndm_torch_prophetnet_30522.bin
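Judging by the flags, news_crawl.py builds MASS-style pretraining pairs: consecutive n-gram spans are replaced by a mask id until roughly --mask_ratio of the tokens are hidden, and the decoder is trained to reconstruct them. The snippet below is only an illustrative sketch of that idea; the function name and the mask id 103 are placeholders, not the script's actual implementation. (--noise_prob in cnn_dm.py presumably plays the analogous corruption role for the summarization data and is set to 0.0 here, so the fine-tuning inputs are left untouched.)

import random

def mask_ngram_spans(token_ids, mask_ratio=0.5, ngram=2, mask_id=103, seed=0):
    """Illustrative MASS-style corruption: hide consecutive `ngram`-sized spans
    until about `mask_ratio` of the tokens are masked.
    Returns (source_with_masks, sorted_masked_positions).
    mask_id=103 is an assumed placeholder; the real id comes from the vocab file."""
    rng = random.Random(seed)
    src = list(token_ids)
    budget = max(1, int(len(src) * mask_ratio))
    masked = set()
    while len(masked) < budget:
        start = rng.randrange(0, max(1, len(src) - ngram + 1))
        masked.update(range(start, min(start + ngram, len(src))))
    for pos in masked:
        src[pos] = mask_id
    return src, sorted(masked)

# toy usage with a few ids from the data example below
src, positions = mask_ngram_spans([1996, 11555, 18172, 7042, 2055, 1037, 18147, 5913])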
# pytorch model structure
NgramTransformerProphetModel(
  (encoder): TransformerEncoder(
    (embed_tokens): Embedding(30522, 512, padding_idx=0)
    (embed_positions): LearnedPositionalEmbedding(513, 512, padding_idx=0)
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (2): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
    (emb_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): NgramTransformerDecoder(
    (embed_tokens): Embedding(30522, 512, padding_idx=0)
    (embed_positions): LearnedPositionalEmbedding(514, 512, padding_idx=0)
    (ngram_input_embed): Embedding(2, 512)
    (layers): ModuleList(
      (0): NgramTransformerDecoderLayer(
        (ngram_self_attn): NgramMultiheadAttention(
          (relative_linear): Linear(in_features=512, out_features=256, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (encoder_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (encoder_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (1): NgramTransformerDecoderLayer(
        (ngram_self_attn): NgramMultiheadAttention(
          (relative_linear): Linear(in_features=512, out_features=256, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (encoder_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (encoder_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (2): NgramTransformerDecoderLayer(
        (ngram_self_attn): NgramMultiheadAttention(
          (relative_linear): Linear(in_features=512, out_features=256, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (encoder_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (encoder_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
    (emb_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
)

# data example
src_tokens
tensor([[ 1996, 11555, 18172, 7042, 2055, 1037, 18147, 5913, 3756, 6982, 1999, 1996, 4120, 1012, 2007, 1996, 4022, 2000, 2022, 3621, 2062, 4795, 1010, 2021, 2074, 2004, 26102, 1010, 1996, 7726, 3212, 2038, 2042, 27696, 1996, 6745, 2804, 2000, 2049, 4170, 1011, 1037, 8235, 4408, 28653, 2630, 6982, 1012, 11216, 1997, 1996, 27143, 1011, 2550, 21905, 2442, 2031, 2245, 2008, 1996, 13576, 8703, 2052, 2191, 1996, 7477, 12586,
1999, 2007, 1996, 2784, 5380, 1997, 1996, 2152, 11915, 1012, 17186, 2091, 2005, 2678, 1012, 3239, 1011, 9105, 1024, 7726, 3212, 9058, 2020, 4760, 2125, 2037, 4408, 28653, 12622, 2006, 2110, 2547, 1012, 18783, 1024, 7726, 3212, 3738, 3233, 2006, 2327, 1997, 1996, 8254, 2050, 1021, 6982, 2328, 27143, 1012, 2021, 2009, 1005, 1055, 2524, 2000, 2903, 2008, 1996, 4099, 2180, 1005, 1056, 2156, 2023, 2028, 2746, 2007, 1996, 6120, 2437, 2009, 3233, 2041, 2066, 1037, 14699, 7639, 2114, 1996, 2300, 1005, 1055, 3302, 1012, 1996, 3212, 2001, 4760, 2125, 1996, 3239, 1011, 9105, 4325, 1010, 2029, 2003, 2105, 1996, 2946, 1997, 1037, 15437, 1010, 2006, 4238, 2110, 2547, 7483, 1012, 3212, 4584, 1010, 2738, 4603, 2135, 5102, 1999, 5810, 2601, 11408, 4102, 2000, 2037, 28190, 2911, 1010, 3427, 2004, 1996, 8254, 2050, 1011, 1021, 1010, 6055, 2007, 3424, 1011, 2911, 10815, 1010, 2001, 3390, 2012, 24112, 2099, 17532, 1010, 2379, 1996, 6143, 11195, 1997, 7570, 10867, 17040, 1012, 2048, 2047, 7726, 1011, 2328, 1043, 16102, 4313, 4942, 2015, 1998, 2048, 13671, 25215, 11890, 27528, 2102, 2020, 2036, 5359, 2000, 1996, 3212, 1012, 8235, 2630, 1024, 4238, 1005, 1055, 4397, 3390, 1043, 16102, 4313, 6982, 5829, 1999, 2392, 1997, 1037, 4049, 1999, 1996, 2670, 3417, 1997, 24112, 2099, 17532, 1999, 1996, 4723, 6084, 1012, 19194, 1024, 1996, 12622, 3233, 2041, 2066, 1037, 14699, 1011, 7639, 2114, 1996, 3302, 1997, 1996, 2712, 1012, 3212, 2708, 4373, 5902, 5292, 28065, 14511, 4430, 2360, 13380, 2072, 2001, 9339, 2006, 7726, 2547, 2004, 3038, 2008, 1996, 3842, 2442, 10295, 1996, 1005, 14751, 2974, 1998, 2327, 1011, 3694, 4128, 2000, 4047, 2049, 6645, 1012, 1005, 1043, 16102, 4313, 2465, 12622, 2064, 2543, 10815, 1998, 18544, 2012, 1996, 2168, 2051, 1010, 1998, 2064, 5452, 1999, 1996, 4723, 6084, 1005, 1055, 8467, 5380, 1012, 4238, 2038, 4912, 2000, 12200, 2049, 2250, 3639, 1998, 3987, 9859, 1010, 3038, 2151, 2825, 2925, 4491, 2006, 2009, 2052, 2272, 2013, 1996, 2250, 1998, 2712, 1012, 1996, 2406, 2085, 4447, 2000, 2022, 1005, 2969, 7182, 1005, 1999, 3408, 1997, 17731, 3941, 2000, 3113, 2049, 2510, 3791, 1012, 14430, 1024, 1996, 7726, 6982, 1005, 1055, 2453, 2022, 2062, 9252, 2084, 1996, 11555, 1005, 21864, 15952, 3756, 6982, 1010, 15885, 1010, 2021, 2027, 2024, 8053, 14224, 11401, 1012, 102]], device='cuda:0') prev_output_tokens tensor([[ 102, 7726, 2110, 2547, 3662, 8333, 1997, 1996, 2047, 3719, 1011, 1037, 8254, 2050, 1021, 6982, 1010, 2048, 1043, 16102, 4313, 4942, 2015, 1998, 1037, 3940, 1997, 25215, 11890, 27528, 2102, 1012, 2, 3212, 4584, 2360, 2008, 1996, 4170, 2442, 10295, 1005, 1996, 14751, 2974, 1005, 2000, 4047, 2049, 6645, 1012]], device='cuda:0') target_tokens: tensor([[ 7726, 2110, 2547, 3662, 8333, 1997, 1996, 2047, 3719, 1011, 1037, 8254, 2050, 1021, 6982, 1010, 2048, 1043, 16102, 4313, 4942, 2015, 1998, 1037, 3940, 1997, 25215, 11890, 27528, 2102, 1012, 2, 3212, 4584, 2360, 2008, 1996, 4170, 2442, 10295, 1005, 1996, 14751, 2974, 1005, 2000, 4047, 2049, 6645, 1012, 102]], device='cuda:0')
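In the example above, src_tokens holds the BERT-style WordPiece ids of the article, target_tokens holds the ids of the reference summary terminated by the separator id 102, and prev_output_tokens is the same summary shifted right for teacher forcing: 102 is placed at the front and the last position is dropped, so the decoder predicts token t from tokens before t. The Embedding(30522, 512) rows in the structure above likewise match the 30522-entry vocab file passed via --existed_vocab and --vocab. A minimal sketch of the shift follows; the helper name and the eos_id default are illustrative, not the dataset scripts' actual API.

import torch

def make_prev_output_tokens(target_tokens: torch.Tensor, eos_id: int = 102) -> torch.Tensor:
    """Teacher-forcing input: the decoder reads [eos, y_1, ..., y_{n-1}]
    while being trained to predict [y_1, ..., y_n]."""
    prev = torch.roll(target_tokens, shifts=1, dims=-1)
    prev[..., 0] = eos_id  # overwrite the rolled-in last token with the eos/[SEP] id
    return prev

# e.g. the first few ids of target_tokens above
target = torch.tensor([[7726, 2110, 2547, 3662, 8333]])
print(make_prev_output_tokens(target))  # tensor([[ 102, 7726, 2110, 2547, 3662]])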