

python tokenize_corpus.py --corpus_folder /{path}/corpus --output_folder /{path}/tokenized_corpus --tokenizer nltk --pool_size 16
cd tokenized_corpus/
# build bpe codes
cat *.txt | subword-nmt learn-bpe -s 46000 -o all.bpe.codes
# build bpe dict
subword-nmt get-vocab -i tokenized.txt -o vocab_en.dict.bin
# apply bpe encoding
python apply_bpe_encoding.py --codes ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/all.bpe.codes \
    --src_folder ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/ \
    --output_folder ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/bpe \
    --vocab_path ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/vocab_en.dict.bin \
    --processes 32
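apply_bpe_encoding.py wraps the subword-nmt apply step; a minimal single-file sketch using the subword_nmt Python API (parallelism over --processes and the --vocab_path filtering are omitted):

import codecs
from subword_nmt.apply_bpe import BPE

# Sketch of the apply step for one file; the repo script fans this out over
# every file in --src_folder.
with codecs.open("all.bpe.codes", encoding="utf-8") as codes_file:
    bpe = BPE(codes_file)

with open("tokenized.txt", encoding="utf-8") as fin, \
        open("bpe/tokenized.bpe.txt", "w", encoding="utf-8") as fout:
    for line in fin:
        fout.write(bpe.process_line(line))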
# build dataset news crawl
python news_crawl.py --src_folder ./news_crawl \
    --dict_folder ./news_crawl \
    --existed_vocab ./tokenized_corpus/vocab_en.dict.bin \
    --mask_ratio 0.5 \
    --output_folder ./news_crawl/dataset/tf_small_pretrain \
    --max_len 128 \
    --processes 32 \
    --ngram 2
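--mask_ratio 0.5 controls how much of each News Crawl sentence is hidden for MASS-style pretraining; a rough illustration of contiguous span masking under that assumption (not the news_crawl.py implementation):

import random

def mass_mask(tokens, mask_ratio=0.5, mask_token="[MASK]"):
    # Hide a contiguous span covering ~mask_ratio of the sentence; the
    # decoder is trained to reconstruct the hidden span. Illustration only.
    span_len = max(1, int(round(len(tokens) * mask_ratio)))
    start = random.randint(0, len(tokens) - span_len)
    source = tokens[:start] + [mask_token] * span_len + tokens[start + span_len:]
    target = tokens[start:start + span_len]
    return source, target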
# build dataset cnndm
python cnn_dm.py --test_src ./cnndm_data_prophetnet/prophetnet_tokenized/test.src.txt \
    --test_ref ./cnndm_data_prophetnet/prophetnet_tokenized/test.tgt.txt \
    --existed_vocab ./cnndm_data_prophetnet/cnndm_torch_prophetnet_30522.bin \
    --noise_prob 0.0 \
    --output_folder ./cnndm_data_prophetnet/dataset_hugging_face_tokenized/ \
    --max_len 512
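cnn_dm.py consumes parallel tokenized article/summary files; a hypothetical sketch of pairing them and clipping to --max_len (the real script also maps tokens to ids via --existed_vocab and serializes the dataset):

def read_pairs(src_path, ref_path, max_len=512):
    # Illustration only: pair tokenized article/summary lines and truncate
    # each side to max_len tokens.
    with open(src_path, encoding="utf-8") as fs, open(ref_path, encoding="utf-8") as fr:
        for src_line, ref_line in zip(fs, fr):
            yield src_line.split()[:max_len], ref_line.split()[:max_len]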
# train
bash run_gpu.sh --task t --device_num 1 --device_id 3 --config ./config/config.json
# inference
bash run_gpu.sh --task i \
    --device_num 1 \
    --device_id 3 \
    --config ./config/test.json \
    --output output \
    --metric rouge \
    --vocab ./cnndm_data_prophetnet/cnndm_torch_prophetnet_30522.bin
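A quick way to sanity-check the --metric rouge numbers outside the pipeline is the `rouge` pip package (the repo's own scorer may differ in tokenization and stemming):

from rouge import Rouge

# Toy hypothesis/reference pair just to show the call; real evaluation pairs
# the generated summaries in --output with the CNN/DM references.
hyps = ["the fleet unveiled a new submarine on state television"]
refs = ["the fleet showed off its new submarine"]
scores = Rouge().get_scores(hyps, refs, avg=True)
print(scores["rouge-1"]["f"], scores["rouge-2"]["f"], scores["rouge-l"]["f"])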
# pytorch model structure
NgramTransformerProphetModel(
  (encoder): TransformerEncoder(
    (embed_tokens): Embedding(30522, 512, padding_idx=0)
    (embed_positions): LearnedPositionalEmbedding(513, 512, padding_idx=0)
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (2): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
    (emb_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): NgramTransformerDecoder(
    (embed_tokens): Embedding(30522, 512, padding_idx=0)
    (embed_positions): LearnedPositionalEmbedding(514, 512, padding_idx=0)
    (ngram_input_embed): Embedding(2, 512)
    (layers): ModuleList(
      (0): NgramTransformerDecoderLayer(
        (ngram_self_attn): NgramMultiheadAttention(
          (relative_linear): Linear(in_features=512, out_features=256, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (encoder_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (encoder_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (1): NgramTransformerDecoderLayer(
        (ngram_self_attn): NgramMultiheadAttention(
          (relative_linear): Linear(in_features=512, out_features=256, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (encoder_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (encoder_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (2): NgramTransformerDecoderLayer(
        (ngram_self_attn): NgramMultiheadAttention(
          (relative_linear): Linear(in_features=512, out_features=256, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (encoder_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (encoder_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
    (emb_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
)
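From the dump: d_model=512, FFN size 2048, 3 encoder and 3 decoder layers, vocab 30522, and an ngram input embedding of size 2 (which suggests ngram=2, i.e. predicting two future tokens). A minimal PyTorch sketch of one encoder layer with those sizes; the head count is not shown in the dump and is assumed here, and the real fairseq/ProphetNet layer differs in details:

import torch
from torch import nn

class EncoderLayerSketch(nn.Module):
    # Post-LN Transformer block matching the printed sizes; illustration only.
    def __init__(self, d_model=512, ffn_dim=2048, num_heads=8):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, num_heads)
        self.self_attn_layer_norm = nn.LayerNorm(d_model, eps=1e-5)
        self.fc1 = nn.Linear(d_model, ffn_dim)
        self.fc2 = nn.Linear(ffn_dim, d_model)
        self.final_layer_norm = nn.LayerNorm(d_model, eps=1e-5)

    def forward(self, x):
        # self-attention + residual + LayerNorm
        attn_out, _ = self.self_attn(x, x, x)
        x = self.self_attn_layer_norm(x + attn_out)
        # feed-forward + residual + LayerNorm
        x = self.final_layer_norm(x + self.fc2(torch.relu(self.fc1(x))))
        return x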
data example:
src_tokens
tensor([[ 1996, 11555, 18172, 7042, 2055, 1037, 18147, 5913, 3756, 6982,
        1999, 1996, 4120, 1012, 2007, 1996, 4022, 2000, 2022, 3621,
        2062, 4795, 1010, 2021, 2074, 2004, 26102, 1010, 1996, 7726,
        3212, 2038, 2042, 27696, 1996, 6745, 2804, 2000, 2049, 4170,
        1011, 1037, 8235, 4408, 28653, 2630, 6982, 1012, 11216, 1997,
        1996, 27143, 1011, 2550, 21905, 2442, 2031, 2245, 2008, 1996,
        13576, 8703, 2052, 2191, 1996, 7477, 12586, 1999, 2007, 1996,
        2784, 5380, 1997, 1996, 2152, 11915, 1012, 17186, 2091, 2005,
        2678, 1012, 3239, 1011, 9105, 1024, 7726, 3212, 9058, 2020,
        4760, 2125, 2037, 4408, 28653, 12622, 2006, 2110, 2547, 1012,
        18783, 1024, 7726, 3212, 3738, 3233, 2006, 2327, 1997, 1996,
        8254, 2050, 1021, 6982, 2328, 27143, 1012, 2021, 2009, 1005,
        1055, 2524, 2000, 2903, 2008, 1996, 4099, 2180, 1005, 1056,
        2156, 2023, 2028, 2746, 2007, 1996, 6120, 2437, 2009, 3233,
        2041, 2066, 1037, 14699, 7639, 2114, 1996, 2300, 1005, 1055,
        3302, 1012, 1996, 3212, 2001, 4760, 2125, 1996, 3239, 1011,
        9105, 4325, 1010, 2029, 2003, 2105, 1996, 2946, 1997, 1037,
        15437, 1010, 2006, 4238, 2110, 2547, 7483, 1012, 3212, 4584,
        1010, 2738, 4603, 2135, 5102, 1999, 5810, 2601, 11408, 4102,
        2000, 2037, 28190, 2911, 1010, 3427, 2004, 1996, 8254, 2050,
        1011, 1021, 1010, 6055, 2007, 3424, 1011, 2911, 10815, 1010,
        2001, 3390, 2012, 24112, 2099, 17532, 1010, 2379, 1996, 6143,
        11195, 1997, 7570, 10867, 17040, 1012, 2048, 2047, 7726, 1011,
        2328, 1043, 16102, 4313, 4942, 2015, 1998, 2048, 13671, 25215,
        11890, 27528, 2102, 2020, 2036, 5359, 2000, 1996, 3212, 1012,
        8235, 2630, 1024, 4238, 1005, 1055, 4397, 3390, 1043, 16102,
        4313, 6982, 5829, 1999, 2392, 1997, 1037, 4049, 1999, 1996,
        2670, 3417, 1997, 24112, 2099, 17532, 1999, 1996, 4723, 6084,
        1012, 19194, 1024, 1996, 12622, 3233, 2041, 2066, 1037, 14699,
        1011, 7639, 2114, 1996, 3302, 1997, 1996, 2712, 1012, 3212,
        2708, 4373, 5902, 5292, 28065, 14511, 4430, 2360, 13380, 2072,
        2001, 9339, 2006, 7726, 2547, 2004, 3038, 2008, 1996, 3842,
        2442, 10295, 1996, 1005, 14751, 2974, 1998, 2327, 1011, 3694,
        4128, 2000, 4047, 2049, 6645, 1012, 1005, 1043, 16102, 4313,
        2465, 12622, 2064, 2543, 10815, 1998, 18544, 2012, 1996, 2168,
        2051, 1010, 1998, 2064, 5452, 1999, 1996, 4723, 6084, 1005,
        1055, 8467, 5380, 1012, 4238, 2038, 4912, 2000, 12200, 2049,
        2250, 3639, 1998, 3987, 9859, 1010, 3038, 2151, 2825, 2925,
        4491, 2006, 2009, 2052, 2272, 2013, 1996, 2250, 1998, 2712,
        1012, 1996, 2406, 2085, 4447, 2000, 2022, 1005, 2969, 7182,
        1005, 1999, 3408, 1997, 17731, 3941, 2000, 3113, 2049, 2510,
        3791, 1012, 14430, 1024, 1996, 7726, 6982, 1005, 1055, 2453,
        2022, 2062, 9252, 2084, 1996, 11555, 1005, 21864, 15952, 3756,
        6982, 1010, 15885, 1010, 2021, 2027, 2024, 8053, 14224, 11401,
        1012, 102]], device='cuda:0')
prev_output_tokens
tensor([[ 102, 7726, 2110, 2547, 3662, 8333, 1997, 1996, 2047, 3719,
        1011, 1037, 8254, 2050, 1021, 6982, 1010, 2048, 1043, 16102,
        4313, 4942, 2015, 1998, 1037, 3940, 1997, 25215, 11890, 27528,
        2102, 1012, 2, 3212, 4584, 2360, 2008, 1996, 4170, 2442,
        10295, 1005, 1996, 14751, 2974, 1005, 2000, 4047, 2049, 6645,
        1012]], device='cuda:0')
target_tokens:
tensor([[ 7726, 2110, 2547, 3662, 8333, 1997, 1996, 2047, 3719, 1011,
        1037, 8254, 2050, 1021, 6982, 1010, 2048, 1043, 16102, 4313,
        4942, 2015, 1998, 1037, 3940, 1997, 25215, 11890, 27528, 2102,
        1012, 2, 3212, 4584, 2360, 2008, 1996, 4170, 2442, 10295,
        1005, 1996, 14751, 2974, 1005, 2000, 4047, 2049, 6645, 1012,
        102]], device='cuda:0')
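In this example, prev_output_tokens is just target_tokens shifted right by one position, with the 102 (the [SEP]-style end token of this BERT wordpiece vocabulary) moved to the front. A small sketch of that shift:

import torch

def shift_right(target_tokens, eos_id=102):
    # prev_output_tokens = [eos] + target_tokens[:-1], as in the example
    # above; eos_id=102 is assumed to be the [SEP]/EOS id of this vocab.
    prev = torch.roll(target_tokens, shifts=1, dims=-1)
    prev[..., 0] = eos_id
    return prev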