

python tokenize_corpus.py --corpus_folder /{path}/corpus --output_folder /{path}/tokenized_corpus --tokenizer nltk --pool_size 16
cd tokenized_corpus/
# build bpe codes
cat *.txt | subword-nmt learn-bpe -s 46000 -o all.bpe.codes
# build bpe dict
subword-nmt get-vocab -i tokenized.txt -o vocab_en.dict.bin
# apply bpe encoding
python apply_bpe_encoding.py --codes ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/all.bpe.codes \
    --src_folder ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/ \
    --output_folder ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/bpe \
    --vocab_path ~/Mindspore/mindspore/model_zoo/official/nlp/mass/tokenized_corpus/vocab_en.dict.bin \
    --processes 32
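apply_bpe_encoding.py wraps the subword-nmt apply step; a minimal single-file sketch using the subword_nmt Python API (parallelism over --processes and the --vocab_path filtering are omitted):

import codecs
from subword_nmt.apply_bpe import BPE

# Sketch of the apply step for one file; the repo script fans this out over
# every file in --src_folder.
with codecs.open("all.bpe.codes", encoding="utf-8") as codes_file:
    bpe = BPE(codes_file)

with open("tokenized.txt", encoding="utf-8") as fin, \
        open("bpe/tokenized.bpe.txt", "w", encoding="utf-8") as fout:
    for line in fin:
        fout.write(bpe.process_line(line))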
# build dataset news crawl
python news_crawl.py --src_folder ./news_crawl \
    --dict_folder ./news_crawl \
    --existed_vocab ./tokenized_corpus/vocab_en.dict.bin \
    --mask_ratio 0.5 \
    --output_folder ./news_crawl/dataset/tf_small_pretrain \
    --max_len 128 \
    --processes 32 \
    --ngram 2
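--mask_ratio 0.5 controls how much of each News Crawl sentence is hidden for MASS-style pretraining; a rough illustration of contiguous span masking under that assumption (not the news_crawl.py implementation):

import random

def mass_mask(tokens, mask_ratio=0.5, mask_token="[MASK]"):
    # Hide a contiguous span covering ~mask_ratio of the sentence; the
    # decoder is trained to reconstruct the hidden span. Illustration only.
    span_len = max(1, int(round(len(tokens) * mask_ratio)))
    start = random.randint(0, len(tokens) - span_len)
    source = tokens[:start] + [mask_token] * span_len + tokens[start + span_len:]
    target = tokens[start:start + span_len]
    return source, target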
# build dataset cnndm
python cnn_dm.py --test_src ./cnndm_data_prophetnet/prophetnet_tokenized/test.src.txt \
    --test_ref ./cnndm_data_prophetnet/prophetnet_tokenized/test.tgt.txt \
    --existed_vocab ./cnndm_data_prophetnet/cnndm_torch_prophetnet_30522.bin \
    --noise_prob 0.0 \
    --output_folder ./cnndm_data_prophetnet/dataset_hugging_face_tokenized/ \
    --max_len 512
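cnn_dm.py consumes parallel tokenized article/summary files; a hypothetical sketch of pairing them and clipping to --max_len (the real script also maps tokens to ids via --existed_vocab and serializes the dataset):

def read_pairs(src_path, ref_path, max_len=512):
    # Illustration only: pair tokenized article/summary lines and truncate
    # each side to max_len tokens.
    with open(src_path, encoding="utf-8") as fs, open(ref_path, encoding="utf-8") as fr:
        for src_line, ref_line in zip(fs, fr):
            yield src_line.split()[:max_len], ref_line.split()[:max_len]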
# train
bash run_gpu.sh --task t --device_num 1 --device_id 3 --config ./config/config.json
# inference
bash run_gpu.sh --task i \
    --device_num 1 \
    --device_id 3 \
    --config ./config/test.json \
    --output output \
    --metric rouge \
    --vocab ./cnndm_data_prophetnet/cnndm_torch_prophetnet_30522.bin
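A quick way to sanity-check the --metric rouge numbers outside the pipeline is the `rouge` pip package (the repo's own scorer may differ in tokenization and stemming):

from rouge import Rouge

# Toy hypothesis/reference pair just to show the call; real evaluation pairs
# the generated summaries in --output with the CNN/DM references.
hyps = ["the fleet unveiled a new submarine on state television"]
refs = ["the fleet showed off its new submarine"]
scores = Rouge().get_scores(hyps, refs, avg=True)
print(scores["rouge-1"]["f"], scores["rouge-2"]["f"], scores["rouge-l"]["f"])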
# pytorch model structure
NgramTransformerProphetModel(
  (encoder): TransformerEncoder(
    (embed_tokens): Embedding(30522, 512, padding_idx=0)
    (embed_positions): LearnedPositionalEmbedding(513, 512, padding_idx=0)
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (2): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
    (emb_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): NgramTransformerDecoder(
    (embed_tokens): Embedding(30522, 512, padding_idx=0)
    (embed_positions): LearnedPositionalEmbedding(514, 512, padding_idx=0)
    (ngram_input_embed): Embedding(2, 512)
    (layers): ModuleList(
      (0): NgramTransformerDecoderLayer(
        (ngram_self_attn): NgramMultiheadAttention(
          (relative_linear): Linear(in_features=512, out_features=256, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (encoder_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (encoder_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (1): NgramTransformerDecoderLayer(
        (ngram_self_attn): NgramMultiheadAttention(
          (relative_linear): Linear(in_features=512, out_features=256, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (encoder_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (encoder_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (2): NgramTransformerDecoderLayer(
        (ngram_self_attn): NgramMultiheadAttention(
          (relative_linear): Linear(in_features=512, out_features=256, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (encoder_attn): MultiheadAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (encoder_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
    (emb_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
)
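From the dump: d_model=512, FFN size 2048, 3 encoder and 3 decoder layers, vocab 30522, and an ngram input embedding of size 2 (which suggests ngram=2, i.e. predicting two future tokens). A minimal PyTorch sketch of one encoder layer with those sizes; the head count is not shown in the dump and is assumed here, and the real fairseq/ProphetNet layer differs in details:

import torch
from torch import nn

class EncoderLayerSketch(nn.Module):
    # Post-LN Transformer block matching the printed sizes; illustration only.
    def __init__(self, d_model=512, ffn_dim=2048, num_heads=8):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, num_heads)
        self.self_attn_layer_norm = nn.LayerNorm(d_model, eps=1e-5)
        self.fc1 = nn.Linear(d_model, ffn_dim)
        self.fc2 = nn.Linear(ffn_dim, d_model)
        self.final_layer_norm = nn.LayerNorm(d_model, eps=1e-5)

    def forward(self, x):
        # self-attention + residual + LayerNorm
        attn_out, _ = self.self_attn(x, x, x)
        x = self.self_attn_layer_norm(x + attn_out)
        # feed-forward + residual + LayerNorm
        x = self.final_layer_norm(x + self.fc2(torch.relu(self.fc1(x))))
        return x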
data example:
src_tokens
tensor([[ 1996, 11555, 18172, 7042, 2055, 1037, 18147, 5913, 3756, 6982,
        1999, 1996, 4120, 1012, 2007, 1996, 4022, 2000, 2022, 3621,
        2062, 4795, 1010, 2021, 2074, 2004, 26102, 1010, 1996, 7726,
        3212, 2038, 2042, 27696, 1996, 6745, 2804, 2000, 2049, 4170,
        1011, 1037, 8235, 4408, 28653, 2630, 6982, 1012, 11216, 1997,
        1996, 27143, 1011, 2550, 21905, 2442, 2031, 2245, 2008, 1996,
        13576, 8703, 2052, 2191, 1996, 7477, 12586, 1999, 2007, 1996,
        2784, 5380, 1997, 1996, 2152, 11915, 1012, 17186, 2091, 2005,
        2678, 1012, 3239, 1011, 9105, 1024, 7726, 3212, 9058, 2020,
        4760, 2125, 2037, 4408, 28653, 12622, 2006, 2110, 2547, 1012,
        18783, 1024, 7726, 3212, 3738, 3233, 2006, 2327, 1997, 1996,
        8254, 2050, 1021, 6982, 2328, 27143, 1012, 2021, 2009, 1005,
        1055, 2524, 2000, 2903, 2008, 1996, 4099, 2180, 1005, 1056,
        2156, 2023, 2028, 2746, 2007, 1996, 6120, 2437, 2009, 3233,
        2041, 2066, 1037, 14699, 7639, 2114, 1996, 2300, 1005, 1055,
        3302, 1012, 1996, 3212, 2001, 4760, 2125, 1996, 3239, 1011,
        9105, 4325, 1010, 2029, 2003, 2105, 1996, 2946, 1997, 1037,
        15437, 1010, 2006, 4238, 2110, 2547, 7483, 1012, 3212, 4584,
        1010, 2738, 4603, 2135, 5102, 1999, 5810, 2601, 11408, 4102,
        2000, 2037, 28190, 2911, 1010, 3427, 2004, 1996, 8254, 2050,
        1011, 1021, 1010, 6055, 2007, 3424, 1011, 2911, 10815, 1010,
        2001, 3390, 2012, 24112, 2099, 17532, 1010, 2379, 1996, 6143,
        11195, 1997, 7570, 10867, 17040, 1012, 2048, 2047, 7726, 1011,
        2328, 1043, 16102, 4313, 4942, 2015, 1998, 2048, 13671, 25215,
        11890, 27528, 2102, 2020, 2036, 5359, 2000, 1996, 3212, 1012,
        8235, 2630, 1024, 4238, 1005, 1055, 4397, 3390, 1043, 16102,
        4313, 6982, 5829, 1999, 2392, 1997, 1037, 4049, 1999, 1996,
        2670, 3417, 1997, 24112, 2099, 17532, 1999, 1996, 4723, 6084,
        1012, 19194, 1024, 1996, 12622, 3233, 2041, 2066, 1037, 14699,
        1011, 7639, 2114, 1996, 3302, 1997, 1996, 2712, 1012, 3212,
        2708, 4373, 5902, 5292, 28065, 14511, 4430, 2360, 13380, 2072,
        2001, 9339, 2006, 7726, 2547, 2004, 3038, 2008, 1996, 3842,
        2442, 10295, 1996, 1005, 14751, 2974, 1998, 2327, 1011, 3694,
        4128, 2000, 4047, 2049, 6645, 1012, 1005, 1043, 16102, 4313,
        2465, 12622, 2064, 2543, 10815, 1998, 18544, 2012, 1996, 2168,
        2051, 1010, 1998, 2064, 5452, 1999, 1996, 4723, 6084, 1005,
        1055, 8467, 5380, 1012, 4238, 2038, 4912, 2000, 12200, 2049,
        2250, 3639, 1998, 3987, 9859, 1010, 3038, 2151, 2825, 2925,
        4491, 2006, 2009, 2052, 2272, 2013, 1996, 2250, 1998, 2712,
        1012, 1996, 2406, 2085, 4447, 2000, 2022, 1005, 2969, 7182,
        1005, 1999, 3408, 1997, 17731, 3941, 2000, 3113, 2049, 2510,
        3791, 1012, 14430, 1024, 1996, 7726, 6982, 1005, 1055, 2453,
        2022, 2062, 9252, 2084, 1996, 11555, 1005, 21864, 15952, 3756,
        6982, 1010, 15885, 1010, 2021, 2027, 2024, 8053, 14224, 11401,
        1012, 102]], device='cuda:0')
prev_output_tokens
tensor([[ 102, 7726, 2110, 2547, 3662, 8333, 1997, 1996, 2047, 3719,
        1011, 1037, 8254, 2050, 1021, 6982, 1010, 2048, 1043, 16102,
        4313, 4942, 2015, 1998, 1037, 3940, 1997, 25215, 11890, 27528,
        2102, 1012, 2, 3212, 4584, 2360, 2008, 1996, 4170, 2442,
        10295, 1005, 1996, 14751, 2974, 1005, 2000, 4047, 2049, 6645,
        1012]], device='cuda:0')
target_tokens:
tensor([[ 7726, 2110, 2547, 3662, 8333, 1997, 1996, 2047, 3719, 1011,
        1037, 8254, 2050, 1021, 6982, 1010, 2048, 1043, 16102, 4313,
        4942, 2015, 1998, 1037, 3940, 1997, 25215, 11890, 27528, 2102,
        1012, 2, 3212, 4584, 2360, 2008, 1996, 4170, 2442, 10295,
        1005, 1996, 14751, 2974, 1005, 2000, 4047, 2049, 6645, 1012,
        102]], device='cuda:0')
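In this example, prev_output_tokens is just target_tokens shifted right by one position, with the 102 (the [SEP]-style end token of this BERT wordpiece vocabulary) moved to the front. A small sketch of that shift:

import torch

def shift_right(target_tokens, eos_id=102):
    # prev_output_tokens = [eos] + target_tokens[:-1], as in the example
    # above; eos_id=102 is assumed to be the [SEP]/EOS id of this vocab.
    prev = torch.roll(target_tokens, shifts=1, dims=-1)
    prev[..., 0] = eos_id
    return prev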