
test_roberta_embedding.py 13 kB

import unittest
import torch
import os

from fastNLP import DataSet, Vocabulary
from fastNLP.embeddings.roberta_embedding import RobertaWordPieceEncoder, RobertaEmbedding


class TestRobertWordPieceEncoder(unittest.TestCase):
    @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
    def test_download(self):
        vocab = Vocabulary().add_word_lst("This is a test .".split())
        embed = RobertaEmbedding(vocab, model_dir_or_name='en')
        words = torch.LongTensor([[2, 3, 4, 0]])
        print(embed(words).size())

        for pool_method in ['first', 'last', 'max', 'avg']:
            for include_cls_sep in [True, False]:
                embed = RobertaEmbedding(vocab, model_dir_or_name='en', pool_method=pool_method,
                                         include_cls_sep=include_cls_sep)
                print(embed(words).size())
    def test_robert_word_piece_encoder(self):
        # It only needs to run without raising an error.
        weight_path = 'test/data_for_tests/embedding/small_roberta'
        encoder = RobertaWordPieceEncoder(model_dir_or_name=weight_path, word_dropout=0.1)
        ds = DataSet({'words': ["this is a test . [SEP]".split()]})
        encoder.index_datasets(ds, field_name='words')
        self.assertTrue(ds.has_field('word_pieces'))
        result = encoder(torch.LongTensor([[1, 2, 3, 4]]))
    def test_roberta_embed_eq_roberta_piece_encoder(self):
        # Mainly check that the embedding output is consistent with the word piece encoder output.
        weight_path = 'test/data_for_tests/embedding/small_roberta'
        ds = DataSet({'words': ["this is a texta a sentence".split(), 'this is'.split()]})
        encoder = RobertaWordPieceEncoder(model_dir_or_name=weight_path)
        encoder.eval()
        encoder.index_datasets(ds, field_name='words')
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
        word_pieces_res = encoder(word_pieces)

        vocab = Vocabulary()
        vocab.from_dataset(ds, field_name='words')
        vocab.index_dataset(ds, field_name='words', new_field_name='words')
        ds.set_input('words')
        words = torch.LongTensor(ds['words'].get([0, 1]))
        embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path,
                                 pool_method='first', include_cls_sep=True, pooled_cls=False, min_freq=1)
        embed.eval()
        words_res = embed(words)

        # Check that the word-piece handling works as expected; 'texta' is split into two
        # word pieces, so positions after it are shifted by one in the word-piece sequence.
        self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
        self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
        self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)
    @unittest.skipIf(True, "Only for local debugging")
    def test_eq_transformers(self):
        weight_path = ''
        ds = DataSet({'words': ["this is a texta model vocab".split(), 'this is'.split()]})
        encoder = RobertaWordPieceEncoder(model_dir_or_name=weight_path)
        encoder.eval()
        encoder.index_datasets(ds, field_name='words')
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
        word_pieces_res = encoder(word_pieces)

        import transformers
        input1 = ' '.join(ds[0]['words'])
        input2 = ' '.join(ds[1]['words'])
        tokenizer = transformers.RobertaTokenizer.from_pretrained(weight_path)
        idx_list1 = tokenizer.encode(input1)
        idx_list2 = tokenizer.encode(input2)
        self.assertEqual(idx_list1, ds[0]['word_pieces'])
        self.assertEqual(idx_list2, ds[1]['word_pieces'])

        pad_value = tokenizer.encode('<pad>')[0]
        tensor = torch.nn.utils.rnn.pad_sequence([torch.LongTensor(idx_list1),
                                                   torch.LongTensor(idx_list2)],
                                                  batch_first=True,
                                                  padding_value=pad_value)
        roberta = transformers.RobertaModel.from_pretrained(weight_path, output_hidden_states=True)
        roberta.eval()
        output, pooled_output, hidden_states = roberta(tensor, attention_mask=tensor.ne(pad_value))
        self.assertEqual((output - word_pieces_res).sum(), 0)
    @unittest.skipIf(True, "Only for local usage")
    def test_generate_small_roberta(self):
        """
        Because RoBERTa uses GPT-2's tokenizer, the weights cannot be generated directly;
        the approach below is needed instead.

        :return:
        """
        weight_path = ''
        from fastNLP.modules.tokenizer import RobertaTokenizer
        tokenizer = RobertaTokenizer.from_pretrained(weight_path)

        used_pairs = {}
        used_vocab = {}
        # Edit the sentences here to cover more data.
        sent1 = "This is a demo sentence"
        sent2 = "another demo"
        sent3 = 'this is a texta model vocab'
        all_tokens = []

        for sent in [sent1, sent2, sent3]:
            tokens = []
            for word in sent.split():
                word = ' ' + word
                token = "".join(
                    tokenizer.byte_encoder[b] for b in word.encode("utf-8")
                )
                _token, _used_pairs = tokenizer.get_used_merge_pair_vocab(token)
                tokens.extend(_token.split())
                used_pairs.update(_used_pairs)
            all_tokens.extend(tokens)
            token_ids = tokenizer.convert_tokens_to_ids(tokens)
            used_vocab.update({t: i for t, i in zip(tokens, token_ids)})

        import json
        with open('test/data_for_tests/embedding/small_roberta/vocab.json', 'w') as f:
            new_used_vocab = {}
            for token in ['<s>', '<pad>', '</s>', '<unk>', '<mask>']:  # <pad> must be index 1
                new_used_vocab[token] = len(new_used_vocab)
            for i in range(65, 91):
                if chr(i) not in new_used_vocab:
                    new_used_vocab[chr(i)] = len(new_used_vocab)
            for i in range(97, 123):
                if chr(i) not in new_used_vocab:
                    new_used_vocab[chr(i)] = len(new_used_vocab)
            for idx, key in enumerate(used_vocab.keys()):
                if key not in new_used_vocab:
                    new_used_vocab[key] = len(new_used_vocab)
            json.dump(new_used_vocab, f)

        with open('test/data_for_tests/embedding/small_roberta/merges.txt', 'w') as f:
            f.write('#version: tiny\n')
            for k, v in sorted(sorted(used_pairs.items(), key=lambda kv: kv[1])):
                f.write('{} {}\n'.format(k[0], k[1]))

        config = {
            "architectures": [
                "RobertaForMaskedLM"
            ],
            "attention_probs_dropout_prob": 0.1,
            "finetuning_task": None,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.1,
            "hidden_size": 16,
            "initializer_range": 0.02,
            "intermediate_size": 20,
            "layer_norm_eps": 1e-05,
            "max_position_embeddings": 20,
            "num_attention_heads": 4,
            "num_hidden_layers": 2,
            "num_labels": 2,
            "output_attentions": False,
            "output_hidden_states": False,
            "torchscript": False,
            "type_vocab_size": 1,
            "vocab_size": len(new_used_vocab)
        }
        with open('test/data_for_tests/embedding/small_roberta/config.json', 'w') as f:
            json.dump(config, f)

        new_tokenizer = RobertaTokenizer.from_pretrained('test/data_for_tests/embedding/small_roberta')
        new_all_tokens = []
        for sent in [sent1, sent2, sent3]:
            tokens = new_tokenizer.tokenize(sent, add_prefix_space=True)
            new_all_tokens.extend(tokens)
        print(all_tokens, new_all_tokens)
        self.assertSequenceEqual(all_tokens, new_all_tokens)

        # The smaller merges.txt and vocab.json are produced by recording the values the tokenizer actually uses.
        from fastNLP.modules.encoder.roberta import RobertaModel, BertConfig

        config = BertConfig.from_json_file('test/data_for_tests/embedding/small_roberta/config.json')
        model = RobertaModel(config)
        torch.save(model.state_dict(), 'test/data_for_tests/embedding/small_roberta/small_pytorch_model.bin')
        print(model(torch.LongTensor([[0, 1, 2, 3]])))
    def test_save_load(self):
        bert_save_test = 'roberta_save_test'
        try:
            os.makedirs(bert_save_test, exist_ok=True)
            embed = RobertaWordPieceEncoder(model_dir_or_name='test/data_for_tests/embedding/small_roberta',
                                            word_dropout=0.0, layers='-2')
            ds = DataSet({'words': ["this is a test . [SEP]".split()]})
            embed.index_datasets(ds, field_name='words')
            self.assertTrue(ds.has_field('word_pieces'))
            words = torch.LongTensor([[1, 2, 3, 4]])
            embed.save(bert_save_test)
            load_embed = RobertaWordPieceEncoder.load(bert_save_test)
            embed.eval(), load_embed.eval()
            self.assertEqual((embed(words) - load_embed(words)).sum(), 0)
        finally:
            import shutil
            shutil.rmtree(bert_save_test)


class TestRobertaEmbedding(unittest.TestCase):
    def test_roberta_embedding_1(self):
        weight_path = 'test/data_for_tests/embedding/small_roberta'
        vocab = Vocabulary().add_word_lst("this is a test . [SEP] NotInRoberta".split())
        embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path, word_dropout=0.1)
        requires_grad = embed.requires_grad
        embed.requires_grad = not requires_grad
        embed.train()
        words = torch.LongTensor([[2, 3, 4, 1]])
        result = embed(words)
        self.assertEqual(result.size(), (1, 4, 16))

        embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path, word_dropout=0.1,
                                 only_use_pretrain_bpe=True)
        embed.eval()
        words = torch.LongTensor([[2, 3, 4, 1]])
        result = embed(words)
        self.assertEqual(result.size(), (1, 4, 16))

        # Over-long input should be truncated automatically instead of raising an error.
        embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path, word_dropout=0.1,
                                 only_use_pretrain_bpe=True, auto_truncate=True)
        words = torch.LongTensor([[2, 3, 4, 1] * 10,
                                  [2, 3] + [0] * 38])
        result = embed(words)
        self.assertEqual(result.size(), (2, 40, 16))
    def test_roberta_ebembedding_2(self):
        # Test that only_use_pretrain_bpe and truncate_embed work correctly.
        Embedding = RobertaEmbedding
        weight_path = 'test/data_for_tests/embedding/small_roberta'
        vocab = Vocabulary().add_word_lst("this is a texta and".split())
        embed1 = Embedding(vocab, model_dir_or_name=weight_path, layers=list(range(3)),
                           only_use_pretrain_bpe=True, truncate_embed=True, min_freq=1)
        # embed_bpe_vocab_size = len(vocab) - 1 + 2  # exclude NotInBERT; add ##a and [CLS]
        # self.assertEqual(embed_bpe_vocab_size, len(embed1.model.tokenzier.vocab))

        embed2 = Embedding(vocab, model_dir_or_name=weight_path, layers=list(range(3)),
                           only_use_pretrain_bpe=True, truncate_embed=False, min_freq=1)
        # embed_bpe_vocab_size = num_word  # exclude NotInBERT
        # self.assertEqual(embed_bpe_vocab_size, len(embed2.model.tokenzier.vocab))

        embed3 = Embedding(vocab, model_dir_or_name=weight_path, layers=list(range(3)),
                           only_use_pretrain_bpe=False, truncate_embed=True, min_freq=1)
        # embed_bpe_vocab_size = len(vocab) + 2  # add ##a and [CLS]
        # self.assertEqual(embed_bpe_vocab_size, len(embed3.model.tokenzier.vocab))

        embed4 = Embedding(vocab, model_dir_or_name=weight_path, layers=list(range(3)),
                           only_use_pretrain_bpe=False, truncate_embed=False, min_freq=1)
        # embed_bpe_vocab_size = num_word + 1  # add ##a
        # self.assertEqual(embed_bpe_vocab_size, len(embed4.model.tokenzier.vocab))

        # The outputs below should be identical across all of these settings.
        embed1.eval()
        embed2.eval()
        embed3.eval()
        embed4.eval()
        tensor = torch.LongTensor([[vocab.to_index(w) for w in 'this is a texta and'.split()]])
        t1 = embed1(tensor)
        t2 = embed2(tensor)
        t3 = embed3(tensor)
        t4 = embed4(tensor)
        self.assertEqual((t1 - t2).sum(), 0)
        self.assertEqual((t1 - t3).sum(), 0)
        self.assertEqual((t1 - t4).sum(), 0)
    def test_save_load(self):
        bert_save_test = 'roberta_save_test'
        try:
            os.makedirs(bert_save_test, exist_ok=True)
            vocab = Vocabulary().add_word_lst("this is a test . [SEP] NotInBERT".split())
            embed = RobertaEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_roberta',
                                     word_dropout=0.1, auto_truncate=True)
            embed.save(bert_save_test)
            load_embed = RobertaEmbedding.load(bert_save_test)
            words = torch.randint(len(vocab), size=(2, 20))
            embed.eval(), load_embed.eval()
            self.assertEqual((embed(words) - load_embed(words)).sum(), 0)
        finally:
            import shutil
            shutil.rmtree(bert_save_test)
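
For reference, a minimal usage sketch of the embedding these tests exercise, built only from calls that appear above; it assumes the repository root as the working directory so the small_roberta fixture path resolves:

# Minimal usage sketch (assumes the small_roberta test fixture exists at this path).
import torch
from fastNLP import Vocabulary
from fastNLP.embeddings.roberta_embedding import RobertaEmbedding

vocab = Vocabulary().add_word_lst("this is a test .".split())
embed = RobertaEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_roberta')
words = torch.LongTensor([[vocab.to_index(w) for w in "this is a test .".split()]])
print(embed(words).size())  # one embedding per word; hidden_size is 16 for the small fixture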